本文介绍了如何配置cublas {t} symm()函数参数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧! 问题描述 此函数使用CUDA执行对称矩阵乘法。虽然,我成功地使用非对称版本cublas {t} gemm()我不能正确使用cublas {t} symm()函数。 我知道CUBLAS库使用列主要矩阵存储。我使用行主要的C / C ++矩阵,我知道如何解决这个问题为cublas {t} gemm()通过替换输入矩阵等。然而,我不能解决它的对称情况。问题是即使我使用列主要矩阵存储我发现不可预测的结果。矩阵包含复杂浮点(cuComplex)。我假设我有行主矩阵。这里是代码和输出: //矩阵乘法:C = A * B. // Host码。 // //实用程序和系统包括 #include< assert.h> #include< helper_string.h> //帮助CUDA SDK示例通用的共享函数 // CUDA运行时 #include< cuda_runtime.h> #include< cublas_v2.h> #ifndef min #define min(a,b)((a< b)?a:b) #endif #ifndef max #define max(a,b)((a> b)?a:b) #endif ////////////// ////////////////////////////////////////////////// //////////////// //这些是CUDA Helper函数(除了helper_cuda.h) void inline checkError(cublasStatus_t status, const char * msg) { if(status!= CUBLAS_STATUS_SUCCESS) { printf(%s,msg); exit(EXIT_FAILURE); } } // CUDA Helper函数结束 //分配具有随机浮点数的矩阵。 void randomCmplxInit(cuComplex * data,int size) { for(int i = 0; i data [i] = make_cuComplex rand()/(float)RAND_MAX,rand()/(float)RAND_MAX); } // void initializeCUDA(int argc,char ** argv,int& devID,int& iSizeMultiple,sMatrixSize& matrix_size) void initializeCUDA(int argc,char ** argv,int& devID) { //默认情况下,我们使用设备0,否则我们根据命令行提供的内容覆盖设备ID cudaError_t error; devID = 0; int m,n,k; if(checkCmdLineFlag(argc,(const char **)argv,device)) { devID = getCmdLineArgumentInt(argc, 设备); error = cudaSetDevice(devID); if(error!= cudaSuccess) { printf(cudaSetDevice返回错误代码%d,行(%d)\\\,错误,__LINE__); exit(EXIT_FAILURE); } } //获取此GPU上的SM数量 error = cudaGetDevice(& devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(& deviceProp,devID); printf(GPU Device%d:\%s\with compute capability%d。%d\\\\\\,devID,deviceProp.name,deviceProp.major, deviceProp.minor); //对于Fermi和更高的块使用更大的块大小 int block_size =(deviceProp.major< 2)? 
16:32; } /////////////////////////////////////// /////////////////////////////////////// //!运行一个简单的测试矩阵乘以使用CUBLAS ////////////////////////////////////////// //////////////////////////////////////////// int matrixMultiply(int argc,char ** argv,int devID) { int i,j; unsigned int m,n,k; cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(& deviceProp,devID); if(error!= cudaSuccess) { printf(cudaGetDeviceProperties returns error code%d,line(%d)\\\,error,__LINE__); exit(EXIT_FAILURE); } //对于Fermi和更高的块使用更大的块大小 int block_size =(deviceProp.major< 2)? 16:32; m = 3; //矩阵op(A)和C的行数。 (m×k) n = 2; //矩阵op(B)和C的列数。 (k×n) k = m; // op(A)的列数和op(B)的行数。 C - > (mxn) //我想计算C = A * B的行主格式, //所以我必须找到C(T)= B(T)A )= C(T)A以列为主的格式 //为矩阵A和B分配主机存储器 unsigned int size_A = m *(m + 1)/ 2; //对称矩阵的大小 unsigned int mem_size_A = sizeof(cuComplex)* size_A; cuComplex * h_A =(cuComplex *)malloc(mem_size_A); unsigned int size_B = m * n; unsigned int mem_size_B = sizeof(cuComplex)* size_B; cuComplex * h_B =(cuComplex *)malloc(mem_size_B); //初始化主机存储器 for(i = 0; i h_A [i] = make_cuComplex ),(float)0); for(i = 0; i h_B [i] = make_cuComplex((float)(i + 2),(float)0) //分配设备内存 cuComplex * d_A,* d_B,* d_C; unsigned int size_C = m * n; unsigned int mem_size_C = sizeof(cuComplex)* size_C; //为结果分配主机内存 cuComplex * h_C =(cuComplex *)malloc(mem_size_C); cuComplex * h_CUBLAS =(cuComplex *)malloc(mem_size_C); error = cudaMalloc((void **)& d_A,mem_size_A); error = cudaMalloc((void **)& d_B,mem_size_B) //将主机内存复制到设备 error = cudaMemcpy(d_A,h_A,mem_size_A,cudaMemcpyHostToDevice); error = cudaMemcpy(d_B,h_B,mem_size_B,cudaMemcpyHostToDevice); error = cudaMalloc((void **)& d_C,mem_size_C); //设置执行参数 dim3 threads(block_size,block_size); dim3 grid(n / threads.x,m / threads.y); //创建并启动计时器 printf(使用CUBLAS ...的计算结果); // CUBLAS版本2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(& handle); if(ret!= CUBLAS_STATUS_SUCCESS) { printf(cublasCreate返回错误代码%d,行(%d)\\\,ret,__LINE__); exit(EXIT_FAILURE); 
} const cuComplex alpha = make_cuComplex(1.0f,0.0f); const cuComplex beta = make_cuComplex(0.0f,0.0f); //使用cublas ret = cublasCsymm(handle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_UPPER,n,m,& alpha,d_A,m,d_B,m,& beta,d_C,m)执行操作; //将结果从设备复制到主机 error = cudaMemcpy(h_CUBLAS,d_C,mem_size_C,cudaMemcpyDeviceToHost); checkError(cublasDestroy(handle),cublasDestroy()error!\\\); } printf(\\\Computations completed.\\\\\\); printf(symm matrix A:\\\); int s = 0; for(i = 0; i for(j = 0; j // printf 7.5G + j(%7.5G),h_A [j + i * k] .x,h_A [j + i * k]。 printf(%7.5G,h_A [s] .x); s ++; } printf(\\\); } printf(\\\ matrix B:\\\); for(i = 0; i for(j = 0; j // printf(%7.5G + j(%7.5G),h_B [j + i * n] .x,h_B [j + i * n]。 printf(%7.5G,h_B [j + i * n] .x); } printf(\\\); } printf(\\\ matrix C = A * B:\\\); for(i = 0; i for(j = 0; j // printf(%7.5G + j(%7.5G),h_CUBLAS [j + i * n] .x,h_CUBLAS [j + i * n]。 printf(%7.5G,h_CUBLAS [j + i * n] .x); } printf(\\\); } //清除内存 free(h_A); free(h_B); free(h_C); // free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset(); } /////////////////////////////////////// ///////////////////////////////////////// //程序主 ///////////////////////////////////////////// /////////////////////////////////// int main(int argc,char ** argv) { printf([Matrix Multiply CUBLAS] - Starting ... 
\\\); int devID = 0,sizeMult = 5; initializeCUDA(argc,argv,devID); int matrix_result = matrixMultiply(argc,argv,devID); } 我假设我有以下乘法矩阵: A = 1 2 4 2 3 5 4 5 6 B = 2 3 4 5 6 7 并期望获得 A * B = 34 41 46 56 64 79 但获得的 OUTPUT 如下: symm矩阵A: 1 2 3 4 5 6 矩阵B: 2 3 4 5 6 7 矩阵C = A * B: 78 90 74 97 114 146 代码?可能cublasCsymm函数的参数是错误的。 谢谢, Kagan 方案 编辑: 根据下面提出的问题,我选择重新工作我的答案和示例代码。 至少对于这些操作,您可以处理行主存储而不进行转置。此外, symm 函数不使用压缩存储器,因此进一步促进了这一观察结果。 回答其他问题: cublasCsymm 函数不使用压缩存储格式(像其他一些功能,例如 cublasCspmv 例如),因为 cublasCsymm 函数旨在复制相应的 netlib函数,它也不使用压缩存储格式。根据我对cublas API的评论,我没有看到一个对称打包存储矩阵矩阵乘法函数。 您可以使用row-主要存储(例如C风格)与cublas,没有转置,至少对于这些操作(矩阵矩阵乘,没有压缩存储)通过遵循给出的建议此处。 以下是我上一个示例的重做版本,其中包含上述第2项中的信息。 //矩阵乘法:C = A * B. //主机代码。 // //实用程序和系统包括 #include< assert.h> #include< helper_string.h> //用于CUDA SDK共享的共享函数的帮助程序sa mples // CUDA运行时 #include< cuda_runtime.h> #include< cublas_v2.h> //错误检查宏 #define cudaCheckErrors(msg)\ do {\ cudaError_t __err = cudaGetLastError(); \ if(__err!= cudaSuccess){\ fprintf(stderr,致命错误:%s(%s在%s:%d)\\\,\ msg,cudaGetErrorString(__ err),\ __FILE__,__LINE__); \ fprintf(stderr,*** FAILED - ABORTING\\\); \ exit(1); \ } \ } while(0) // CUBLAS V2 API #define cublasCheckErrors(fn)\ do { \ cublasStatus_t __err = fn; \ if(__err!= CUBLAS_STATUS_SUCCESS){\ fprintf(stderr,Fatal cublas error:%d(at%s:%d)\\\,\ (int)(__ err),\ __FILE__,__LINE__); \ fprintf(stderr,*** FAILED - ABORTING\\\); \ exit(1); \ } \ } while(0) #ifndef min #define min(a,b) (a< b)?a:b) #endif #ifndef max #define max(a,b)((a> b)?a:b) #endif //////////////////////////////////////// /////////////////////////////////////////// b $ b // CUDA Helper函数(除了helper_cuda.h) void inline checkError(cublasStatus_t status,const char * msg) { if(status!= CUBLAS_STATUS_SUCCESS) { printf(%s,msg); exit(EXIT_FAILURE); } } // CUDA Helper函数结束 //分配具有随机浮点数的矩阵。 void randomCmplxInit(cuComplex * data,int size) { for(int i = 0; i data [i] = make_cuComplex rand()/(float)RAND_MAX,rand()/(float)RAND _MAX); } // void initializeCUDA(int argc,char ** argv,int& 
devID,int& iSizeMultiple,sMa trixSize& matrix_size) void initializeCUDA(int argc,char ** argv,int& devID) { //默认情况下,我们使用设备0,否则我们基于命令行中提供的内容 cudaError_t error; devID = 0; if(checkCmdLineFlag(argc,(const char **)argv,device)) { devID = getCmdLineArgumentInt(argc, 设备); error = cudaSetDevice(devID); if(error!= cudaSuccess) { printf(cudaSetDevice返回错误代码%d,行(%d)\\\,错误,__ LINE__); exit(EXIT_FAILURE); } } //获取此GPU上的SM数量 error = cudaGetDevice(& devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(& deviceProp,devID); printf(GPU Device%d:\%s\with compute capability%d。%d\\\\\\,devID,dev iceProp.name ,deviceProp.major,deviceProp.minor); } ///////////////////////////// ////////////////////////////////////////////////// / //!运行一个简单的测试矩阵乘以使用CUBLAS ////////////////////////////////////////// //////////////////////////////////////////// int matrixMultiply(int argc,char ** argv,int devID) { int i,j; unsigned int m,n,k; cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(& deviceProp,devID); if(error!= cudaSuccess) { printf(cudaGetDeviceProperties returns error code%d,line(%d)\\\,error,__LINE__); exit(EXIT_FAILURE); } //对于Fermi和更高的块使用更大的块大小 m = 3; //矩阵op(A)和C的行数。 (m×k) n = 2; //矩阵op(B)和C的列数。 (k×n) k = m; // op(A)的列数和op(B)的行数。 C - > (mxn) //我想计算C = A * B的行主格式, //所以我必须找到C(T)= B(T)A )= C(T)A以列为主的格式 //为矩阵A和B分配主机内存 unsigned int size_A = m * m; //对称矩阵的大小 printf(size_A =%d\\\,size_A); unsigned int mem_size_A = sizeof(cuComplex)* size_A; cuComplex * h_A =(cuComplex *)malloc(mem_size_A); unsigned int size_B = m * n; unsigned int mem_size_B = sizeof(cuComplex)* size_B; cuComplex * h_B =(cuComplex *)malloc(mem_size_B); //初始化主机存储器 // for(i = 0; i // h_A [i] = make_cuComplex (i + 1),(float)0); h_A [0] = make_cuComplex((float)1,(float)0); h_A [1] = make_cuComplex((float)2,(float)0); h_A [2] = make_cuComplex((float)4,(float)0); h_A [3] = make_cuComplex((float)0,(float)0); h_A [4] = make_cuComplex((float)3,(float)0); h_A [5] = make_cuComplex((float)5,(float)0); 
h_A [6] = make_cuComplex((float)0,(float)0); h_A [7] = make_cuComplex((float)0,(float)0); h_A [8] = make_cuComplex((float)6,(float)0); // for(i = 0; i // h_B [i] = make_cuComplex((float)(i + 2) 0); h_B [0] = make_cuComplex((float)2,(float)0); h_B [1] = make_cuComplex((float)3,(float)0); h_B [2] = make_cuComplex((float)4,(float)0); h_B [3] = make_cuComplex((float)5,(float)0); h_B [4] = make_cuComplex((float)6,(float)0); h_B [5] = make_cuComplex((float)7,(float)0); //分配设备内存 cuComplex * d_A,* d_B,* d_C; unsigned int size_C = m * n; unsigned int mem_size_C = sizeof(cuComplex)* size_C; //为结果分配主机内存 cuComplex * h_C =(cuComplex *)malloc(mem_size_C); cuComplex * h_CUBLAS =(cuComplex *)malloc(mem_size_C); error = cudaMalloc((void **)& d_A,mem_size_A); error = cudaMalloc((void **)& d_B,mem_size_B); //将主机内存复制到设备 error = cudaMemcpy(d_A,h_A,mem_size_A,cudaMemcpyHostToDevice); error = cudaMemcpy(d_B,h_B,mem_size_B,cudaMemcpyHostToDevice); error = cudaMalloc((void **)& d_C,mem_size_C); //创建和启动计时器 printf(使用CUBLAS ...的计算结果); // CUBLAS版本2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(& handle); if(ret!= CUBLAS_STATUS_SUCCESS) { printf(cublasCreate返回错误代码%d,行(%d)\\\,ret,__LINE__); exit(EXIT_FAILURE); } const cuComplex alpha = make_cuComplex(1.0f,0.0f); const cuComplex beta = make_cuComplex(0.0f,0.0f); //使用cublas 执行操作ret = cublasCsymm(handle,CUBLAS_SIDE_RIGHT,CUBLAS_FILL_MODE_LOWER,n,m,& alpha,d_A,m,d_B,n,& beta,d_C,n) if(ret!= CUBLAS_STATUS_SUCCESS) { printf(cublasCsymm returns error code%d,line(%d)\\\,ret,__LINE__); exit(EXIT_FAILURE); } //将结果从设备复制到主机 error = cudaMemcpy(h_CUBLAS,d_C,mem_size_C,cudaMemcpyDeviceToHost); checkError(cublasDestroy(handle),cublasDestroy()error!\\\); } printf(\\\Computations completed.\\\\\\); printf(symm matrix A:\\\); // int s = 0; for(i = 0; i for(j = 0; j // printf(%7.5G + j(%7.5G),h_A [j + i * k] .x,h_A [j + i * k]。 // printf(%7.5G,h_A [s] .x); printf(%7.5G,h_A [j +(i * m)]。 // s ++; } printf(\\\); } printf(\\\ matrix B:\ n); for(i = 
0; i for(j = 0; j // printf(%7.5G + j(%7.5G),h_B [j + i * n] .x,h_B [j + i * n]。 printf(%7.5G,h_B [j +(i * n)]。 } printf(\\\); } printf(\\\ matrix C = A * B:\\\); for(i = 0; i for(j = 0; j // printf(%7.5G + j(%7.5G),h_CUBLAS [j + i * n] .x,h_CUBLAS [j + i * n]。 printf(%7.5G,h_CUBLAS [j +(i * n)]。 } printf(\\\); } //清除内存 free(h_A); free(h_B); free(h_C); // free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset(); return 0; } /////////////////////////////////////// ///////////////////////////////////////// //程序主 ///////////////////////////////////////////// /////////////////////////////////// int main(int argc,char ** argv) { printf([Matrix Multiply CUBLAS] - Starting ... \\\); int devID = 0; initializeCUDA(argc,argv,devID); int matrix_result = matrixMultiply(argc,argv,devID); cudaCheckErrors(some error); return 0; } $ ./t213 [Matrix Multiply CUBLAS] - 正在启动... GPU设备0:具有计算能力的Tesla M20702.0 size_A = 9 使用CUBLAS ... 计算结果计算完成。 symm矩阵A: 1 2 4 0 3 5 0 0 6 矩阵B: 2 3 4 5 6 7 矩阵C = A * B: 34 41 46 56 64 79 原始回应: 当我运行你的代码,因为你现在发布,我没有得到结果你展示。这是我得到的: [Matrix Multiply CUBLAS] - 正在开始... GPU设备0:Tesla M2070 with compute capability 2.0 使用CUBLAS ... 计算结果计算完成。 symm矩阵A: 1 2 3 4 5 6 矩阵B: 2 3 4 5 6 7 矩阵C = A * B: -131 -128 260 -122 -115 266 代码会编译多个警告,没有进行正确的错误检查(例如,您不是检查 cublasCsymm 的返回值 您想要乘以C = A * B这意味着A在 ,,但您通过 CUBLAS_SIDE_RIGHT 到 cublasCsymm 其他几个 cublasCsymm 参数也错了我想你也许你可以做 A * B as(B(T)* A(T))但只适用于方阵。 > 您在矩阵中有行主存储,并将它们传递到cublas,它们按列主顺序解释它们。对于以下矩阵: 1 2 3 4 pre> 行主存储看起来像这样: 1 2 3 4 : 1 3 2 4 您可以使用 cublasCgeam 来转置这些矩阵,也可以手动修改存储。 您正在为对称矩阵 A 进行某种压缩的存储格式的假设是不正确的。 仔细阅读存储类型。 它不说基体的提供部分或存在它说明填充的矩阵部分。 这是一个完整的代码,其中修复了上述问题: //矩阵乘法:C = A * B. 
//主机代码。 // //实用程序和系统包括 #include< assert.h> #include< helper_string.h> //用于CUDA SDK共享的共享函数的帮助程序sa mples // CUDA运行时 #include< cuda_runtime.h> #include< cublas_v2.h> //错误检查宏 #define cudaCheckErrors(msg)\ do {\ cudaError_t __err = cudaGetLastError(); \ if(__err!= cudaSuccess){\ fprintf(stderr,致命错误:%s(%s在%s:%d)\\\,\ msg,cudaGetErrorString(__ err),\ __FILE__,__LINE__); \ fprintf(stderr,*** FAILED - ABORTING\\\); \ exit(1); \ } \ } while(0) // CUBLAS V2 API #define cublasCheckErrors(fn)\ do { \ cublasStatus_t __err = fn; \ if(__err!= CUBLAS_STATUS_SUCCESS){\ fprintf(stderr,Fatal cublas error:%d(at%s:%d)\\\,\ (int)(__ err),\ __FILE__,__LINE__); \ fprintf(stderr,*** FAILED - ABORTING\\\); \ exit(1); \ } \ } while(0) #ifndef min #define min(a,b) (a< b)?a:b) #endif #ifndef max #define max(a,b)((a> b)?a:b) #endif //////////////////////////////////////// /////////////////////////////////////////// b $ b // CUDA Helper函数(除了helper_cuda.h) void inline checkError(cublasStatus_t status,const char * msg) { if(status!= CUBLAS_STATUS_SUCCESS ) { printf(%s,msg); exit(EXIT_FAILURE); } } // CUDA Helper函数结束 //分配具有随机浮点数的矩阵。 void randomCmplxInit(cuComplex * data,int size) { for(int i = 0; i data [i] = make_cuComplex rand()/(float)RAND_MAX,rand()/(float)RAND_MAX); } // void initializeCUDA(int argc,char ** argv,int& devID,int& iSizeMultiple,sMatrixSize& matrix_size) void initializeCUDA(int argc,char ** argv,int& devID) { //默认情况下,我们使用设备0,否则我们根据命令行提供的内容覆盖设备ID cudaError_t error; devID = 0; if(checkCmdLineFlag(argc,(const char **)argv,device)) { devID = getCmdLineArgumentInt(argc, 设备); error = cudaSetDevice(devID); if(error!= cudaSuccess) { printf(cudaSetDevice返回错误代码%d,行(%d)\\\,错误,__LINE__); exit(EXIT_FAILURE); } } //获取此GPU上的SM数量 error = cudaGetDevice(& devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(& deviceProp,devID); printf(GPU Device%d:\%s\with compute capability%d。%d\\\\\\,devID,deviceProp.name,deviceProp.major, deviceProp.minor); } ////////////////////////////////// 
/////////////////////////////////////////// //!运行一个简单的测试矩阵乘以使用CUBLAS ////////////////////////////////////////// //////////////////////////////////////////// int matrixMultiply(int argc,char ** argv,int devID) { int i,j; unsigned int m,n,k; cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(& deviceProp,devID); if(error!= cudaSuccess) { printf(cudaGetDeviceProperties returns error code%d,line(%d)\\\,error,__LINE__); exit(EXIT_FAILURE); } //对于Fermi和更高的块使用更大的块大小 m = 3; //矩阵op(A)和C的行数。 (m×k) n = 2; //矩阵op(B)和C的列数。 (k×n) k = m; // op(A)的列数和op(B)的行数。 C - > (mxn) //我想计算C = A * B的行主格式, //所以我必须找到C(T)= B(T)A )= C(T)A以列为主的格式 //为矩阵A和B分配主机内存 unsigned int size_A = m * m; //对称矩阵的大小 printf(size_A =%d\\\,size_A); unsigned int mem_size_A = sizeof(cuComplex)* size_A; cuComplex * h_A =(cuComplex *)malloc(mem_size_A); unsigned int size_B = m * n; unsigned int mem_size_B = sizeof(cuComplex)* size_B; cuComplex * h_B =(cuComplex *)malloc(mem_size_B); //初始化主机存储器 // for(i = 0; i // h_A [i] = make_cuComplex (i + 1),(float)0); h_A [0] = make_cuComplex((float)1,(float)0); h_A [1] = make_cuComplex((float)2,(float)0); h_A [2] = make_cuComplex((float)4,(float)0); h_A [3] = make_cuComplex((float)0,(float)0); h_A [4] = make_cuComplex((float)3,(float)0); h_A [5] = make_cuComplex((float)5,(float)0); h_A [6] = make_cuComplex((float)0,(float)0); h_A [7] = make_cuComplex((float)0,(float)0); h_A [8] = make_cuComplex((float)6,(float)0); // for(i = 0; i // h_B [i] = make_cuComplex((float)(i + 2) 0); h_B [0] = make_cuComplex((float)2,(float)0); h_B [1] = make_cuComplex((float)4,(float)0); h_B [2] = make_cuComplex((float)6,(float)0); h_B [3] = make_cuComplex((float)3,(float)0); h_B [4] = make_cuComplex((float)5,(float)0); h_B [5] = make_cuComplex((float)7,(float)0); //分配设备内存 cuComplex * d_A,* d_B,* d_C; unsigned int size_C = m * n; unsigned int mem_size_C = sizeof(cuComplex)* size_C; //为结果分配主机内存 cuComplex * h_C =(cuComplex *)malloc(mem_size_C); cuComplex * h_CUBLAS =(cuComplex 
*)malloc(mem_size_C); error = cudaMalloc((void **)& d_A,mem_size_A); error = cudaMalloc((void **)& d_B,mem_size_B); //将主机内存复制到设备 error = cudaMemcpy(d_A,h_A,mem_size_A,cudaMemcpyHostToDevice); error = cudaMemcpy(d_B,h_B,mem_size_B,cudaMemcpyHostToDevice); error = cudaMalloc((void **)& d_C,mem_size_C); //创建和启动计时器 printf(使用CUBLAS ...的计算结果); // CUBLAS版本2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(& handle); if(ret!= CUBLAS_STATUS_SUCCESS) { printf(cublasCreate返回错误代码%d,行(%d)\\\,ret,__LINE__); exit(EXIT_FAILURE); } const cuComplex alpha = make_cuComplex(1.0f,0.0f); const cuComplex beta = make_cuComplex(0.0f,0.0f); //使用cublas 执行操作ret = cublasCsymm(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_LOWER,m,n,& alpha,d_A,m,d_B,m,& beta,d_C,m) if(ret!= CUBLAS_STATUS_SUCCESS) { printf(cublasCsymm returns error code%d,line(%d)\\\,ret,__LINE__); exit(EXIT_FAILURE); } 这是输出: [Matrix Multiply CUBLAS] - 正在开始... GPU设备0:具有计算能力的Tesla M20702.0 size_A = 9 使用CUBLAS ... 计算结果计算完成。 symm矩阵A: 1 0 0 2 3 0 4 5 6 矩阵B: 2 3 4 5 6 7 矩阵C = A * B: 34 41 46 56 64 79 This function performs the symmetric matrix-matrix multiplication using CUDA. Although, I succeeded in using the nonsymmetric version "cublas{t}gemm()" I couldn't use the "cublas{t}symm()" function properly. I know that CUBLAS library uses column-major matrix storage. I am using row-major C/C++ matrix and I know how to solve this issue for "cublas{t}gemm()" by replacing the input matrices and etc. However, I couldn't solve it for the symmetric case. The problem is even if I use column-major matrix storage I find unexpectable results. Matrices contain complex floats (cuComplex). I assume I have row-major matrices. Here is the code and the output: // Matrix multiplication: C = A * B.// Host code.//// Utilities and system includes#include <assert.h>#include <helper_string.h> // helper for shared functions common to CUDA SDK samples// CUDA runtime#include <cuda_runtime.h>#include <cublas_v2.h>#ifndef min#define min(a,b) ((a < b) ? 
a : b)#endif#ifndef max#define max(a,b) ((a > b) ? a : b)#endif////////////////////////////////////////////////////////////////////////////////// These are CUDA Helper functions (in addition to helper_cuda.h)void inline checkError(cublasStatus_t status, const char *msg){ if (status != CUBLAS_STATUS_SUCCESS) { printf("%s", msg); exit(EXIT_FAILURE); }}// end of CUDA Helper Functions// Allocates a matrix with random float entries.void randomCmplxInit(cuComplex *data, int size){ for (int i = 0; i < size; ++i) data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);}//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)void initializeCUDA(int argc, char **argv, int &devID){ // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line cudaError_t error; devID = 0; int m,n,k; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); error = cudaSetDevice(devID); if (error != cudaSuccess) { printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } } // get number of SMs on this GPU error = cudaGetDevice(&devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(&deviceProp, devID); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32;}//////////////////////////////////////////////////////////////////////////////////! 
Run a simple test matrix multiply using CUBLAS////////////////////////////////////////////////////////////////////////////////int matrixMultiply(int argc, char **argv, int devID){ int i,j; unsigned int m,n,k; cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; m=3; //number of rows of matrix op(A) and C. A--> (m x k) n=2; //number of columns of matrix op(B) and C. B--> (k x n) k=m; //number of columns of op(A) and rows of op(B). C--> (m x n) // I want to compute C = A*B in row-major format, //so I must find C(T)=B(T)A(T) = C(T)A in column-major format // allocate host memory for matrices A and B unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix unsigned int mem_size_A = sizeof(cuComplex) * size_A; cuComplex *h_A = (cuComplex *)malloc(mem_size_A); unsigned int size_B = m*n; unsigned int mem_size_B = sizeof(cuComplex) * size_B; cuComplex *h_B = (cuComplex *)malloc(mem_size_B); // initialize host memory for (i = 0; i < size_A; ++i) h_A[i] = make_cuComplex( (float)(i+1),(float)0); for (i = 0; i < size_B; ++i) h_B[i] = make_cuComplex((float)(i+2), (float)0); // allocate device memory cuComplex *d_A, *d_B, *d_C; unsigned int size_C = m*n; unsigned int mem_size_C = sizeof(cuComplex) * size_C; // allocate host memory for the result cuComplex *h_C = (cuComplex *) malloc(mem_size_C); cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C); error = cudaMalloc((void **) &d_A, mem_size_A); error = cudaMalloc((void **) &d_B, mem_size_B); // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); error = cudaMalloc((void **) &d_C, mem_size_C); // setup execution parameters dim3 
threads(block_size, block_size); dim3 grid(n / threads.x, m / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // CUBLAS version 2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } const cuComplex alpha = make_cuComplex(1.0f,0.0f); const cuComplex beta = make_cuComplex(0.0f,0.0f); //Perform operation with cublas ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m); // copy result from device to host error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); checkError(cublasDestroy(handle), "cublasDestroy() error!\n"); } printf ("\nComputations completed.\n\n"); printf (" symm matrix A: \n"); int s=0; for (i=0; i<min(m,4); i++) { for (j=0; j<=i; j++) { //printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y); printf ("%7.5G", h_A[s].x); s++; } printf ("\n"); } printf ("\n matrix B: \n"); for (i=0; i<min(k,4); i++) { for (j=0; j<min(n,4); j++) { //printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y); printf ("%7.5G", h_B[j+i*n].x); } printf ("\n"); } printf ("\n matrix C=A*B: \n"); for (i=0; i<min(m,4); i++) { for (j=0; j<min(n,4); j++) { //printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y); printf ("%7.5G", h_CUBLAS[j+i*n].x); } printf ("\n"); } // clean up memory free(h_A); free(h_B); free(h_C); //free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset();}////////////////////////////////////////////////////////////////////////////////// Program main////////////////////////////////////////////////////////////////////////////////int main(int argc, char **argv){ printf("[Matrix Multiply CUBLAS] - Starting...\n"); int devID = 0, sizeMult = 5; initializeCUDA(argc, argv, devID); int matrix_result = matrixMultiply(argc, argv, devID);}I suppose that I have the following matrices for 
the multiplication:

A =
1 2 4
2 3 5
4 5 6

B =
2 3
4 5
6 7

and expect to obtain

A*B =
34 41
46 56
64 79

But the obtained OUTPUT is as follows:

symm matrix A:
1
2 3
4 5 6

matrix B:
2 3
4 5
6 7

matrix C=A*B:
78 90
74 97
114 146

What am I missing in this code? Probably the arguments of "cublasCsymm" function are wrong.

Thanks,
Kagan

解决方案 EDIT: Based on questions posed below, I elected to re-work my answer and example code. You can handle row-major storage without transpose at least for these operations. And this observation is further facilitated by the fact that the symm function does not use packed storage. So to answer the additional questions: the cublasCsymm function does not use a packed storage format (like some other functions such as cublasCspmv for example), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the cublas API, I don't see a symmetric-packed-storage matrix-matrix multiply function available. You can use row-major storage (e.g. C-style) with cublas, without transposing, at least for these operations (matrix-matrix multiply, without packed storage) by following the advice given here. 
What follows is a re-worked version of my previous example, that incorporates the information in item 2 above.// Matrix multiplication: C = A * B.// Host code.//// Utilities and system includes#include <assert.h>#include <helper_string.h> // helper for shared functions common to CUDA SDK samples// CUDA runtime#include <cuda_runtime.h>#include <cublas_v2.h>// error check macros#define cudaCheckErrors(msg) \ do { \ cudaError_t __err = cudaGetLastError(); \ if (__err != cudaSuccess) { \ fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ msg, cudaGetErrorString(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0)// for CUBLAS V2 API#define cublasCheckErrors(fn) \ do { \ cublasStatus_t __err = fn; \ if (__err != CUBLAS_STATUS_SUCCESS) { \ fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \ (int)(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0)#ifndef min#define min(a,b) ((a < b) ? a : b)#endif#ifndef max#define max(a,b) ((a > b) ? 
a : b)#endif////////////////////////////////////////////////////////////////////////////////// These are CUDA Helper functions (in addition to helper_cuda.h)void inline checkError(cublasStatus_t status, const char *msg){ if (status != CUBLAS_STATUS_SUCCESS) { printf("%s", msg); exit(EXIT_FAILURE); }}// end of CUDA Helper Functions// Allocates a matrix with random float entries.void randomCmplxInit(cuComplex *data, int size){ for (int i = 0; i < size; ++i) data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);}//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)void initializeCUDA(int argc, char **argv, int &devID){ // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line cudaError_t error; devID = 0; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); error = cudaSetDevice(devID); if (error != cudaSuccess) { printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } } // get number of SMs on this GPU error = cudaGetDevice(&devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(&deviceProp, devID); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);}//////////////////////////////////////////////////////////////////////////////////! 
Run a simple test matrix multiply using CUBLAS////////////////////////////////////////////////////////////////////////////////int matrixMultiply(int argc, char **argv, int devID){ int i,j; unsigned int m,n,k; cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // use a larger block size for Fermi and above m=3; //number of rows of matrix op(A) and C. A--> (m x k) n=2; //number of columns of matrix op(B) and C. B--> (k x n) k=m; //number of columns of op(A) and rows of op(B). C--> (m x n) // I want to compute C = A*B in row-major format, //so I must find C(T)=B(T)A(T) = C(T)A in column-major format // allocate host memory for matrices A and B unsigned int size_A = m*m; //size of a symmetric matrix printf("size_A = %d\n", size_A); unsigned int mem_size_A = sizeof(cuComplex) * size_A; cuComplex *h_A = (cuComplex *)malloc(mem_size_A); unsigned int size_B = m*n; unsigned int mem_size_B = sizeof(cuComplex) * size_B; cuComplex *h_B = (cuComplex *)malloc(mem_size_B); // initialize host memory// for (i = 0; i < size_A; ++i)// h_A[i] = make_cuComplex( (float)(i+1),(float)0); h_A[0] = make_cuComplex((float)1, (float)0); h_A[1] = make_cuComplex((float)2, (float)0); h_A[2] = make_cuComplex((float)4, (float)0); h_A[3] = make_cuComplex((float)0, (float)0); h_A[4] = make_cuComplex((float)3, (float)0); h_A[5] = make_cuComplex((float)5, (float)0); h_A[6] = make_cuComplex((float)0, (float)0); h_A[7] = make_cuComplex((float)0, (float)0); h_A[8] = make_cuComplex((float)6, (float)0);// for (i = 0; i < size_B; ++i)// h_B[i] = make_cuComplex((float)(i+2), (float)0); h_B[0] = make_cuComplex((float)2, (float)0); h_B[1] = make_cuComplex((float)3, (float)0); h_B[2] = make_cuComplex((float)4, (float)0); h_B[3] = make_cuComplex((float)5, (float)0); h_B[4] = make_cuComplex((float)6, (float)0); h_B[5] = 
make_cuComplex((float)7, (float)0); // allocate device memory cuComplex *d_A, *d_B, *d_C; unsigned int size_C = m*n; unsigned int mem_size_C = sizeof(cuComplex) * size_C; // allocate host memory for the result cuComplex *h_C = (cuComplex *) malloc(mem_size_C); cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C); error = cudaMalloc((void **) &d_A, mem_size_A); error = cudaMalloc((void **) &d_B, mem_size_B); // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); error = cudaMalloc((void **) &d_C, mem_size_C); // create and start timer printf("Computing result using CUBLAS..."); // CUBLAS version 2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } const cuComplex alpha = make_cuComplex(1.0f,0.0f); const cuComplex beta = make_cuComplex(0.0f,0.0f); //Perform operation with cublas ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } // copy result from device to host error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); checkError(cublasDestroy(handle), "cublasDestroy() error!\n"); } printf ("\nComputations completed.\n\n"); printf (" symm matrix A: \n");// int s=0; for (i=0; i<min(m,4); i++) { for (j=0; j<min(m,4); j++) { //printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);// printf ("%7.5G", h_A[s].x); printf ("%7.5G", h_A[j+(i*m)].x);// s++; } printf ("\n"); } printf ("\n matrix B: \n"); for (i=0; i<min(k,4); i++) { for (j=0; j<min(n,4); j++) { //printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y); printf ("%7.5G", h_B[j+(i*n)].x); } printf ("\n"); } printf ("\n matrix C=A*B: \n"); for (i=0; 
i<min(m,4); i++) { for (j=0; j<min(n,4); j++) { //printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y); printf ("%7.5G", h_CUBLAS[j+(i*n)].x); } printf ("\n"); } // clean up memory free(h_A); free(h_B); free(h_C); //free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset(); return 0;}////////////////////////////////////////////////////////////////////////////////// Program main////////////////////////////////////////////////////////////////////////////////int main(int argc, char **argv){ printf("[Matrix Multiply CUBLAS] - Starting...\n"); int devID = 0; initializeCUDA(argc, argv, devID); int matrix_result = matrixMultiply(argc, argv, devID); cudaCheckErrors("some error"); return 0;}$ ./t213[Matrix Multiply CUBLAS] - Starting...GPU Device 0: "Tesla M2070" with compute capability 2.0size_A = 9Computing result using CUBLAS...Computations completed. symm matrix A: 1 2 4 0 3 5 0 0 6 matrix B: 2 3 4 5 6 7 matrix C=A*B: 34 41 46 56 64 79$ORIGINAL RESPONSE: Several problems: When I run your code as you have it posted right now, I don't get the results that you show. Here's what I get:[Matrix Multiply CUBLAS] - Starting...GPU Device 0: "Tesla M2070" with compute capability 2.0Computing result using CUBLAS...Computations completed.symm matrix A: 1 2 3 4 5 6matrix B: 2 3 4 5 6 7matrix C=A*B: -131 -128 260 -122 -115 266 The code compiles with a number of warnings, and also you're not doing proper error checking (for example, you're not checking the return value from cublasCsymm). You are wanting to multiply C = A*B. This means A is on the LEFT, but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm. Several other cublasCsymm parameters were wrong as well. I think maybe you thought you could do A*B as (B(T)*A(T)) but that only works for square matrices. Not sure what you were thinking, exactly. You are using row-major storage on your matrices and passing them to cublas, which interprets them in column-major order. 
For the following matrix: 1 2 / 3 4, row-major storage looks like this: 1 2 3 4; column-major storage looks like this: 1 3 2 4. You can transpose these matrices if you wish, using cublasCgeam, or you can manually modify your storage. You're making some sort of assumption about some kind of compressed storage format for the symmetric matrix A which is not correct. Read carefully the definition of the storage type. It doesn't say the portion of the matrix that is "supplied" or "present"; it says the portion of the matrix that is filled. Here is a complete code that has the above problems fixed:// Matrix multiplication: C = A * B.// Host code.//// Utilities and system includes#include <assert.h>#include <helper_string.h> // helper for shared functions common to CUDA SDK samples// CUDA runtime#include <cuda_runtime.h>#include <cublas_v2.h>// error check macros#define cudaCheckErrors(msg) \ do { \ cudaError_t __err = cudaGetLastError(); \ if (__err != cudaSuccess) { \ fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ msg, cudaGetErrorString(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0)// for CUBLAS V2 API#define cublasCheckErrors(fn) \ do { \ cublasStatus_t __err = fn; \ if (__err != CUBLAS_STATUS_SUCCESS) { \ fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \ (int)(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0)#ifndef min#define min(a,b) ((a < b) ? a : b)#endif#ifndef max#define max(a,b) ((a > b) ? 
a : b)#endif////////////////////////////////////////////////////////////////////////////////// These are CUDA Helper functions (in addition to helper_cuda.h)void inline checkError(cublasStatus_t status, const char *msg){ if (status != CUBLAS_STATUS_SUCCESS) { printf("%s", msg); exit(EXIT_FAILURE); }}// end of CUDA Helper Functions// Allocates a matrix with random float entries.void randomCmplxInit(cuComplex *data, int size){ for (int i = 0; i < size; ++i) data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);}//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)void initializeCUDA(int argc, char **argv, int &devID){ // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line cudaError_t error; devID = 0; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); error = cudaSetDevice(devID); if (error != cudaSuccess) { printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } } // get number of SMs on this GPU error = cudaGetDevice(&devID); cudaDeviceProp deviceProp; error = cudaGetDeviceProperties(&deviceProp, devID); printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);}//////////////////////////////////////////////////////////////////////////////////! 
Run a simple test matrix multiply using CUBLAS////////////////////////////////////////////////////////////////////////////////int matrixMultiply(int argc, char **argv, int devID){ int i,j; unsigned int m,n,k; cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // use a larger block size for Fermi and above m=3; //number of rows of matrix op(A) and C. A--> (m x k) n=2; //number of columns of matrix op(B) and C. B--> (k x n) k=m; //number of columns of op(A) and rows of op(B). C--> (m x n) // I want to compute C = A*B in row-major format, //so I must find C(T)=B(T)A(T) = C(T)A in column-major format // allocate host memory for matrices A and B unsigned int size_A = m*m; //size of a symmetric matrix printf("size_A = %d\n", size_A); unsigned int mem_size_A = sizeof(cuComplex) * size_A; cuComplex *h_A = (cuComplex *)malloc(mem_size_A); unsigned int size_B = m*n; unsigned int mem_size_B = sizeof(cuComplex) * size_B; cuComplex *h_B = (cuComplex *)malloc(mem_size_B); // initialize host memory// for (i = 0; i < size_A; ++i)// h_A[i] = make_cuComplex( (float)(i+1),(float)0); h_A[0] = make_cuComplex((float)1, (float)0); h_A[1] = make_cuComplex((float)2, (float)0); h_A[2] = make_cuComplex((float)4, (float)0); h_A[3] = make_cuComplex((float)0, (float)0); h_A[4] = make_cuComplex((float)3, (float)0); h_A[5] = make_cuComplex((float)5, (float)0); h_A[6] = make_cuComplex((float)0, (float)0); h_A[7] = make_cuComplex((float)0, (float)0); h_A[8] = make_cuComplex((float)6, (float)0);// for (i = 0; i < size_B; ++i)// h_B[i] = make_cuComplex((float)(i+2), (float)0); h_B[0] = make_cuComplex((float)2, (float)0); h_B[1] = make_cuComplex((float)4, (float)0); h_B[2] = make_cuComplex((float)6, (float)0); h_B[3] = make_cuComplex((float)3, (float)0); h_B[4] = make_cuComplex((float)5, (float)0); h_B[5] = 
make_cuComplex((float)7, (float)0); // allocate device memory cuComplex *d_A, *d_B, *d_C; unsigned int size_C = m*n; unsigned int mem_size_C = sizeof(cuComplex) * size_C; // allocate host memory for the result cuComplex *h_C = (cuComplex *) malloc(mem_size_C); cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C); error = cudaMalloc((void **) &d_A, mem_size_A); error = cudaMalloc((void **) &d_B, mem_size_B); // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); error = cudaMalloc((void **) &d_C, mem_size_C); // create and start timer printf("Computing result using CUBLAS..."); // CUBLAS version 2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } const cuComplex alpha = make_cuComplex(1.0f,0.0f); const cuComplex beta = make_cuComplex(0.0f,0.0f); //Perform operation with cublas ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); }Here is the output:[Matrix Multiply CUBLAS] - Starting...GPU Device 0: "Tesla M2070" with compute capability 2.0size_A = 9Computing result using CUBLAS...Computations completed. symm matrix A: 1 0 0 2 3 0 4 5 6 matrix B: 2 3 4 5 6 7 matrix C=A*B: 34 41 46 56 64 79 这篇关于如何配置cublas {t} symm()函数参数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!
10-27 18:59