Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched, respectively).
Taking guidance from the documentation, I wrote a test code for inversion of an N x N matrix using these routines. The code gives correct output only if the matrix has all nonzero pivots. Setting any pivot to zero produces incorrect results. I have verified the results using MATLAB.
I realize that I am providing row-major matrices as input while CUBLAS expects column-major matrices, but that shouldn't matter: since (A^T)^-1 = (A^-1)^T, it would only transpose the result. To be sure, I also tested with column-major input, but got the same behavior.
I am confused because cublas<t>getriBatched expects the pivot exchange information array P as input, which is the output of cublas<t>getrfBatched. So if any zero pivots are eliminated by row exchanges, the inversion routine should handle them automatically.
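To illustrate the point, here is a minimal host-side sketch (my own illustrative code, not a CUBLAS routine) of LU factorization with partial pivoting: a zero leading entry is harmless as long as some lower row has a nonzero entry in that column, and the swap is recorded in the pivot array, which is exactly the information P carries:
#include <math.h>
// Factor a row-major n x n matrix in place; piv[k] records the row swapped
// into position k. A zero at A[k][k] is handled by the row exchange.
void lu_pivot(float* A, int* piv, int n)
{
    for(int k = 0; k < n; k++)
    {
        int p = k;                                // pick the largest entry in column k
        for(int i = k+1; i < n; i++)
            if(fabsf(A[i*n+k]) > fabsf(A[p*n+k])) p = i;
        piv[k] = p;
        if(p != k)                                // swap rows k and p
            for(int j = 0; j < n; j++)
            { float t = A[k*n+j]; A[k*n+j] = A[p*n+j]; A[p*n+j] = t; }
        for(int i = k+1; i < n; i++)              // eliminate below the pivot
        {
            A[i*n+k] /= A[k*n+k];
            for(int j = k+1; j < n; j++)
                A[i*n+j] -= A[i*n+k] * A[k*n+j];
        }
    }
}
Run on the zero_pivot matrix below, this factors it without any trouble, which is why I expect the CUBLAS routines to do the same.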
How can I invert matrices that contain a zero pivot using CUBLAS?
Following is a self-contained, compilable example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
// The batched routines take an array of device pointers, one entry per matrix
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
// In-place LU factorization with partial pivoting; P receives the pivot indices
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == n)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
// Compute the inverse from the LU factors, applying the pivot array P
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,n * n * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,n * n * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };
float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };
float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };
float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
//Select matrix by setting "a"
float* a = zero_pivot;
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
invert(a,a,n);
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
}
int main()
{
test_invert();
int n; scanf("%d",&n);
return 0;
}
There seems to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension n such that 3 <= n <= 16, when there is a "zero pivot" as you say.
A possible workaround is to "identity-extend" your A matrix to be inverted, when n < 17, to a size of 17x17 (using MATLAB nomenclature):
LU = getrf( [A 0 ; 0 I]);
Continuing, you can then use cublas<t>getriBatched in an "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n=17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
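In code, the embedding can be done with a small helper. The following is a hypothetical sketch (identity_extend is my name, not a CUBLAS routine), assuming row-major storage and n <= 17:
// Hypothetical helper: copy the n x n matrix src into the top-left corner of
// a 17 x 17 buffer, zero-fill the borders, and put 1s on the remaining diagonal.
void identity_extend(const float* src, float* dst, int n)
{
    for(int i = 0; i < 17; i++)
        for(int j = 0; j < 17; j++)
        {
            if(i < n && j < n) dst[i*17+j] = src[i*n+j]; // original block
            else if(i == j)    dst[i*17+j] = 1.0f;       // identity extension
            else               dst[i*17+j] = 0.0f;
        }
}
In the example below, the 17x17 matrix is simply written out by hand as zero_pivot_war.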
Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = 17;
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == 17)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,17 * 17 * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,17 * 17 * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
/* float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };
float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
*/
float zero_pivot_war[17*17] = {
0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
/*
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };
float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };
float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
*/
float result[n*n];
//Select matrix by setting "a"
float* a = zero_pivot_war;
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*17+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
invert(a,result,n);
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",result[i*n+j]);
fprintf(stdout,"\n");
}
}
int main()
{
test_invert();
// int n; scanf("%d",&n);
return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct based on a simple test elsewhere.
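If you want to verify it in code, a minimal host-side check (my own sketch; the tolerance is arbitrary) is to multiply the input by the computed inverse and compare against the identity:
#include <math.h>
// Returns 1 if A * invA is close to the n x n identity. A is stored with
// leading dimension lda (17 for zero_pivot_war); invA is a dense n x n result.
int check_inverse(const float* A, const float* invA, int n, int lda)
{
    for(int i = 0; i < n; i++)
        for(int j = 0; j < n; j++)
        {
            float s = 0.0f;
            for(int k = 0; k < n; k++)
                s += A[i*lda+k] * invA[k*n+j];
            float expect = (i == j) ? 1.0f : 0.0f;
            if(fabsf(s - expect) > 1e-5f) return 0;  // not the identity
        }
    return 1;
}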
I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).