Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched, respectively).
Taking guidance from the documentation, I wrote a test code for inversion of an N x N matrix using these routines. The code gives correct output only if the matrix has all nonzero pivots. Setting any pivot to zero produces incorrect results. I have verified the results using MATLAB.
I realize that I am providing row-major matrices as input while CUBLAS expects column-major matrices, but that shouldn't matter: since (A^T)^-1 = (A^-1)^T, it would only transpose the result. To be sure, I also tested with column-major input, but got the same behavior.
I am confused because cublas<t>getriBatched expects the pivot exchange information array P as input, which is the output of cublas<t>getrfBatched. So if any zero pivots are eliminated by row exchanges, the inversion routine should handle them automatically.
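To illustrate the point, here is a minimal host-side sketch (my own illustrative code, not a CUBLAS routine) of LU factorization with partial pivoting: a zero leading entry is harmless as long as some lower row has a nonzero entry in that column, and the swap is recorded in the pivot array, which is exactly the information P carries:
#include <math.h>
// Factor a row-major n x n matrix in place; piv[k] records the row swapped
// into position k. A zero at A[k][k] is handled by the row exchange.
void lu_pivot(float* A, int* piv, int n)
{
    for(int k = 0; k < n; k++)
    {
        int p = k;                                // pick the largest entry in column k
        for(int i = k+1; i < n; i++)
            if(fabsf(A[i*n+k]) > fabsf(A[p*n+k])) p = i;
        piv[k] = p;
        if(p != k)                                // swap rows k and p
            for(int j = 0; j < n; j++)
            { float t = A[k*n+j]; A[k*n+j] = A[p*n+j]; A[p*n+j] = t; }
        for(int i = k+1; i < n; i++)              // eliminate below the pivot
        {
            A[i*n+k] /= A[k*n+k];
            for(int j = k+1; j < n; j++)
                A[i*n+j] -= A[i*n+k] * A[k*n+j];
        }
    }
}
Run on the zero_pivot matrix below, this factors it without any trouble, which is why I expect the CUBLAS routines to do the same.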
How can I invert matrices that contain a zero pivot using CUBLAS?
Following is a self-contained, compilable example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
// The batched routines take an array of device pointers, one entry per matrix
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
// In-place LU factorization with partial pivoting; P receives the pivot indices
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == n)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
// Compute the inverse from the LU factors, applying the pivot array P
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,n * n * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,n * n * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };
float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };
float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };
float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
//Select matrix by setting "a"
float* a = zero_pivot;
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
invert(a,a,n);
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
}
int main()
{
test_invert();
int n; scanf("%d",&n);
return 0;
}
There seems to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension n such that 3 <= n <= 16, when there is a "zero pivot" as you say.
A possible workaround is to "identity-extend" your A matrix to be inverted, when n < 17, to a size of 17x17 (using MATLAB nomenclature):
LU = getrf( [A 0 ; 0 I]);
Continuing, you can then use cublas<t>getriBatched in an "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n=17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
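In code, the embedding can be done with a small helper. The following is a hypothetical sketch (identity_extend is my name, not a CUBLAS routine), assuming row-major storage and n <= 17:
// Hypothetical helper: copy the n x n matrix src into the top-left corner of
// a 17 x 17 buffer, zero-fill the borders, and put 1s on the remaining diagonal.
void identity_extend(const float* src, float* dst, int n)
{
    for(int i = 0; i < 17; i++)
        for(int j = 0; j < 17; j++)
        {
            if(i < n && j < n) dst[i*17+j] = src[i*n+j]; // original block
            else if(i == j)    dst[i*17+j] = 1.0f;       // identity extension
            else               dst[i*17+j] = 0.0f;
        }
}
In the example below, the 17x17 matrix is simply written out by hand as zero_pivot_war.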
Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = 17;
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == 17)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,17 * 17 * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,17 * 17 * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
/* float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };
float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
*/
float zero_pivot_war[17*17] = {
0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
/*
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };
float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };
float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
*/
float result[n*n];
//Select matrix by setting "a"
float* a = zero_pivot_war;
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*17+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
invert(a,result,n);
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",result[i*n+j]);
fprintf(stdout,"\n");
}
}
int main()
{
test_invert();
// int n; scanf("%d",&n);
return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct based on a simple test elsewhere.
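If you want to verify it in code, a minimal host-side check (my own sketch; the tolerance is arbitrary) is to multiply the input by the computed inverse and compare against the identity:
#include <math.h>
// Returns 1 if A * invA is close to the n x n identity. A is stored with
// leading dimension lda (17 for zero_pivot_war); invA is a dense n x n result.
int check_inverse(const float* A, const float* invA, int n, int lda)
{
    for(int i = 0; i < n; i++)
        for(int j = 0; j < n; j++)
        {
            float s = 0.0f;
            for(int k = 0; k < n; k++)
                s += A[i*lda+k] * invA[k*n+j];
            float expect = (i == j) ? 1.0f : 0.0f;
            if(fabsf(s - expect) > 1e-5f) return 0;  // not the identity
        }
    return 1;
}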
I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).