我正在尝试使用cuSOLVER库实现Cholesky分解。我是一名初学者CUDA程序员,并且我一直指定块大小和网格大小,但是我无法找出程序员如何使用cuSOLVER函数显式设置它。

这里是文档:http://docs.nvidia.com/cuda/cusolver/index.html#introduction

QR分解是使用cuSOLVER库实现的(请参见此处的示例:http://docs.nvidia.com/cuda/cusolver/index.html#ormqr-example1),甚至没有设置上述两个参数。

总结一下,我有以下问题

  • 如何使用cuSOLVER库设置参数:块大小和网格大小?
  • 使用NVIDIA文档中提到的QR示例代码如何做?
  • 最佳答案

    Robert Crovella已经回答了这个问题。在这里,我仅提供一个完整的示例,说明如何使用cuSOLVER库提供的potrf函数轻松地进行Cholesky分解。
    Utilities.cuUtilities.cuh文件在this page维护,此处省略。该示例实现了CPU和GPU方法。

    #include "cuda_runtime.h"
    #include "device_launch_paraMeters.h"
    
    #include<iostream>
    #include <fstream>
    #include<iomanip>
    #include<stdlib.h>
    #include<stdio.h>
    #include<assert.h>
    
    #include <cusolverDn.h>
    #include <cublas_v2.h>
    #include <cuda_runtime_api.h>
    
    #include "Utilities.cuh"
    
    #define prec_save 10
    
    /******************************************/
    /* SET HERMITIAN POSITIVE DEFINITE MATRIX */
    /******************************************/
    // --- Credit to: https://math.stackexchange.com/questions/357980/how-to-generate-random-symmetric-positive-definite-matrices-using-matlab
    void setPDMatrix(double * __restrict h_A, const int N) {
    
        // --- Initialize random seed
        srand(time(NULL));
    
        double *h_A_temp = (double *)malloc(N * N * sizeof(double));
    
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                h_A_temp[i * N + j] = (float)rand() / (float)RAND_MAX;
    
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                h_A[i * N + j] = 0.5 * (h_A_temp[i * N + j] + h_A_temp[j * N + i]);
    
        for (int i = 0; i < N; i++) h_A[i * N + i] = h_A[i * N + i] + N;
    
    }
    
    /************************************/
    /* SAVE REAL ARRAY FROM CPU TO FILE */
    /************************************/
    template <class T>
    void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /************************************/
    /* SAVE REAL ARRAY FROM GPU TO FILE */
    /************************************/
    template <class T>
    void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    
        T *h_in = (T *)malloc(M * sizeof(T));
    
        gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    
        std::ofstream outfile;
        outfile.open(filename);
        for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
        outfile.close();
    
    }
    
    /********/
    /* MAIN */
    /********/
    int main(){
    
        const int N = 1000;
    
        // --- CUDA solver initialization
        cusolverDnHandle_t solver_handle;
        cusolveSafeCall(cusolverDnCreate(&solver_handle));
    
        // --- CUBLAS initialization
        cublasHandle_t cublas_handle;
        cublasSafeCall(cublasCreate(&cublas_handle));
    
        /***********************/
        /* SETTING THE PROBLEM */
        /***********************/
        // --- Setting the host, N x N matrix
        double *h_A = (double *)malloc(N * N * sizeof(double));
        setPDMatrix(h_A, N);
    
        // --- Allocate device space for the input matrix
        double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(double)));
    
        // --- Move the relevant matrix from host to device
        gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice));
    
        /****************************************/
        /* COMPUTING THE CHOLESKY DECOMPOSITION */
        /****************************************/
        // --- cuSOLVE input/output parameters/arrays
        int work_size = 0;
        int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    
        // --- CUDA CHOLESKY initialization
        cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, &work_size));
    
        // --- CUDA POTRF execution
        double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
        cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, work, work_size, devInfo));
        int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
        if (devInfo_h != 0) std::cout << "Unsuccessful potrf execution\n\n" << "devInfo = " << devInfo_h << "\n\n";
    
        // --- At this point, the lower triangular part of A contains the elements of L.
        /***************************************/
        /* CHECKING THE CHOLESKY DECOMPOSITION */
        /***************************************/
        saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\h_A.txt", N * N);
        saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\d_A.txt", N * N);
    
        cusolveSafeCall(cusolverDnDestroy(solver_handle));
    
        return 0;
    
    }
    

    编辑

    Cholesky分解要求相关矩阵为Hermitian和正定。可以通过How to generate random symmetric positive definite matrices using MATLAB?中的方法生成对称和正定矩阵。

    下面的Matlab代码可用于检查结果
    clear all
    close all
    clc
    
    warning off
    
    N = 1000;
    
    % --- Setting the problem solution
    x = ones(N, 1);
    
    load h_A.txt
    A = reshape(h_A, N, N);
    
    yMatlab = A * x;
    
    Lmatlab = chol(A, 'lower');
    
    xprime      = inv(Lmatlab) * yMatlab;
    xMatlab     = inv(Lmatlab') * xprime;
    
    fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
    
    load d_A.txt
    LCUDA = tril(reshape(d_A, N, N));
    
    fprintf('Percentage rms of Cholesky decompositions in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(Lmatlab - LCUDA).^2)) / sum(sum(abs(Lmatlab).^2))));
    
    load xCUDA.txt
    fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xCUDA - x).^2)) / sum(sum(abs(x).^2))));
    

    10-06 06:06