Calculating the all-pairs distances between points in different sets with CUDA

This article presents a way to calculate the all-pairs distances between points in different sets with CUDA; it should be a useful reference for readers tackling the same problem, so let's walk through it below.

Problem description



I am trying to implement a brute force distance computation algorithm in CUDA.

#define VECTOR_DIM 128
thrust::device_vector<float> feature_data_1;
feature_data_1.resize(VECTOR_DIM * 1000); // 1000 128 dimensional points
thrust::device_vector<float> feature_data_2;
feature_data_2.resize(VECTOR_DIM * 2000); // 2000 128 dimensional points

Now what I would like to do is to compute the L2 distances (sum of the squared differences) from every vector in the first matrix to every vector in the second matrix.

So, if array 1 is of size 1000 and array 2 is of size 2000, the result would be a floating point matrix of 1000*2000 in size.

I was wondering if there is a way to achieve this using Thrust algorithms alone.
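For what it's worth, a Thrust-only brute-force version is possible: a single thrust::transform over a counting iterator can compute one entry of the 1000*2000 result per index. The sketch below is only an illustration under assumptions of mine (the functor name pairwise_l2_sq is made up, and each point is assumed to be stored contiguously, i.e. component d of point j lives at j * VECTOR_DIM + d); it does VECTOR_DIM work per output element, so it is far slower than the cuBLAS-based answer that follows.

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>

#define VECTOR_DIM 128

// --- Squared L2 distance between point j of set 1 and point i of set 2
struct pairwise_l2_sq {
    const float *x;   // --- set 1: nx points, each VECTOR_DIM contiguous floats
    const float *y;   // --- set 2: ny points, each VECTOR_DIM contiguous floats
    int nx;
    pairwise_l2_sq(const float *x_, const float *y_, int nx_) : x(x_), y(y_), nx(nx_) {}
    __host__ __device__ float operator()(int k) const {
        const int i = k / nx;                       // --- index into set 2
        const int j = k % nx;                       // --- index into set 1
        float acc = 0.f;
        for (int d = 0; d < VECTOR_DIM; d++) {
            const float diff = x[j * VECTOR_DIM + d] - y[i * VECTOR_DIM + d];
            acc += diff * diff;
        }
        return acc;                                 // --- sum of squared differences, as in the question
    }
};

and then, after the setup code in the question:

thrust::device_vector<float> distances(1000 * 2000);
thrust::transform(thrust::counting_iterator<int>(0),
                  thrust::counting_iterator<int>(1000 * 2000),
                  distances.begin(),
                  pairwise_l2_sq(thrust::raw_pointer_cast(feature_data_1.data()),
                                 thrust::raw_pointer_cast(feature_data_2.data()),
                                 1000));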

Solution

Calculating the all-pairs distances between points in two different sets in CUDA can be solved by observing that

||x - y||^2 = ||x||^2 + ||y||^2 - 2*<x,y>

where || || is the L2 norm and <x,y> denotes the scalar product between x and y.
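The identity is nothing more than the expansion of the scalar product of the difference with itself:

||x - y||^2 = <x - y, x - y> = <x,x> - 2*<x,y> + <y,y> = ||x||^2 + ||y||^2 - 2*<x,y>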

The norms ||x|| and ||y|| can be calculated by approaches inspired by "Reduce matrix rows with CUDA", while the scalar products <x,y> can then be calculated as the matrix-matrix multiplication X*Y^T using cublas<t>gemm().

Below is a fully worked out implementation. Please note that two approaches are reported for the calculation of the norms || ||, one using cuBLAS cublas<t>gemv and one using Thrust's transform. For the problem size of your interest, I have experienced the following timings on my GT540M card:

Approach nr. 1    0.12ms
Approach nr. 2    0.59ms

#include <cublas_v2.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16

/***********************************************************/
/* SQUARED ABSOLUTE VALUE FUNCTOR - NEEDED FOR APPROACH #1 */
/***********************************************************/
struct abs2 {
    __host__ __device__ double operator()(const float &x) const { return x * x; }
};

// --- Required for approach #2
__device__ float *vals;

/******************************************/
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
struct row_reduction {

    const int Ncols;    // --- Number of columns

    row_reduction(int _Ncols) : Ncols(_Ncols) {}

    __device__ float operator()(float& x, int& y ) {
        float temp = 0.f;
        for (int i = 0; i<Ncols; i++)
            temp += vals[i + (y*Ncols)] * vals[i + (y*Ncols)];
        return temp;
    }
};

/************************************************/
/* KERNEL FUNCTION TO ASSEMBLE THE FINAL RESULT */
/************************************************/
__global__ void assemble_final_result(const float * __restrict__ d_norms_x_2, const float * __restrict__ d_norms_y_2, float * __restrict__ d_dots,
                                      const int NX, const int NY) {

    const int j = blockIdx.x * blockDim.x + threadIdx.x;   // --- Index over the NX points of X (grid x-dimension)
    const int i = blockIdx.y * blockDim.y + threadIdx.y;   // --- Index over the NY points of Y (grid y-dimension)

    if ((i < NY) && (j < NX)) d_dots[i * NX + j] = d_norms_x_2[j] + d_norms_y_2[i] - 2 * d_dots[i * NX + j];

}

/********/
/* MAIN */
/********/
int main()
{
    //const int Ndims = 128;        // --- Number of rows
    //const int NX  = 1000;     // --- Number of columns
    //const int NY  = 2000;     // --- Number of columns

    const int Ndims = 3;        // --- Number of rows
    const int NX    = 4;        // --- Number of columns
    const int NY    = 5;        // --- Number of columns

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrices allocation and initialization
    thrust::device_vector<float> d_X(Ndims * NX);
    thrust::device_vector<float> d_Y(Ndims * NY);
    for (size_t i = 0; i < d_X.size(); i++) d_X[i] = (float)dist(rng);
    for (size_t i = 0; i < d_Y.size(); i++) d_Y[i] = (float)dist(rng);

    TimingGPU timerGPU;

    // --- cuBLAS handle creation
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    /**********************************************/
    /* CALCULATING THE NORMS OF THE ELEMENTS OF X */
    /**********************************************/
    thrust::device_vector<float> d_norms_x_2(NX);

    // --- Approach nr. 1
    //timerGPU.StartCounter();
    thrust::device_vector<float> d_X_2(Ndims * NX);
    thrust::transform(d_X.begin(), d_X.end(), d_X_2.begin(), abs2());

    thrust::device_vector<float> d_ones(Ndims, 1.f);

    float alpha = 1.f;
    float beta  = 0.f;
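    // --- Multiplying the elementwise-squared matrix d_X_2 (Ndims x NX) by a vector of ones
    //     sums each column, so the gemv below returns ||x_j||^2 for every point x_j of X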
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NX, &alpha, thrust::raw_pointer_cast(d_X_2.data()), Ndims,
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_x_2.data()), 1));

    //printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());

    // --- Approach nr. 2
    //timerGPU.StartCounter();
 //   float *s_vals = thrust::raw_pointer_cast(&d_X[0]);
 //   gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
 //   thrust::transform(d_norms_x_2.begin(), d_norms_x_2.end(), thrust::counting_iterator<int>(0),  d_norms_x_2.begin(), row_reduction(Ndims));

    //printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());

    /**********************************************/
    /* CALCULATING THE NORMS OF THE ELEMENTS OF Y */
    /**********************************************/
    thrust::device_vector<float> d_norms_y_2(NY);

    thrust::device_vector<float> d_Y_2(Ndims * NY);
    thrust::transform(d_Y.begin(), d_Y.end(), d_Y_2.begin(), abs2());

    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NY, &alpha, thrust::raw_pointer_cast(d_Y_2.data()), Ndims,
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_y_2.data()), 1));


    /***********************************/
    /* CALCULATING THE SCALAR PRODUCTS */
    /***********************************/
    thrust::device_vector<float> d_dots(NX * NY);
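    // --- Computing X^T * Y (an NX x NY column-major matrix): d_dots[i * NX + j] = <x_j, y_i>.
    //     The kernel below overwrites these scalar products with the squared distances.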

    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NY, Ndims, &alpha,
                               thrust::raw_pointer_cast(d_X.data()), Ndims, thrust::raw_pointer_cast(d_Y.data()), Ndims, &beta,
                               thrust::raw_pointer_cast(d_dots.data()), NX));

    /*****************************/
    /* ASSEMBLE THE FINAL RESULT */
    /*****************************/

    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 dimGrid(iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
    assemble_final_result<<<dimGrid, dimBlock>>>(thrust::raw_pointer_cast(d_norms_x_2.data()), thrust::raw_pointer_cast(d_norms_y_2.data()),
                                                 thrust::raw_pointer_cast(d_dots.data()), NX, NY);

    for(int i = 0; i < NX * NY; i++) std::cout << d_dots[i] << "\n";

    return 0;
}

The Utilities.cu and Utilities.cuh files are maintained here and omitted here. The TimingGPU.cu and TimingGPU.cuh files are maintained here and are omitted as well.
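For readers who do not have those two files, the listing above only needs a handful of helpers from them (iDivUp, gpuErrchk, cublasSafeCall and the TimingGPU timer). A minimal set of stand-ins is sketched below; these are my own simplified versions, not the author's originals, and with them the program can be built with something like nvcc all_pairs.cu -lcublas.

// --- Hypothetical stand-ins for the omitted Utilities.cuh / TimingGPU.cuh
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>

// --- Round-up integer division, used to size the launch grid
inline int iDivUp(int a, int b) { return (a + b - 1) / b; }

// --- Abort on CUDA runtime errors
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// --- Abort on cuBLAS errors
#define cublasSafeCall(ans) { cublasAssert((ans), __FILE__, __LINE__); }
inline void cublasAssert(cublasStatus_t code, const char *file, int line) {
    if (code != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS error %d at %s:%d\n", (int)code, file, line);
        exit(1);
    }
}

// --- Simple event-based GPU timer with the interface used in main()
struct TimingGPU {
    cudaEvent_t start, stop;
    void StartCounter() {
        cudaEventCreate(&start); cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
    }
    float GetCounter() {
        cudaEventRecord(stop, 0); cudaEventSynchronize(stop);
        float ms = 0.f; cudaEventElapsedTime(&ms, start, stop);
        return ms;   // --- milliseconds
    }
};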

This concludes the article on calculating the all-pairs distances between points in different sets with CUDA. We hope the answer recommended here is helpful, and we appreciate your continued support!
