CUDA Examples

scalar add

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

// Kernel: each block adds one element; with a single block this is a scalar add.
__global__ void add(int *a, int *b, int *c)
{
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

int main(void)
{
    // Scalar add on the device.
    int a, b, c;
    int *da, *db, *dc;
    int size = sizeof(int);            // one scalar
    cudaMalloc((void**)&da, size);
    cudaMalloc((void**)&db, size);
    cudaMalloc((void**)&dc, size);
    a = 2;                             // example inputs (values not given in the original)
    b = 7;
    cudaMemcpy(da, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, &b, size, cudaMemcpyHostToDevice);
    add<<<1, 1>>>(da, db, dc);         // one block, one thread
    cudaMemcpy(&c, dc, size, cudaMemcpyDeviceToHost);
    std::cout << c << std::endl;
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    // Thrust host/device vector demo (from the Thrust quick-start guide).
    thrust::host_vector<int> H(4);     // H has storage for 4 integers
    // initialize individual elements
    H[0] = 14; H[1] = 20; H[2] = 38; H[3] = 46;
    // H.size() returns the size of vector H
    std::cout << "H has size " << H.size() << std::endl;
    // print contents of H
    for (int i = 0; i < H.size(); i++)
        std::cout << "H[" << i << "] = " << H[i] << std::endl;
    // resize H
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;
    // Copy host_vector H to device_vector D
    thrust::device_vector<int> D = H;
    // elements of D can be modified
    D[0] = 99;
    D[1] = 88;
    // print contents of D
    for (int i = 0; i < D.size(); i++)
        std::cout << "D[" << i << "] = " << D[i] << std::endl;
    // H and D are automatically deleted when the function returns
    return 0;
}
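None of the listings here check whether the CUDA runtime calls succeed. A minimal sketch of an error-checking wrapper, assuming a helper macro named CUDA_CHECK that is not part of the original code:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a readable message when a runtime call fails.
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            std::fprintf(stderr, "CUDA error: %s (%s:%d)\n",            \
                         cudaGetErrorString(err_), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                                    \
        }                                                               \
    } while (0)

// Example usage in the code above:
//   CUDA_CHECK(cudaMalloc((void**)&da, size));
//   CUDA_CHECK(cudaMemcpy(da, &a, size, cudaMemcpyHostToDevice));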
block or thread

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
#include <cstring>

const int N = 512;   // number of elements (example size; not given in the original)

// One element per block; the commented line is the one-thread-per-element variant.
__global__ void add(int *a, int *b, int *c)
{
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
    // c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int main(void)
{
    int *a, *b, *c;
    int *da, *db, *dc;
    int size = N * sizeof(int);
    cudaMalloc((void**)&da, size);
    cudaMalloc((void**)&db, size);
    cudaMalloc((void**)&dc, size);
    a = (int *)malloc(size);
    memset(a, 0, N * sizeof(int));     // or fill with random ints
    a[0] = 1;                          // example inputs (values not given in the original)
    a[1] = 2;
    b = (int *)malloc(size);
    memset(b, 0, N * sizeof(int));
    b[0] = 3;
    b[1] = 4;
    c = (int *)malloc(size);
    memset(c, 0, N * sizeof(int));
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    add<<<N, 1>>>(da, db, dc);         // N blocks of 1 thread; or add<<<1, N>>>(da, db, dc) for N threads
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);   // this copy already synchronizes; __syncthreads() is not needed here
    for (int i = 0; i < 4; i++)        // print the first few results
        std::cout << c[i] << std::endl;
    free(a); free(b); free(c);
    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}
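The kernel comment above points at the one-thread-per-element launch. A sketch of that variant, using an illustrative kernel name add_threads and the same N as above (a single block is limited to at most 1024 threads on current GPUs):

// One block of N threads: threadIdx.x indexes the element instead of blockIdx.x.
__global__ void add_threads(int *a, int *b, int *c)
{
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

// Launched as:
//   add_threads<<<1, N>>>(da, db, dc);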
block+thread
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
#include <cstring>

/*
   #define N (2048*2048)
   #define M 512          // THREADS_PER_BLOCK

   add<<<N/M, M>>>(d_a, d_b, d_c);

   N/M blocks used
   M threads / block
*/
const int N = 2048 * 2048;
const int M = 512;                      // threads per block

// Combine blocks and threads; guard against indices past the end of the arrays.
__global__ void add(int *a, int *b, int *c, int n)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)
        c[index] = a[index] + b[index];
    // c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int main(void)
{
    int *a, *b, *c;
    int *da, *db, *dc;
    int size = N * sizeof(int);
    cudaMalloc((void**)&da, size);
    cudaMalloc((void**)&db, size);
    cudaMalloc((void**)&dc, size);
    a = (int *)malloc(size);
    memset(a, 0, N * sizeof(int));      // or fill with random ints
    a[0] = 1;                           // example inputs (values not given in the original)
    a[1] = 2;
    b = (int *)malloc(size);
    memset(b, 0, N * sizeof(int));
    b[0] = 3;
    b[1] = 4;
    c = (int *)malloc(size);
    memset(c, 0, N * sizeof(int));
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    add<<<(N + M - 1) / M, M>>>(da, db, dc, N);   // round the block count up
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 4; i++)         // print the first few results
        std::cout << c[i] << std::endl;
    cudaDeviceSynchronize();
    free(a);
    free(b);
    free(c);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
}
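The last two listings include the Thrust headers without using them. For comparison, a sketch of the same element-wise add written with thrust::transform; the vector size and contents here are illustrative:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <iostream>

int main(void)
{
    const int n = 4;                      // illustrative size
    thrust::host_vector<int> ha(n, 0), hb(n, 0);
    ha[0] = 1; ha[1] = 2;                 // illustrative inputs
    hb[0] = 3; hb[1] = 4;

    // Copy to the device; allocation and transfers are handled by Thrust.
    thrust::device_vector<int> da = ha, db = hb, dc(n);

    // dc[i] = da[i] + db[i] for every i.
    thrust::transform(da.begin(), da.end(), db.begin(), dc.begin(),
                      thrust::plus<int>());

    for (int i = 0; i < n; i++)
        std::cout << dc[i] << std::endl;
    return 0;
}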