CUDA Examples

scalar add

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

// Kernel: each block adds one element; with a single block this is a scalar add.
__global__ void add(int *a, int *b, int *c)
{
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

int main(void)
{
    // Scalar add on the device.
    int a, b, c;
    int *da, *db, *dc;
    int size = sizeof(int);            // one scalar
    cudaMalloc((void**)&da, size);
    cudaMalloc((void**)&db, size);
    cudaMalloc((void**)&dc, size);
    a = 2;                             // example inputs (values not given in the original)
    b = 7;
    cudaMemcpy(da, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, &b, size, cudaMemcpyHostToDevice);
    add<<<1, 1>>>(da, db, dc);         // one block, one thread
    cudaMemcpy(&c, dc, size, cudaMemcpyDeviceToHost);
    std::cout << c << std::endl;
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);

    // Thrust host/device vector demo (from the Thrust quick-start guide).
    thrust::host_vector<int> H(4);     // H has storage for 4 integers
    // initialize individual elements
    H[0] = 14; H[1] = 20; H[2] = 38; H[3] = 46;
    // H.size() returns the size of vector H
    std::cout << "H has size " << H.size() << std::endl;
    // print contents of H
    for (int i = 0; i < H.size(); i++)
        std::cout << "H[" << i << "] = " << H[i] << std::endl;
    // resize H
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;
    // Copy host_vector H to device_vector D
    thrust::device_vector<int> D = H;
    // elements of D can be modified
    D[0] = 99;
    D[1] = 88;
    // print contents of D
    for (int i = 0; i < D.size(); i++)
        std::cout << "D[" << i << "] = " << D[i] << std::endl;
    // H and D are automatically deleted when the function returns
    return 0;
}
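None of the listings here check whether the CUDA runtime calls succeed. A minimal sketch of an error-checking wrapper, assuming a helper macro named CUDA_CHECK that is not part of the original code:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a readable message when a runtime call fails.
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            std::fprintf(stderr, "CUDA error: %s (%s:%d)\n",            \
                         cudaGetErrorString(err_), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                                    \
        }                                                               \
    } while (0)

// Example usage in the code above:
//   CUDA_CHECK(cudaMalloc((void**)&da, size));
//   CUDA_CHECK(cudaMemcpy(da, &a, size, cudaMemcpyHostToDevice));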
block or thread

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
#include <cstring>

const int N = 512;   // number of elements (example size; not given in the original)

// One element per block; the commented line is the one-thread-per-element variant.
__global__ void add(int *a, int *b, int *c)
{
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
    // c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int main(void)
{
    int *a, *b, *c;
    int *da, *db, *dc;
    int size = N * sizeof(int);
    cudaMalloc((void**)&da, size);
    cudaMalloc((void**)&db, size);
    cudaMalloc((void**)&dc, size);
    a = (int *)malloc(size);
    memset(a, 0, N * sizeof(int));     // or fill with random ints
    a[0] = 1;                          // example inputs (values not given in the original)
    a[1] = 2;
    b = (int *)malloc(size);
    memset(b, 0, N * sizeof(int));
    b[0] = 3;
    b[1] = 4;
    c = (int *)malloc(size);
    memset(c, 0, N * sizeof(int));
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    add<<<N, 1>>>(da, db, dc);         // N blocks of 1 thread; or add<<<1, N>>>(da, db, dc) for N threads
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);   // this copy already synchronizes; __syncthreads() is not needed here
    for (int i = 0; i < 4; i++)        // print the first few results
        std::cout << c[i] << std::endl;
    free(a); free(b); free(c);
    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}
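The kernel comment above points at the one-thread-per-element launch. A sketch of that variant, using an illustrative kernel name add_threads and the same N as above (a single block is limited to at most 1024 threads on current GPUs):

// One block of N threads: threadIdx.x indexes the element instead of blockIdx.x.
__global__ void add_threads(int *a, int *b, int *c)
{
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

// Launched as:
//   add_threads<<<1, N>>>(da, db, dc);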
block+thread
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
#include <cstring>

/*
   #define N (2048*2048)
   #define M 512          // THREADS_PER_BLOCK

   add<<<N/M, M>>>(d_a, d_b, d_c);

   N/M blocks used
   M threads / block
*/
const int N = 2048 * 2048;
const int M = 512;                      // threads per block

// Combine blocks and threads; guard against indices past the end of the arrays.
__global__ void add(int *a, int *b, int *c, int n)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)
        c[index] = a[index] + b[index];
    // c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

int main(void)
{
    int *a, *b, *c;
    int *da, *db, *dc;
    int size = N * sizeof(int);
    cudaMalloc((void**)&da, size);
    cudaMalloc((void**)&db, size);
    cudaMalloc((void**)&dc, size);
    a = (int *)malloc(size);
    memset(a, 0, N * sizeof(int));      // or fill with random ints
    a[0] = 1;                           // example inputs (values not given in the original)
    a[1] = 2;
    b = (int *)malloc(size);
    memset(b, 0, N * sizeof(int));
    b[0] = 3;
    b[1] = 4;
    c = (int *)malloc(size);
    memset(c, 0, N * sizeof(int));
    cudaMemcpy(da, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, size, cudaMemcpyHostToDevice);
    add<<<(N + M - 1) / M, M>>>(da, db, dc, N);   // round the block count up
    cudaMemcpy(c, dc, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 4; i++)         // print the first few results
        std::cout << c[i] << std::endl;
    cudaDeviceSynchronize();
    free(a);
    free(b);
    free(c);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
}
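The last two listings include the Thrust headers without using them. For comparison, a sketch of the same element-wise add written with thrust::transform; the vector size and contents here are illustrative:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <iostream>

int main(void)
{
    const int n = 4;                      // illustrative size
    thrust::host_vector<int> ha(n, 0), hb(n, 0);
    ha[0] = 1; ha[1] = 2;                 // illustrative inputs
    hb[0] = 3; hb[1] = 4;

    // Copy to the device; allocation and transfers are handled by Thrust.
    thrust::device_vector<int> da = ha, db = hb, dc(n);

    // dc[i] = da[i] + db[i] for every i.
    thrust::transform(da.begin(), da.end(), db.begin(), dc.begin(),
                      thrust::plus<int>());

    for (int i = 0; i < n; i++)
        std::cout << dc[i] << std::endl;
    return 0;
}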