Cuda版本不工作，而串行工作

本文介绍了Cuda版本不工作，而串行工作的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我的下面的简单的Cuda代码返回一个不正确的结果（所有多边形在结尾有0个顶点），而在C ++中运行的相同的代码运行良好。问题是令人尴尬的并行：没有通信，没有syncthreads等，和Cuda内存分配是成功的。即使我的虚拟变量存储输入数组的内容为调试目的为0为Cuda版本。没有访问超出边界，因为我的数组足够大。用Cuda中的循环替换memcpy不会改变任何内容。

我真的不明白发生了什么...任何想法？谢谢！

Cuda代码：

  #include< stdio。 h。 
 #include< iostream> 
 #include< stdlib.h> 
 #include< cuda.h> 
 
 class Point2D {
 public：
 __device__ Point2D（double xx = 0，double yy = 0）：x（xx），y（yy）{}; 
 double x，y; 
}; 
 
 __device__ double dot（const Point2D& A，const Point2D& B）{
 return A.x * B.x + A.y * B.y; 
} 
 __device__ Point2D operator *（double a，const Point2D& P）{
 return Point2D（a * P.x，a * P.y）; 
} 
 __device__ Point2D operator +（Point2D A，const Point2D& B）{
 return Point2D（A.x + B.x，A.y + B.y）; 
} 
 __device__ Point2D operator-（Point2D A，const Point2D& B）{
 return Point2D（A.x  -  B.x，A.y  -  B.y）; 
} 
 __device__ Point2D inter（const Point2D& A，const Point2D& B，const Point2D& C，const Point2D& D）{//与CD的调解符* b $ b Point2D M = 0.5 *（C + D）; 
 return A  - （dot（A-M，D-C）/ dot（B-A，D-C））*（B-A）; 
} 
 
类Polygon {
 public：
 __device__ Polygon（）：nbpts（0）{}; 
 __device__ void addPts（Point2D pt）{
 pts [nbpts] = pt; 
 nbpts ++; 
}; 
 __device__多边形& operator =（const Polygon& rhs）{
 nbpts = rhs.nbpts; 
 dummy = rhs.dummy; 
 memcpy（pts，rhs.pts，nbpts * sizeof（Point2D））; 
 return * this; 
} 
 __device__ void cut（const Point2D& inside_pt，const Point2D& outside_pt）{
 
 int new_nbpts = 0; 
 Point2D newpts [128]; 
 Point2D AB（outside_pt-inside_pt）; 
 Point2D M（0.5 *（outside_pt + inside_pt））; 
 double ABM = dot（AB，M）; 
 
 Point2D S = pts [nbpts-1]; 
 
 for（int i = 0; i  
 Point2D E = pts [i]; 
 
 double ddot = -ABM + dot（AB，E）; 
 if（ddot< 0）{// E在剪辑边缘
 double ddot2 = -ABM + dot（AB，S）; 
 if（ddot2> 0）{
 newpts [new_nbpts] = inter（S，E，inside_pt，outside_pt）; 
 new_nbpts ++; 
} 
 newpts [new_nbpts] = E; 
 new_nbpts ++; 
} else {
 double ddot2 = -ABM + dot（AB，S）; 
 if（ddot2< 0）{
 newpts [new_nbpts] = inter（S，E，inside_pt，outside_pt）; 
 new_nbpts ++; 
} 
} 
 S = E; 
} 
 
 memcpy（pts，newpts，min（128，new_nbpts）* sizeof（Point2D））; 
 nbpts = new_nbpts; 
} 
 
 // private：
 Point2D pts [128]; 
 int nbpts; 
 float dummy; 
}; 
 
 
 __global__ void cut_poly（float * a，Polygon * polygons，int N）
 {
 int idx = blockIdx.x * blockDim.x + threadIdx.x ; 
 if（idx> = N / 2）return; 
 
多边形
 pol.addPts（Point2D（0.，0。））; 
 pol.addPts（Point2D（1.，0。））; 
 pol.addPts（Point2D（1.，1。））; 
 pol.addPts（Point2D（0.，1。））; 
 
 Point2D curPt（a [2 * idx]，a [2 * idx + 1]）; 
 
 for（int i = 0; i  Point2D other_pt（a [2 * i]，a [2 * i + 1]）; 
 pol.cut（curPt，other_pt）; 
} 
 pol.dummy = a [idx] 
 
 polygons [idx] = pol; 
} 
 
 
 
 int main（int argc，unsigned char * argv []）
 {
 
 const int N = 100; 
 float a_h [N]，* a_d; 
多边形p_h [N / 2]，* p_d; 
 
 size_t size = N * sizeof（float）; 
 size_t size_pol = N / 2 * sizeof（Polygon）; 
 
 cudaError_t err = cudaMalloc（（void **）& a_d，size）; 
 cudaError_t err2 = cudaMalloc（（void **）& p_d，size_pol）; 
 
 for（int i = 0; i  cudaMemcpy（a_d，a_h，size，cudaMemcpyHostToDevice）; 
 
 int block_size = 4; 
 int n_blocks = N / block_size +（N％block_size == 0？0：1）; 
 cut_poly<<< n_blocks，block_size>>> （a_d，p_d，N）; 
 
 cudaMemcpy（a_h，a_d，sizeof（float）* N，cudaMemcpyDeviceToHost）; 
 cudaMemcpy（p_h，p_d，sizeof（Polygon）* N / 2，cudaMemcpyDeviceToHost）; 
 
 for（int i = 0; i  printf（％f \t％f \t％u\\\
，a_h [i ]，p_h [i] .dummy，p_h [i] .nbpts）; 
 
 cudaFree（a_d）; 
 cudaFree（p_d）; 
 
 
 return 0; 
}

在C ++中相同的代码可以正常工作：

  #include< stdio.h> 
 #include< iostream> 
 #include< stdlib.h> 
 
 class Point2D {
 public：
 Point2D（double xx = 0，double yy = 0）：x（xx），y（yy）{}; 
 double x，y; 
}; 
 
双点（const Point2D& A，const Point2D& B）{
 return A.x * B.x + A.y * B.y; 
} 
 Point2D operator *（double a，const Point2D& P）{
 return Point2D（a * P.x，a * P.y）; 
} 
 Point2D operator +（Point2D A，const Point2D& B）{
 return Point2D（A.x + B.x，A.y + B.y）; 
} 
 Point2D operator-（Point2D A，const Point2D& B）{
 return Point2D（A.x- B.x，A.y-B.y）; 
} 
 Point2D inter（const Point2D& A，const Point2D& B，const Point2D& C，const Point2D& D）{//与CD $ b的调解符*相交$ b Point2D M = 0.5 *（C + D）; 
 return A  - （dot（A-M，D-C）/ dot（B-A，D-C））*（B-A）; 
} 
 
类Polygon {
 public：
 Polygon（）：nbpts（0）{}; 
 void addPts（Point2D pt）{
 pts [nbpts] = pt; 
 nbpts ++; 
}; 
 Polygon& operator =（const Polygon&rhs）{
 nbpts = rhs.nbpts; 
 dummy = rhs.dummy; 
 memcpy（pts，rhs.pts，nbpts * sizeof（Point2D））; 
 return * this; 
} 
 void cut（const Point2D& inside_pt，const Point2D& outside_pt）{
 
 int new_nbpts = 0; 
 Point2D newpts [128]; 
 Point2D AB（outside_pt-inside_pt）; 
 Point2D M（0.5 *（outside_pt + inside_pt））; 
 double ABM = dot（AB，M）; 
 
 Point2D S = pts [nbpts-1]; 
 
 for（int i = 0; i  
 Point2D E = pts [i]; 
 
 double ddot = -ABM +点（AB，E）; 
 if（ddot< 0）{// E在剪辑边缘
 double ddot2 = -ABM + dot（AB，S）; 
 if（ddot2> 0）{
 newpts [new_nbpts] = inter（S，E，inside_pt，outside_pt）; 
 new_nbpts ++; 
} 
 newpts [new_nbpts] = E; 
 new_nbpts ++; 
} else {
 double ddot2 = -ABM + dot（AB，S）; 
 if（ddot2< 0）{
 newpts [new_nbpts] = inter（S，E，inside_pt，outside_pt）; 
 new_nbpts ++; 
} 
} 
 S = E; 
} 
 
 memcpy（pts，newpts，std :: min（128，new_nbpts）* sizeof（Point2D））; 
 / * for（int i = 0; i  pts [i] = newpts [i] 
} * / 
 nbpts = new_nbpts; 
} 
 
 // private：
 Point2D pts [128]; 
 int nbpts; 
 float dummy; 
}; 
 
 
 void cut_poly（int idx，float * a，Polygon * polygons，int N）
 {
 if（idx> = N / 
 
多边形
 pol.addPts（Point2D（0.，0。））; 
 pol.addPts（Point2D（1.，0。））; 
 pol.addPts（Point2D（1.，1。））; 
 pol.addPts（Point2D（0.，1。））; 
 
 Point2D curPt（a [2 * idx]，a [2 * idx + 1]）; 
 
 for（int i = 0; i  if（idx == i）continue; 
 Point2D other_pt（a [2 * i]，a [2 * i + 1]）; 
 pol.cut（curPt，other_pt）; 
} 
 pol.dummy = a [idx]; 
 
 polygons [idx] = pol; 
} 
 
 
 
 int main（int argc，unsigned char * argv []）
 {
 
 const int N = 100; //数组中的元素数量
 float a_h [N]，* a_d; //指向主机&设备数组
多边形p_h [N / 2]，* p_d; 
 
 for（int i = 0; i  
 for（int idx = 0; idx< N; idx ++）
 cut_poly（idx，a_h，p_h，N） 
 
 for（int i = 0; i  printf（％f \t％f \t％u\\\
，a_h [i ]，p_h [i] .dummy，p_h [i] .nbpts）; 
 
 return 0; 
}

解决方案

我想你可以忽略我的大部分评论。我错误地在一台机器上设置了CUDA 3.2，它的行为不同的内核启动失败的行。当我切换到CUDA 4.1和CUDA 5.0事情开始有意义。很抱歉我的困惑在那里。

无论如何，过去，我很快注意到，你的CPU和GPU实现之间有一个区别。在这里（查看CPU代码）：

  void cut_poly（int idx，float * a，Polygon * polygons，int N ）
 {
 if（idx> = N / 2）return; 
 
多边形
 pol.addPts（Point2D（0，0。））; 
 pol.addPts（Point2D（1.，0。））; 
 pol.addPts（Point2D（1.，1。））; 
 pol.addPts（Point2D（0.，1。））; 
 
 Point2D curPt（a [2 * idx]，a [2 * idx + 1]）; 
 
 for（int i = 0; i  if（idx == i）continue; / *注意这个行从你的GPU代码丢失* / 
 Point2D other_pt（a [2 * i]，a [2 * i + 1]）; 
 pol.cut（curPt，other_pt）; 
} 
 pol.dummy = a [idx]; 
 
 polygons [idx] = pol; 
}

参考我已经添加注释到上面，如果你添加精确的代码到你的GPU代码中的相应位置在 cut_poly 内核，然后对于我反正GPU代码产生与CPU代码相同的打印结果。 >

我会做的另一个观察是，你不必要地运行只有4个线程的块。没有什么错误，当你正在编写的代码中的扭结，但一旦你有它的运行生产的目的，你很可能想要定位一个更高的数字，如256，并确保选择一个数字，是为了获得最佳性能。

为了回应评论中发布的问题，我相信数据正在被正确复制，但很可能是您不能在主机上正确地访问它。（我不知道你如何确定我的数组没有正确返回到主机）。大多数类定义仅限于 __ device __ 。因此，很难访问主机上类中的结构（例如 Polygon 类中的 Point2D pts ）。我在这里插入修改的代码，我认为这表明数据正在传回主机：

  #include< stdio.h> 
 #include< iostream> 
 #include< stdlib.h> 
 // #include< cuda.h> 
 
 #define cudaCheckErrors（msg）\ 
 do {\ 
 cudaError_t __err = cudaGetLastError（）; \ 
 if（__err！= cudaSuccess）{\ 
 fprintf（stderr，致命错误：％s（％s在％s：％d）\\\
，\ 
 msg，cudaGetErrorString（__ err），\ 
 __FILE__，__LINE__）; \ 
 fprintf（stderr，*** FAILED  -  ABORTING\\\
）; \ 
 exit（1）; \ 
} \ 
} while（0）
 
 
 class Point2D {
 public：
 __host__ __device__ Point2D（double xx = 0，double yy = 0）：x（xx），y（yy）{}; 
 double x，y; 
}; 
 
 __host__ __device__ double dot（const Point2D& A，const Point2D& B）{
 return A.x * B.x + A.y * B.y; 
} 
 __host__ __device__ Point2D operator *（double a，const Point2D& P）{
 return Point2D（a * P.x，a * P.y）; 
} 
 __host__ __device__ Point2D operator +（Point2D A，const Point2D& B）{
 return Point2D（A.x + B.x，A.y + B.y）; 
} 
 __host__ __device__ Point2D operator-（Point2D A，const Point2D& B）{
 return Point2D（A.x  -  B.x，A.y  -  B.y）; 
} 
 __host__ __device__ Point2D inter（const Point2D& A，const Point2D& B，const Point2D& C，const Point2D& D）{//与CD的介体*相交
 Point2D M = 0.5 *（C + D）; 
 return A  - （dot（A-M，D-C）/ dot（B-A，D-C））*（B-A）; 
} 
 
类Polygon {
 public：
 __host__ __device__ Polygon（）：nbpts（0）{}; 
 __host__ __device__ void addPts（Point2D pt）{
 pts [nbpts] = pt; 
 nbpts ++; 
}; 
 __host__ __device__多边形& operator =（const Polygon& rhs）{
 nbpts = rhs.nbpts; 
 dummy = rhs.dummy; 
 memcpy（pts，rhs.pts，nbpts * sizeof（Point2D））; 
 return * this; 
} 
 __host__ __device__ Point2D getpoint（unsigned i）{
 if（i< 128）return pts [i] 
 else return pts [0]; 
} 
 __host__ __device__ void cut（const Point2D& inside_pt，const Point2D& outside_pt）{
 
 int new_nbpts = 0; 
 Points2D newpts [128]; 
 Point2D AB（outside_pt-inside_pt）; 
 Point2D M（0.5 *（outside_pt + inside_pt））; 
 double ABM = dot（AB，M）; 
 
 Point2D S = pts [nbpts-1]; 
 
 for（int i = 0; i  
 Point2D E = pts [i]; 
 
 double ddot = -ABM + dot（AB，E）; 
 if（ddot< 0）{// E在剪辑边缘
 double ddot2 = -ABM + dot（AB，S）; 
 if（ddot2> 0）{
 newpts [new_nbpts] = inter（S，E，inside_pt，outside_pt）; 
 new_nbpts ++; 
} 
 newpts [new_nbpts] = E; 
 new_nbpts ++; 
} else {
 double ddot2 = -ABM + dot（AB，S）; 
 if（ddot2< 0）{
 newpts [new_nbpts] = inter（S，E，inside_pt，outside_pt）; 
 new_nbpts ++; 
} 
} 
 S = E; 
} 
 
 memcpy（pts，newpts，min（128，new_nbpts）* sizeof（Point2D））; 
 nbpts = new_nbpts; 
} 
 
 // private：
 Point2D pts [128]; 
 int nbpts; 
 float dummy; 
}; 
 
 
 __global__ void cut_poly（float * a，Polygon * polygons，int N）
 {
 int idx = blockIdx.x * blockDim.x + threadIdx.x ; 
 if（idx> = N / 2）return; 
 
多边形
 pol.addPts（Point2D（0.，0。））; 
 pol.addPts（Point2D（1.，0。））; 
 pol.addPts（Point2D（1.，1。））; 
 pol.addPts（Point2D（0.,1。））; 
 
 Point2D curPt（a [2 * idx]，a [2 * idx +1]）; 
 
 for（int i = 0; i  if（idx == i）continue; 
 Point2D other_pt（a [2 * i]，a [2 * i + 1]）; 
 pol.cut（curPt，other_pt）; 
} 
 pol.dummy = pol.getpoint（0）.x; 
 
 polygons [idx] = pol; 
} 
 
 
 
 int main（int argc，unsigned char * argv []）
 {
 
 const int N = 100; 
 float a_h [N]，* a_d; 
多边形p_h [N / 2]，* p_d; 
 
 size_t size = N * sizeof（float） 
 size_t size_pol = N / 2 * sizeof（Polygon）; 
 
 cudaMalloc（（void **）& a_d，size）; 
 cudaCheckErrors（cm1）; 
 cudaMalloc（（void **）& p_d，size_pol）; 
 cudaCheckErrors（cm2）; 
 
 for（int i = 0; i  cudaMemcpy（a_d，a_h，size，cudaMemcpyHostToDevice）; 
 cudaCheckErrors（cmcp1）; 
 
 int block_size = 128; 
 int n_blocks = N / block_size +（N％block_size == 0？0：1）; 
 cut_poly<<< n_blocks，block_size>>> （a_d，p_d，N）; 
 cudaCheckErrors（kernel）; 
 
 cudaMemcpy（a_h，a_d，sizeof（float）* N，cudaMemcpyDeviceToHost）; 
 cudaCheckErrors（cmcp2）; 
 cudaMemcpy（p_h，p_d，sizeof（Polygon）* N / 2，cudaMemcpyDeviceToHost）; 
 cudaCheckErrors（cmcp3）; 
 
 for（int i = 0; i  printf（％f \t％f \t％f \t％u\\\
 ，a_h [i]，p_h [i] .dummy，p_h [i] .getpoint（0）.x，p_h [i] .nbpts）; 
 
 cudaFree（a_d）; 
 cudaFree（p_d）; 
 
 
 return 0; 
}

我建议您使用发布新问题。

my following minimalist Cuda code returns an incorrect result (all polygons have 0 vertices at the end) while the same code running in serial in C++ is working well. The problem is embarrassingly parallel : no communication, no syncthreads etc., and the Cuda memory allocations are sucessful. Even my dummy variable that stores the content of the input array for debug purpose is 0 for the Cuda version. There is no access out of bounds since my arrays are largely large enough. Replacing the memcpy by a loop in Cuda doesn't change anything.
I really don't understand what happens... any idea ? Thanks!

Cuda code:

    #include <stdio.h>
    #include <iostream>
    #include <stdlib.h>
    #include <cuda.h>

    class Point2D {
     public:
     __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
     double x, y;
    };

    __device__ double dot(const Point2D &A, const Point2D &B) {
     return A.x*B.x + A.y*B.y;
    }
    __device__ Point2D operator*(double a, const Point2D &P) {
     return Point2D(a*P.x, a*P.y);
    }
    __device__ Point2D operator+(Point2D A, const Point2D &B) {
     return Point2D(A.x + B.x, A.y + B.y);
    }
    __device__ Point2D operator-(Point2D A, const Point2D &B) {
     return Point2D(A.x - B.x, A.y - B.y);
    }
    __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
      Point2D M = 0.5*(C+D);
      return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
    }

    class Polygon {
    public:
      __device__ Polygon():nbpts(0){};
      __device__ void addPts(Point2D pt) {
        pts[nbpts] = pt;
        nbpts++;
      };
      __device__ Polygon& operator=(const Polygon& rhs) {
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
      }
      __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {

        int new_nbpts = 0;
        Point2D newpts[128];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {

          Point2D E = pts[i];

          double ddot = -ABM + dot(AB, E);
          if (ddot<0) { // E inside clip edge
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2>0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
            newpts[new_nbpts] = E;
            new_nbpts++;
          } else {
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2<0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
          }
          S = E;
        }

        memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
        nbpts = new_nbpts;
      }

    //private:
     Point2D pts[128];
     int nbpts;
     float dummy;
    };


    __global__ void cut_poly(float *a, Polygon* polygons, int N)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx>=N/2) return;

      Polygon pol;
      pol.addPts(Point2D(0.,0.));
      pol.addPts(Point2D(1.,0.));
      pol.addPts(Point2D(1.,1.));
      pol.addPts(Point2D(0.,1.));

      Point2D curPt(a[2*idx], a[2*idx+1]);

      for (int i=0; i<N/2; i++) {
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }
      pol.dummy = a[idx];

      polygons[idx] = pol;
    }



    int main(int argc, unsigned char* argv[])
    {

      const int N = 100;
      float a_h[N], *a_d;
      Polygon p_h[N/2], *p_d;

      size_t size = N * sizeof(float);
      size_t size_pol = N/2 * sizeof(Polygon);

      cudaError_t err  = cudaMalloc((void **) &a_d, size);
      cudaError_t err2 = cudaMalloc((void **) &p_d, size_pol);

      for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
      cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

      int block_size = 4;
      int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
      cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);

      cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
      cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);

      for (int i=0; i<N/2; i++)
       printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);

      cudaFree(a_d);
      cudaFree(p_d);


        return 0;
    }

Same code in C++ that works properly:

#include <stdio.h>
#include <iostream>
#include <stdlib.h>

class Point2D {
 public:
 Point2D(double xx=0, double yy=0):x(xx),y(yy){};
 double x, y;
};

double dot(const Point2D &A, const Point2D &B) {
 return A.x*B.x + A.y*B.y;
}
Point2D operator*(double a, const Point2D &P) {
 return Point2D(a*P.x, a*P.y);
}
Point2D operator+(Point2D A, const Point2D &B) {
 return Point2D(A.x + B.x, A.y + B.y);
}
Point2D operator-(Point2D A, const Point2D &B) {
 return Point2D(A.x - B.x, A.y - B.y);
}
Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
  Point2D M = 0.5*(C+D);
  return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
}

class Polygon {
public:
  Polygon():nbpts(0){};
  void addPts(Point2D pt) {
    pts[nbpts] = pt;
    nbpts++;
  };
  Polygon& operator=(const Polygon& rhs) {
    nbpts = rhs.nbpts;
    dummy = rhs.dummy;
    memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
    return *this;
  }
  void cut(const Point2D &inside_pt, const Point2D &outside_pt) {

    int new_nbpts = 0;
    Point2D newpts[128];
    Point2D AB(outside_pt-inside_pt);
    Point2D M(0.5*(outside_pt+inside_pt));
    double ABM = dot(AB, M);

    Point2D S = pts[nbpts-1];

    for (int i=0; i<nbpts; i++) {

      Point2D E = pts[i];

      double ddot = -ABM + dot(AB, E);
      if (ddot<0) { // E inside clip edge
        double ddot2 = -ABM + dot(AB, S);
        if (ddot2>0) {
           newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
           new_nbpts++;
        }
        newpts[new_nbpts] = E;
        new_nbpts++;
      } else {
        double ddot2 = -ABM + dot(AB, S);
        if (ddot2<0) {
           newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
           new_nbpts++;
        }
      }
        S = E;
    }

    memcpy(pts, newpts, std::min(128, new_nbpts)*sizeof(Point2D));
    /*for (int i=0; i<128; i++) {
      pts[i] = newpts[i];
    }*/
    nbpts = new_nbpts;
  }

//private:
 Point2D pts[128];
 int nbpts;
 float dummy;
};


void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
  if (idx>=N/2) return;

  Polygon pol;
  pol.addPts(Point2D(0.,0.));
  pol.addPts(Point2D(1.,0.));
  pol.addPts(Point2D(1.,1.));
  pol.addPts(Point2D(0.,1.));

  Point2D curPt(a[2*idx], a[2*idx+1]);

  for (int i=0; i<N/2; i++) {
    if (idx==i) continue;
    Point2D other_pt(a[2*i], a[2*i+1]);
    pol.cut(curPt, other_pt);
  }
  pol.dummy = a[idx];

  polygons[idx] = pol;
}



int main(int argc, unsigned char* argv[])
{

  const int N = 100;  // Number of elements in arrays
  float a_h[N], *a_d;  // Pointer to host & device arrays
  Polygon p_h[N/2], *p_d;

  for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;

  for (int idx=0; idx<N; idx++)
    cut_poly(idx, a_h, p_h, N);

  for (int i=0; i<N/2; i++)
   printf("%f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);

   return 0;
}

解决方案

Well I guess you can disregard most of my comments. I was by mistake working on a machine I had set up with CUDA 3.2 and it was behaving differently along the lines of the kernel launch failure. When I switched to CUDA 4.1 and CUDA 5.0 things started to make sense. Apologies for my confusion there.

Anyway after getting past that, I pretty quickly noticed that there is a difference between your CPU and GPU implementations. Specifically here (looking at the CPU code):

void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
  if (idx>=N/2) return;

  Polygon pol;
  pol.addPts(Point2D(0.,0.));
  pol.addPts(Point2D(1.,0.));
  pol.addPts(Point2D(1.,1.));
  pol.addPts(Point2D(0.,1.));

  Point2D curPt(a[2*idx], a[2*idx+1]);

  for (int i=0; i<N/2; i++) {
    if (idx==i) continue;     /*   NOTE THIS LINE MISSING FROM YOUR GPU CODE */
    Point2D other_pt(a[2*i], a[2*i+1]);
    pol.cut(curPt, other_pt);
  }
  pol.dummy = a[idx];

  polygons[idx] = pol;
}

Referring to the line I have added the comment to above, if you add that exact line of code to the corresponding spot in your GPU code in the cut_poly kernel, then for me anyway the GPU code produces the same printed result as the CPU code.

One other observation I would make is that you are needlessly running blocks with just 4 threads. Nothing wrong with that while you're working out the kinks in the code, but once you have it running for "production" purposes, you will most likely want to target a higher number like 256, and be sure to choose a number that is an integer multiple of 32, for best performance.

In response to a question posted in the comments, I believe that the data is being copied properly, but most likely you are not accessing it correctly on the host. (I don't know how you're determining that "my array is not properly returned to the host"). Most of your class definitions were __device__ only. As a result, it's difficult to access structures within classes on the host (e.g. the Point2D pts class within the Polygon class). I'm inserting modified code here which I think demonstrates that the data is being transferred back to the host:

    #include <stdio.h>
    #include <iostream>
    #include <stdlib.h>
//    #include <cuda.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


    class Point2D {
     public:
     __host__ __device__ Point2D(double xx=0, double yy=0):x(xx),y(yy){};
     double x, y;
    };

    __host__ __device__ double dot(const Point2D &A, const Point2D &B) {
     return A.x*B.x + A.y*B.y;
    }
    __host__ __device__ Point2D operator*(double a, const Point2D &P) {
     return Point2D(a*P.x, a*P.y);
    }
    __host__ __device__ Point2D operator+(Point2D A, const Point2D &B) {
     return Point2D(A.x + B.x, A.y + B.y);
    }
    __host__ __device__ Point2D operator-(Point2D A, const Point2D &B) {
     return Point2D(A.x - B.x, A.y - B.y);
    }
    __host__ __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) { //intersects AB by *the mediator* of CD
      Point2D M = 0.5*(C+D);
      return A - (dot(A-M, D-C)/dot(B-A, D-C)) * (B-A);
    }

    class Polygon {
    public:
      __host__ __device__ Polygon():nbpts(0){};
      __host__ __device__ void addPts(Point2D pt) {
        pts[nbpts] = pt;
        nbpts++;
      };
      __host__ __device__ Polygon& operator=(const Polygon& rhs) {
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
      }
      __host__ __device__ Point2D getpoint(unsigned i){
        if (i<128) return pts[i];
        else return pts[0];
        }
      __host__ __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {

        int new_nbpts = 0;
        Point2D newpts[128];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {

          Point2D E = pts[i];

          double ddot = -ABM + dot(AB, E);
          if (ddot<0) { // E inside clip edge
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2>0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
            newpts[new_nbpts] = E;
            new_nbpts++;
          } else {
            double ddot2 = -ABM + dot(AB, S);
            if (ddot2<0) {
               newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
               new_nbpts++;
            }
          }
          S = E;
        }

        memcpy(pts, newpts, min(128, new_nbpts)*sizeof(Point2D));
        nbpts = new_nbpts;
      }

    //private:
     Point2D pts[128];
     int nbpts;
     float dummy;
    };


    __global__ void cut_poly(float *a, Polygon* polygons, int N)
    {
      int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx>=N/2) return;

      Polygon pol;
      pol.addPts(Point2D(0.,0.));
      pol.addPts(Point2D(1.,0.));
      pol.addPts(Point2D(1.,1.));
      pol.addPts(Point2D(0.,1.));

      Point2D curPt(a[2*idx], a[2*idx+1]);

      for (int i=0; i<N/2; i++) {
        if (idx==i) continue;
        Point2D other_pt(a[2*i], a[2*i+1]);
        pol.cut(curPt, other_pt);
      }
      pol.dummy = pol.getpoint(0).x;

      polygons[idx] = pol;
    }



    int main(int argc, unsigned char* argv[])
    {

      const int N = 100;
      float a_h[N], *a_d;
      Polygon p_h[N/2], *p_d;

      size_t size = N * sizeof(float);
      size_t size_pol = N/2 * sizeof(Polygon);

      cudaMalloc((void **) &a_d, size);
      cudaCheckErrors("cm1");
      cudaMalloc((void **) &p_d, size_pol);
      cudaCheckErrors("cm2");

      for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
      cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
      cudaCheckErrors("cmcp1");

      int block_size = 128;
      int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
      cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
      cudaCheckErrors("kernel");

      cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
      cudaCheckErrors("cmcp2");
      cudaMemcpy(p_h, p_d, sizeof(Polygon)*N/2, cudaMemcpyDeviceToHost);
      cudaCheckErrors("cmcp3");

      for (int i=0; i<N/2; i++)
       printf("%f \t %f \t %f \t %u\n", a_h[i], p_h[i].dummy, p_h[i].getpoint(0).x, p_h[i].nbpts);

      cudaFree(a_d);
      cudaFree(p_d);


        return 0;
    }

I would suggest using posting new questions for these things.

这篇关于Cuda版本不工作，而串行工作的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持！