好吧,所以我正在尝试为cuda开发2D阵列,但这正在变得很痛苦。错误在标题中,并发生在cudaMemcpy2D处。我认为这个问题对训练有素的眼睛很明显。预先感谢您提供的任何帮助,我已经超越了正在学习指针的班级。
#include <cuda_runtime.h>
#include <iostream>
#pragma comment (lib, "cudart")
/* Program purpose: pass a 10 x 10 matrix and multiply it by another 10x10 matrix */
float matrix1_host[100][100];
float matrix2_host[100][100];
float* matrix1_device;
float* matrix2_device;
size_t pitch;
cudaError_t err;
__global__ void addMatrix(float* matrix1_device,float* matrix2_device, size_t pitch){
// How this works
// first we start to cycle through the rows by using the thread's ID
// then we calculate an address from the address of a point in the row, by adding the pitch (size of each row) and * it by
// the amount of rows we've already completed, then we can use that address of somewhere at a start of a row to get the colums
// in the row with a normal array grab.
int r = threadIdx.x;
float* rowofMat1 = (float*)((char*)matrix1_device + r * pitch);
float* rowofMat2 = (float*)((char*)matrix2_device + r * pitch);
for (int c = 0; c < 100; ++c) {
rowofMat1[c] += rowofMat2[c];
}
}
void initCuda(){
err = cudaMallocPitch((void**)matrix1_device, &pitch, 100 * sizeof(float), 100);
err = cudaMallocPitch((void**)matrix2_device, &pitch, 100 * sizeof(float), 100);
//err = cudaMemcpy(matrix1_device, matrix1_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
//err = cudaMemcpy(matrix2_device, matrix2_host, 100*100*sizeof(float), cudaMemcpyHostToDevice);
err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
err = cudaMemcpy2D(matrix2_device, 100*sizeof(float), matrix2_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
}
void populateArrays(){
for(int x = 0; x < 100; x++){
for(int y = 0; y < 100; y++){
matrix1_host[x][y] = (float) x + y;
matrix2_host[y][x] = (float) x + y;
}
}
}
void runCuda(){
dim3 dimBlock ( 100 );
dim3 dimGrid ( 1 );
addMatrix<<<dimGrid, dimBlock>>>(matrix1_device, matrix2_device, 100*sizeof(float));
//err = cudaMemcpy(matrix1_host, matrix1_device, 100*100*sizeof(float), cudaMemcpyDeviceToHost);
err = cudaMemcpy2D(matrix1_host, 100*sizeof(float), matrix1_device, pitch, 100*sizeof(float),100, cudaMemcpyDeviceToHost);
//cudaMemcpy(matrix1_host, matrix1_device, 100*100*sizeof(float), cudaMemcpyDeviceToHost);
}
void cleanCuda(){
err = cudaFree(matrix1_device);
err = cudaFree(matrix2_device);
err = cudaDeviceReset();
}
int main(){
populateArrays();
initCuda();
runCuda();
cleanCuda();
std::cout << cudaGetErrorString(cudaGetLastError());
system("pause");
return 0;
}
最佳答案
首先,通常,您应该为matrix1和matrix2有一个单独的音调变量。在这种情况下,它们与从cudaMallocPitch
的API调用返回的值相同,但通常情况下可能不是。
在您的cudaMemcpy2D
行中,the second parameter to the call是目标音高。这只是您对特定目标矩阵(即第一个参数)进行cudaMallocPitch
调用时返回的音高值。
第四个参数是源音高。由于这是通过普通主机分配分配的,因此除了字节宽度以外,没有间距。
这样就交换了第二个和第四个参数。
所以代替这个:
err = cudaMemcpy2D(matrix1_device, 100*sizeof(float), matrix1_host, pitch, 100*sizeof(float), 100, cudaMemcpyHostToDevice);
尝试这个:
err = cudaMemcpy2D(matrix1_device, pitch, matrix1_host, 100*sizeof(float), 100*sizeof(float), 100, cudaMemcpyHostToDevice);
同样,第二次调用
cudaMemcpy2D
。第三个调用实际上是可以的,因为它的方向相反,源矩阵和目标矩阵被交换,因此它们正确地与您的音高参数对齐。关于c++ - CUDA-使用cudaMallocPitch和cudaMemcpy2D,错误:InvalidValue,InvalidPitchValue,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/15424738/