Closed. This question needs debugging details。它当前不接受答案。
想改善这个问题吗?更新问题,以便将其作为on-topic用于堆栈溢出。
4年前关闭。
Improve this question
首先,我想对我在下面提供的大量代码表示歉意,我不确定,但是我可能因此违反了发布规则。
我正在尝试修改nvidia的cuda样本FluidsGL(示例可以在这里https://www.youtube.com/watch?v=jU9wgSu4_zw看到),以便代替修改随机像素,我可以加载自定义图像(例如PNG),然后在其上应用流体求解器,从而有效地使其变形流畅的时尚
在尝试了许多不同的实现之后,我想到可以使用两个vbo,一个用于发送到流体求解器内核的float2粒子,另一个用于颜色,我用RGBA信息填充从图像。
我在glDrawArrays中遇到访问冲突,但我仍然没有弄清楚为什么,在较早的尝试中,我尝试使用单个vbo和一个名为vertex_data的结构,该结构包含一个float2和一个uchar4,通过glbufferdata发送它,并让内核会根据它的float2部分进行计算,但是也会发生访问冲突异常。
如果有人愿意帮助我,我将不胜感激
这是在float2粒子上运行的相关内核,可以有效地模拟流体
和advectParticles_k:
谢谢
调用时:
您已分配4 * 10 = 40 内存字节
然后调用此函数:
也就是说,单个粒子需要2 * 4 = 8 字节的内存。
然后您正在渲染:
通过此调用,您尝试渲染消耗10 * 8 = 80 内存字节的10个粒子
通过此调用,您将遇到访问冲突,因为您仅分配了 40 字节的内存。
想改善这个问题吗?更新问题,以便将其作为on-topic用于堆栈溢出。
4年前关闭。
Improve this question
首先,我想对我在下面提供的大量代码表示歉意,我不确定,但是我可能因此违反了发布规则。
我正在尝试修改nvidia的cuda样本FluidsGL(示例可以在这里https://www.youtube.com/watch?v=jU9wgSu4_zw看到),以便代替修改随机像素,我可以加载自定义图像(例如PNG),然后在其上应用流体求解器,从而有效地使其变形流畅的时尚
在尝试了许多不同的实现之后,我想到可以使用两个vbo,一个用于发送到流体求解器内核的float2粒子,另一个用于颜色,我用RGBA信息填充从图像。
我在glDrawArrays中遇到访问冲突,但我仍然没有弄清楚为什么,在较早的尝试中,我尝试使用单个vbo和一个名为vertex_data的结构,该结构包含一个float2和一个uchar4,通过glbufferdata发送它,并让内核会根据它的float2部分进行计算,但是也会发生访问冲突异常。
如果有人愿意帮助我,我将不胜感激
typedef unsigned char ubyte;
#define DEVICE __device__
#define GLOBAL __global__
#define MAX(a,b) ((a > b) ? a : b)
#define DIM 512
#define DS DIM*DIM
glm::mat4 m_mat;
// CUFFT plan handle
cufftHandle planr2c;
cufftHandle planc2r;
static float2 *vxfield = NULL;
static float2 *vyfield = NULL;
float2 *hvfield = NULL;
float2 *dvfield = NULL;
static int wWidth = MAX(512, DIM);
static int wHeight = MAX(512, DIM);
/*-----CUSTOM STRUCT-----------------------------------------------------*/
struct GLTexture
{
GLuint id;
int width;
int height;
};
vertex_data data[DS];
//c linkage
/*--------------------------------------------------------------------------------------------------------------------------------*/
extern "C" void addForces(float2 *v, int dx, int dy, int spx, int spy, float fx, float fy, int r);
extern "C" void advectVelocity(float2 *v, float *vx, float *vy, int dx, int pdx, int dy, float dt);
extern "C" void diffuseProject(float2 *vx, float2 *vy, int dx, int dy, float dt, float visc);
extern "C" void updateVelocity(float2 *v, float *vx, float *vy, int dx, int pdx, int dy);
extern "C" void advectParticles(GLuint vbo, float2 *v, int dx, int dy, float dt);
/*--------------------------------------------------------------------------------------------------------------------------------*/
GLSLProgram prog;
IOManager m_manager;
GLTexture m_tex;
std::vector<ubyte> in_img;
std::vector<ubyte> out_img;
vertex_data vData[6];
GLuint positionsVBO;
GLuint colorsVBO;
cudaGraphicsResource* positionsVBO_CUDA;
float2 *particles = NULL;
float2 *part_cuda = NULL;
int lastx = 0, lasty = 0;
int clicked = 0;
size_t tPitch = 0;
float myrand(void)
{
return rand() / (float)RAND_MAX;
}
void initParticles(float2 *p, int dx, int dy)
{
int i, j;
for (i = 0; i < dy; i++)
{
for (j = 0; j < dx; j++)
{
p[i*dx + j].x = (j + 0.5f + (myrand() - 0.5f)) / dx;
p[i*dx + j].y = (i + 0.5f + (myrand() - 0.5f)) / dy;
}
}
}
void keyboard(unsigned char key, int x, int y)
{
switch (key)
{
case 27:
glutDestroyWindow(glutGetWindow());
exit(0);
return;
default:
break;
}
}
void click(int button, int updown, int x, int y)
{
lastx = x;
lasty = y;
clicked = !clicked;
}
void motion(int x, int y)
{
// Convert motion coordinates to domain
float fx = (lastx / (float)wWidth);
float fy = (lasty / (float)wHeight);
int nx = (int)(fx * DIM);
int ny = (int)(fy * DIM);
if (clicked && nx < DIM - FR && nx > FR - 1 && ny < DIM - FR && ny > FR - 1)
{
int ddx = x - lastx;
int ddy = y - lasty;
fx = ddx / (float)wWidth;
fy = ddy / (float)wHeight;
int spy = ny - FR;
int spx = nx - FR;
addForces(dvfield, DIM, DIM, spx, spy, FORCE * DT * fx, FORCE * DT * fy, FR);
lastx = x;
lasty = y;
}
glutPostRedisplay();
}
void cleanup(void)
{
cudaGraphicsUnregisterResource(positionsVBO_CUDA);
unbindTexture();
deleteTexture();
// Free all host and device resources
free(hvfield);
free(particles);
cudaFree(dvfield);
cudaFree(vxfield);
cudaFree(vyfield);
cufftDestroy(planr2c);
cufftDestroy(planc2r);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
glDeleteBuffersARB(1, &positionsVBO);
}
void run_simulation(void){
//Execute kernel
advectVelocity(dvfield, (float *)vxfield, (float *)vyfield, DIM, RPADW, DIM, DT);
diffuseProject(vxfield, vyfield, CPADW, DIM, DT, VIS);
updateVelocity(dvfield, (float *)vxfield, (float *)vyfield, DIM, RPADW, DIM);
advectParticles(positionsVBO, dvfield, DIM, DIM, DT);
}
void initShaders(){
prog.compileShaders("vShader.vertex", "fShader.frag");
prog.addAttribute("vertexPos");
prog.addAttribute("vertexColor");
prog.linkShaders();
}
void pre_display()
{
glViewport(0, 0, 512, 512);
glutPostRedisplay();
}
void display()
{
pre_display();
// render points from vertex buffer
glClear(GL_COLOR_BUFFER_BIT);
initShaders();
run_simulation();
prog.use();
//GLint textureUniform = prog.getUniformLocation("mySampler");
//glUniform1i(textureUniform, 0);
//glActiveTexture(GL_TEXTURE0);
GLint pUniform = prog.getUniformLocation("P");
glUniformMatrix4fv(pUniform, 1, GL_FALSE, &m_mat[0][0]);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, positionsVBO);
glPointSize(1);
glEnable(GL_POINT_SMOOTH);
glEnable(GL_BLEND);
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
//glEnableVertexAttribArray(0); tried to manually enable the arrays
//glEnableVertexAttribArray(1);
glDisable(GL_DEPTH_TEST);
glDisable(GL_CULL_FACE);
glVertexAttribPointer(0, 2, GL_FLOAT, GL_TRUE, sizeof(float2), 0);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0); //dont need this but access violaton persists without it anyway
glBindBufferARB(GL_ARRAY_BUFFER_ARB, colorsVBO);
glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, GL_TRUE, sizeof(vertex_data) ,(void*)(offsetof(vertex_data, col)));
glDrawArrays(GL_POINTS, 0, DS);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
prog.unuse();
glDisable(GL_TEXTURE_2D); //from nvidia's probably linked to the cudaarray_t
// Swap buffers
glutSwapBuffers();
}
void initGL()
{
int foo = 1;
char *bar = "bar";
glutInit(&foo, &bar);
glutInitDisplayMode(GLUT_DEPTH | GLUT_DOUBLE | GLUT_RGBA);
glutInitWindowSize(DIM, DIM);
glutCreateWindow("mate21");
glClearColor(0.0, 0.0, 0.0, 1.0);
glutKeyboardFunc(keyboard);
glutMouseFunc(click);
glutMotionFunc(motion);
glutDisplayFunc(display);
glewInit();
}
void setGLDevice(){
cudaDeviceProp prop;
int dev;
memset(&prop, 0, sizeof(cudaDeviceProp));
prop.major = 1;
prop.minor = 0;
cudaChooseDevice(&dev, &prop);
cudaGLSetGLDevice(dev);
}
void createVBO(){
//reading rgba information from image to out_img
unsigned long width, height;
IOManager::readFileToBuffer("jojo_test.png", in_img);
decodePNG(out_img, width, height, &(in_img[0]), in_img.size());
//data.resize(DS); data used to be a vector, gave up on that
for (int i = 0; i < DS; ++i){
//data[i].pos = particles[i]; edited vertex_data struct for rgba only
data[i].col.x = out_img[i * 4 + 0];
data[i].col.y = out_img[i * 4 + 1];
data[i].col.z = out_img[i * 4 + 2];
data[i].col.w = out_img[i * 4 + 3];
}
glGenBuffers(1, &positionsVBO);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, positionsVBO);
glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(float2)*DS, particles , GL_DYNAMIC_DRAW_ARB);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
cudaGraphicsGLRegisterBuffer(&positionsVBO_CUDA, positionsVBO, cudaGraphicsMapFlagsNone);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
glGenBuffers(1, &colorsVBO);
glBindBuffer(GL_ARRAY_BUFFER_ARB, colorsVBO);
glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(vertex_data)*DS, data, GL_DYNAMIC_DRAW_ARB);
glBindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
}
int main()
{
setGLDevice();
initGL();
//orthogonal view matrix with glm
m_mat = glm::ortho(0, 1, 1, 0, 0, 1);
hvfield = (float2 *)malloc(sizeof(float2) * DS);
memset(hvfield, 0, sizeof(float2) * DS);
// Allocate and initialize device data
cudaMallocPitch((void **)&dvfield, &tPitch, sizeof(float2)*DIM, DIM);
cudaMemcpy(dvfield, hvfield, sizeof(float2) * DS,
cudaMemcpyHostToDevice);
// Temporary complex velocity field data
cudaMalloc((void **)&vxfield, sizeof(float2) * PDS);
cudaMalloc((void **)&vyfield, sizeof(float2) * PDS);
setupTexture(DIM, DIM);
bindTexture();
// Create particle array
particles = (float2 *)malloc(sizeof(float2) * DS);
memset(particles, 0, sizeof(float2) * DS);
initParticles(particles, DIM, DIM);
// Create CUFFT transform plan configuration
cufftPlan2d(&planr2c, DIM, DIM, CUFFT_R2C);
cufftPlan2d(&planc2r, DIM, DIM, CUFFT_C2R);
cufftSetCompatibilityMode(planr2c, CUFFT_COMPATIBILITY_FFTW_PADDING);
cufftSetCompatibilityMode(planc2r, CUFFT_COMPATIBILITY_FFTW_PADDING);
createVBO();
//cleanup
glutCloseFunc(cleanup);
//Launch rendering loop
glutMainLoop();
}
这是在float2粒子上运行的相关内核,可以有效地模拟流体
extern "C"
void advectParticles(GLuint vbo, float2 *v, int dx, int dy, float dt)
{
dim3 grid((dx/TILEX)+(!(dx%TILEX)?0:1), (dy/TILEY)+(!(dy%TILEY)?0:1));
dim3 tids(TIDSX, TIDSY);
float2 *p;
cudaGraphicsMapResources(1, &positionsVBO_CUDA, 0);
size_t num_bytes;
cudaGraphicsResourceGetMappedPointer((void **)&p, &num_bytes,positionsVBO_CUDA);
advectParticles_k<<<grid, tids>>>(p, v, dx, dy, dt, TILEY/TIDSY, tPitch);
cudaGraphicsUnmapResources(1, &positionsVBO_CUDA, 0);
}
和advectParticles_k:
__global__ void
advectParticles_k(float2 *part, float2 *v, int dx, int dy,
float dt, int lb, size_t pitch)
{
int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
int gtidy = blockIdx.y * (lb * blockDim.y) + threadIdx.y * lb;
int p;
// gtidx is the domain location in x for this thread
float2 pterm, vterm;
if (gtidx < dx)
{
for (p = 0; p < lb; p++)
{
// fi is the domain location in y for this thread
int fi = gtidy + p;
if (fi < dy)
{
int fj = fi * dx + gtidx;
pterm = part[fj];
int xvi = ((int)(pterm.x * dx));
int yvi = ((int)(pterm.y * dy));
vterm = *((float2 *)((char *)v + yvi * pitch) + xvi);
pterm.x += dt * vterm.x;
pterm.x = pterm.x - (int)pterm.x;
pterm.x += 1.f;
pterm.x = pterm.x - (int)pterm.x;
pterm.y += dt * vterm.y;
pterm.y = pterm.y - (int)pterm.y;
pterm.y += 1.f;
pterm.y = pterm.y - (int)pterm.y;
part[fj] = pterm;
}
} // If this thread is inside the domain in Y
} // If this thread is inside the domain in X
}
谢谢
最佳答案
我认为您没有在缓冲区中为粒子分配足够的内存。
假设:
sizeof(float2) = 4
DS = 10
调用时:
glBufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(float2)*DS, particles , GL_DYNAMIC_DRAW_ARB);
您已分配4 * 10 = 40 内存字节
然后调用此函数:
glVertexAttribPointer(0, 2, GL_FLOAT, GL_TRUE, sizeof(float2), 0);
也就是说,单个粒子需要2 * 4 = 8 字节的内存。
然后您正在渲染:
glDrawArrays(GL_POINTS, 0, DS);
通过此调用,您尝试渲染消耗10 * 8 = 80 内存字节的10个粒子
通过此调用,您将遇到访问冲突,因为您仅分配了 40 字节的内存。
09-10 01:34