问题描述
我正在尝试分配设备内存,复制到它,在 GPU 上执行计算,将结果复制回来,然后释放我分配的设备内存.我想确保我没有超出限制,我想看看共享内存空间中是否有足够的内存来转储一些数组.
当我分配设备内存时,没有返回错误.当我使用 cudaMemGetInfo
检查分配的内存量时,看起来一个 cudaMalloc
没有分配任何内存.此外,当我尝试释放内存时,似乎只释放了一个指针.
我正在使用 matlab Mexfunction
接口来设置 GPU 内存并启动内核.在这一点上,我甚至没有调用内核,只是返回一个单位矩阵作为结果.
cudaError_t cudaErr;size_t freeMem = 0;size_t totalMem = 0;size_t allocMem = 0;cudaMemGetInfo(&freeMem, &totalMem);mexPrintf("可用内存:空闲:%lu,总计:%lu
",freeMem, totalMem);/* 设备内存指针 */double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;double *deviceReceivedReal, *deviceReceivedImag;/* 在设备上为数组分配内存.*/mexPrintf("分配内存.
");cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);if (cudaErr != cudaSuccess){mexPrintf("无法为 devicePulseDelay 分配内存
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("devicePulseDelay: 可用内存: 空闲: %lu, 总计: %lu, 已消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);if (cudaErr != cudaSuccess){mexPrintf("无法分配内存给 deviceTarDistance
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceTarDistance: 可用内存: 空闲: %lu, 总计: %lu, 已消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);if (cudaErr != cudaSuccess){mexPrintf("无法为 deviceScattDistance 分配内存
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceScattDistance: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);if (cudaErr != cudaSuccess){mexPrintf("无法为 deviceScatterers 分配内存
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceScatterers: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);if (cudaErr != cudaSuccess){mexPrintf("无法为 deviceReceivedReal 分配内存
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceReceivedReal: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);if (cudaErr != cudaSuccess){mexPrintf("无法为 deviceReceivedImag 分配内存
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceReceivedImag: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
", allocMem, totalMem,(freeMem - allocMem));/* 将输入数组复制到设备 */mexPrintf("
正在复制内存.
");cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);if (cudaErr != cudaSuccess){mexPrintf("无法复制到 devicePulseDelay
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("devicePulseDelay: 可用内存: 空闲: %lu, 总计: %lu, 已消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);if (cudaErr != cudaSuccess){mexPrintf("无法复制到 deviceTarDistance
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceTarDistance: 可用内存: 空闲: %lu, 总计: %lu, 已消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);if (cudaErr != cudaSuccess){mexPrintf("无法复制到 deviceScattDistance
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceScattDistance: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);if (cudaErr != cudaSuccess){mexPrintf("无法复制到 deviceScatterers
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceScatterers: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));/* 调用内核 *///启动内核<<<1,512>>>(.......);/* 撤销输出 */cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);if (cudaErr != cudaSuccess){mexPrintf("不能复制到 receivedReal
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("receivedReal: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);if (cudaErr != cudaSuccess){mexPrintf("无法复制到收到的图像
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("receivedImag: 可用内存: 空闲: %lu, 总计: %lu, 消耗: %lu
",allocMem, totalMem,(freeMem - allocMem));/* 释放内存.*/mexPrintf("
释放内存.
");cudaMemGetInfo(&freeMem, &totalMem);mexPrintf("释放前:空闲%lu,总计:%lu
", freeMem, totalMem);cudaErr = cudaFree(devicePulseDelay);if (cudaErr != cudaSuccess){mexPrintf("可以释放 devicePulseDelay
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("devicePulseDelay: 可用内存: 空闲: %lu, 总计: %lu, 空闲: %lu
",allocMem, totalMem,(allocMem - freeMem));cudaErr = cudaFree(deviceTarDistance);if (cudaErr != cudaSuccess){mexPrintf("可以释放 deviceTarDistance
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceTarDistance: 可用内存: 空闲: %lu, 总计: %lu, 空闲: %lu
",allocMem, totalMem,(allocMem - freeMem));cudaErr = cudaFree(deviceScattDistance);if (cudaErr != cudaSuccess){mexPrintf("可以释放 deviceScattDistance
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceScattDistance: 可用内存: 空闲: %lu, 总计: %lu, 空闲: %lu
",allocMem, totalMem,(allocMem - freeMem));cudaErr = cudaFree(deviceScatterers);if (cudaErr != cudaSuccess){mexPrintf("可以释放 deviceScatterers
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceScatterers: 可用内存: 空闲: %lu, 总计: %lu, 空闲: %lu
",allocMem, totalMem,(allocMem - freeMem));cudaErr = cudaFree(deviceReceivedReal);if (cudaErr != cudaSuccess){mexPrintf("可以释放 deviceReceivedReal
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceReceivedReal: 可用内存: 空闲: %lu, 总计: %lu, 空闲: %lu
",allocMem, totalMem,(allocMem - freeMem));cudaErr = cudaFree(deviceReceivedImag);if (cudaErr != cudaSuccess){mexPrintf("可以释放 deviceReceivedImag
");mexPrintf("错误: %s
",cudaGetErrorString(cudaErr));}cudaMemGetInfo(&allocMem, &totalMem);mexPrintf("deviceReceivedImag: 可用内存: 空闲: %lu, 总计: %lu, 空闲: %lu
",allocMem, totalMem,(allocMem - freeMem));
下面是这个的输出:
可用内存:免费:2523959296,总计:2818572288分配内存.devicePulseDelay:可用内存:免费:2522910720,总计:2818572288,消耗:1048576deviceTarDistance:可用内存:免费:2522910720,总计:2818572288,已消耗:1048576deviceScattDistance:可用内存:免费:2518716416,总计:2818572288,已消耗:5242880deviceScatterers:可用内存:免费:2517667840,总计:2818572288,已消耗:6291456deviceReceivedReal:可用内存:免费:2515570688,总计:2818572288,已消耗:8388608deviceReceivedImag:可用内存:免费:2513473536,总计:2818572288,已消耗:10485760复制记忆.devicePulseDelay:可用内存:免费:2513473536,总计:2818572288,消耗:10485760deviceTarDistance:可用内存:免费:2513473536,总计:2818572288,消耗:10485760deviceScattDistance:可用内存:免费:2513473536,总计:2818572288,已消耗:10485760deviceScatterers:可用内存:免费:2513473536,总计:2818572288,已消耗:10485760receivedReal:可用内存:免费:2513473536,总计:2818572288,已消耗:10485760receivedImag:可用内存:免费:2513473536,总计:2818572288,已消耗:10485760释放内存.释放前:免费2513473536,总计:2818572288devicePulseDelay:可用内存:空闲:2513473536,总计:2818572288,空闲:0deviceTarDistance:可用内存:空闲:2513473536,总计:2818572288,空闲:0deviceScattDistance:可用内存:空闲:2513473536,总计:2818572288,空闲:0deviceScatterers:可用内存:免费:2514522112,总计:2818572288,免费:1048576deviceReceivedReal:可用内存:免费:2514522112,总计:2818572288,免费:1048576deviceReceivedImag:可用内存:免费:2514522112,总计:2818572288,免费:1048576我觉得我缺少一些明显的东西.任何人都可以帮助解释发生了什么吗?
平台是带有 Tesla C2050 GPU 卡的 windows 7.
malloc
在调用时直接从主机操作系统获取内存分配,而 free
在调用时直接将内存释放回主机操作系统.但它们几乎总是不这样工作:实际上,标准库维护着一个由已 free 和已 malloc 的内存块组成的列表,并通过与主机操作系统交互来机会性地扩展和收缩它(如果您有兴趣了解更多详情,请参阅"malloc() 和 free() 是如何工作的"相关解答).不管它具体如何实现,这都会导致一些不直观的结果,包括:通常不可能分配到与操作系统所报告的空闲量一样多的内存,分配有时似乎不会改变可用内存的数量,并且 free
有时对操作系统所说的空闲内存量没有影响.
虽然我只有经验证据支持这一点,但我相信 CUDA 的工作方式完全相同.上下文维护自己的 malloc'd 和 free'd 内存列表,并将扩展和收缩该列表中保存的内存作为主机驱动程序/窗口管理器和 GPU 本身允许.所有硬件都有一个特有的 MMU 页面大小,有证据表明 NVIDIA GPU 上的页面大小相当大.这意味着 cudaMalloc
调用的粒度相当粗略,这意味着有时 malloc
似乎不会影响可用内存量或消耗比请求更多的内存,并且有时 free
调用似乎没有效果(如果您有兴趣,可以找到一个小工具来帮助说明 CUDA 驱动程序的页面大小行为 这里,尽管它是为早期版本的 CUDA API 编写的,可能需要进行一些更改用现代版本编译).我相信这是您观察到的行为最可能的解释.
顺便说一句,如果我使用 GT200 系列设备运行您在 MacOS 10.6 上发布的代码的简化版本:
#include <cstdio>#define mexPrintf printfinline void gpuAssert(cudaError_t code, char *file, int line,bool abort=true){if (code != cudaSuccess){mexPrintf("GPUassert: %s %s %d
", cudaGetErrorString(code),文件,行);如果(中止)退出(代码);}}#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__);}内联 void gpuMemReport(size_t *avail, size_t * total,const char * title = 0, const size_t * free = 0, const bool sense = true){字符 tstring[32] = { '' };gpuErrchk(cudaMemGetInfo(avail, total));如果(免费){如果(标题){strncpy(tstring, 标题, 31);}mexPrintf("%s 可用内存: 空闲: %zu, 总计: %zu, %s: %zu
",tstring, *avail, *total, (sense) ?"已分配" : "已释放",(感觉) ?(*free - *avail) : (*avail - *free));} 别的 {mexPrintf("可用内存:空闲:%zu,总计:%zu
", *avail, *total);}}主函数(){size_t freeMem = 0;size_t 总内存 = 0;size_t allocMem = 0;gpuErrchk(cudaFree(0));gpuMemReport(&freeMem, &totalMem);双 *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;双 *deviceReceivedReal, *deviceReceivedImag;mexPrintf("分配内存.
");gpuErrchk( cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512) );gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem);gpuErrchk( cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512) );gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem);gpuErrchk( cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512) );gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem);gpuErrchk( cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999) );gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem);gpuErrchk( cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512) );gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem);gpuErrchk( cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512) );gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem);mexPrintf("
释放内存.
");gpuMemReport(&freeMem, &totalMem);gpuErrchk(cudaFree(devicePulseDelay));gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem, false);gpuErrchk(cudaFree(deviceTarDistance));gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem, false);gpuErrchk(cudaFree(deviceScattDistance));gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem, false);gpuErrchk(cudaFree(deviceScatterers));gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem, false);gpuErrchk(cudaFree(deviceReceivedReal));gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem, false);gpuErrchk(cudaFree(deviceReceivedImag));gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem, false);返回0;}
我得到了不同的结果,但也显示了相同的现象:
分配内存.devicePulseDelay:可用内存:空闲:202870784,总计:265027584,已分配:1048576deviceTarDistance:可用内存:空闲:202870784,总计:265027584,已分配:1048576deviceScattDistance:可用内存:空闲:198778880,总计:265027584,已分配:5140480deviceScatterers:可用内存:免费:197730304,总计:265027584,已分配:6189056deviceReceivedReal:可用内存:免费:193638400,总计:265027584,已分配:10280960deviceReceivedImag:可用内存:免费:189546496,总计:265027584,已分配:14372864释放内存.可用内存:免费:189546496,总计:265027584devicePulseDelay:可用内存:空闲:189546496,总计:265027584,已释放:0deviceTarDistance:可用内存:免费:190595072,总计:265027584,已释放:1048576deviceScattDistance:可用内存:免费:194686976,总计:265027584,已释放:5140480deviceScatterers:可用内存:免费:195735552,总计:265027584,已释放:6189056deviceReceivedReal:可用内存:免费:199827456,总计:265027584,已释放:10280960deviceReceivedImag:可用内存:免费:203919360,总计:265027584,已释放:14372864
这表明该行为也取决于硬件/主机操作系统.
I am trying to allocate device memory, copy to it, perform the calculations on the GPU, copy the results back and then free up the device memory I allocated. I wanted to make sure that I wasn't going over the limit and I wanted to see if I would have enough memory in the shared memory space to dump a few arrays.
When I allocate device memory, there are no errors being returned. When I use cudaMemGetInfo
to check the amount of memory allocated, it looks like one cudaMalloc
hasn't allocated any memory. Also, when I try to free the memory, it looks like only one pointer is freed.
I am using the matlab Mexfunction
interface to setup the GPU memory and launch the kernel. At this point, I'm not even calling into the kernel and just returning back a unit matrix for the results.
/*
 * NOTE(review): fragment of a MATLAB MEX gateway body, not a complete
 * function.  The host-side arrays pulseDelay, tarDistance, scattDistance,
 * scatterers, receivedReal and receivedImag are presumably declared by the
 * enclosing mexFunction (not visible here) -- confirm against the caller.
 * The listing has also lost its newline escapes: every format string is
 * split across two lines where a literal line break replaced the original
 * "\n".
 */
cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
/* Baseline snapshot of device memory; freeMem is the reference point for
   every "Consumed" figure printed below. */
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Memory avaliable: Free: %lu, Total: %lu
",freeMem, totalMem);
/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;
/* Allocate memory on the device for the arrays.  Each cudaMalloc is
   followed by cudaMemGetInfo so the free-memory delta can be observed;
   the transcript below shows the deltas moving in 1 MiB steps, i.e. the
   driver hands out memory at a coarser granularity than requested. */
mexPrintf("Allocating memory.
");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to devicePulseDelay
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceTarDistance
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScattDistance
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScatterers
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedReal
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedImag
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
", allocMem, totalMem,(freeMem - allocMem));
/* copy the input arrays across to the device */
mexPrintf("
Copying memory.
");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to devicePulseDelay
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceTarDistance
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScattDistance
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScatterers
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
/* call the kernel */
// launchKernel<<<1,512>>>(........);
/* NOTE(review): no kernel is launched, so deviceReceivedReal/Imag still
   hold uninitialized device memory when they are copied back below. */
/* retrieve the output */
/* NOTE(review): 512*512 doubles are copied here although 999*512 were
   allocated above -- in bounds, but the sizes disagree; confirm the
   intended matrix shape. */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedReal
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedImag
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu
",allocMem, totalMem,(freeMem - allocMem));
/* free the memory. */
mexPrintf("
Free'ing memory.
");
/* Re-baseline freeMem so the "Free'd" deltas below are measured against
   the state after all allocations. */
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %lu, Total: %lu
", freeMem, totalMem);
/* NOTE(review): the failure messages below read "could free X" -- they
   are missing "not". */
cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free devicePulseDelay
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu
",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceTarDistance
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu
",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScattDistance
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu
",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScatterers
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu
",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedReal
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu
",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedImag
");
mexPrintf("Error: %s
",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu
",allocMem, totalMem,(allocMem - freeMem));
Here is the output from this:
Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
I feel like there is something obvious that i'm missing. Can anyone help explain what is going on?
EDIT: platform is windows 7 with a Tesla C2050 GPu card.
It is a pretty common misconception that malloc
directly gets memory allocations from the host operating system when called, and free
directly releases them back to the host operating system when called. But they almost always don't work like that, instead the standard library maintains a circular list of free'd and malloc'd memory which is opportunistically expanded and contracted by interacting with the host OS (see some of the answers on How do malloc() and free() work? for more details if you are interested). Irrespective of how it works, this leads to a number of non-intuitive results, including the fact that it is usually impossible to allocate as much memory as the OS says is free, that allocations sometimes appear to not change the amount of free memory, and that free
sometimes has no effect on the amount of memory the OS says is free.
Although I have nothing but empirical evidence to support this, I believe CUDA works exactly the same way. The context maintains its own list of malloc'd and free'd memory, and will expand and contract the memory held in that list as host driver/window manager and the GPU itself allows. All hardware has a characteristic MMU page size, and there is evidence to suggest that the page size on NVIDIA GPUs is rather large. This implies there is rather coarse granularity in cudaMalloc
calls, and means sometimes a malloc
appears to not affect the amount of free memory or to consume much more memory than was requested, and sometimes free
calls appear to have no effect (If you are interested, you can find a little tool which helps illustrate the page size behaviour of the CUDA driver here, although it was written for an early version of the CUDA API and might need a couple of changes to compile with modern versions). I believe this is the most likely explanation for the behaviour you are observing.
Incidentally, if I run a simplified version of the code you posted on MacOS 10.6 with a GT200 family device:
#include <cstdio>
#include <cstdlib>
#include <cstring>
#define mexPrintf printf
/* Print a CUDA error with its source location and (optionally) abort.
 * `file`/`line` are supplied by the gpuErrchk macro below via
 * __FILE__/__LINE__; `file` is const-qualified so the string literal binds
 * without a deprecated char* conversion.  `abort` defaults to true: any
 * CUDA failure terminates the process with the error code as exit status. */
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true)
{
    if (code != cudaSuccess)
    {
        mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code),
                  file, line);
        if (abort) exit(code);
    }
}
/* Wrap every CUDA runtime call so failures are reported with location. */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
/* Query cudaMemGetInfo and print a one-line report via mexPrintf.
 * avail/total: out-params receiving the current free/total byte counts.
 * title:       optional label prefixed to the report line.
 * free:        optional baseline free-byte count; when given, the delta
 *              against *avail is printed as "Allocated" (sense == true,
 *              baseline - current) or "Freed" (sense == false,
 *              current - baseline).
 * Note: "avaliable" (sic) is kept byte-for-byte so the output matches the
 * transcript quoted in this article. */
inline void gpuMemReport(size_t * avail, size_t * total,
    const char * title = 0, const size_t * free = 0, const bool sense = true)
{
    char tstring[32] = { '\0' };  /* was "{ '' }" in the listing -- an
                                     extraction artifact; an empty char
                                     literal is not valid C++ */
    gpuErrchk( cudaMemGetInfo(avail, total) );
    if (free) {
        if (title) {
            /* copy at most 31 chars; tstring[31] stays '\0', so the
               buffer is always NUL-terminated */
            strncpy(tstring, title, 31);
        }
        mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n",
                  tstring, *avail, *total, (sense) ? "Allocated" : "Freed",
                  (sense) ? (*free - *avail) : (*avail - *free));
    } else {
        mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total);
    }
}
/* Driver that reproduces the question's allocate/free sequence and prints
 * the free-memory delta after every CUDA call.  The "\n" escapes lost when
 * this listing was extracted have been restored. */
int main()
{
    size_t freeMem = 0;
    size_t totalMem = 0;
    size_t allocMem = 0;
    /* cudaFree(0) forces CUDA context creation up front so the first
       cudaMemGetInfo call reports a stable baseline. */
    gpuErrchk( cudaFree(0) );
    gpuMemReport(&freeMem, &totalMem);
    double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
    double *deviceReceivedReal, *deviceReceivedImag;
    mexPrintf("Allocating memory.\n");
    /* After each allocation, report the delta against the pre-allocation
       baseline held in freeMem. */
    gpuErrchk( cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem);
    gpuErrchk( cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem);
    gpuErrchk( cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem);
    gpuErrchk( cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999) );
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem);
    gpuErrchk( cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem);
    gpuErrchk( cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem);
    mexPrintf("\nFree'ing memory.\n");
    /* Re-baseline freeMem so the "Freed" deltas are measured from the
       fully-allocated state. */
    gpuMemReport(&freeMem, &totalMem);
    gpuErrchk( cudaFree(devicePulseDelay) );
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem, false);
    gpuErrchk( cudaFree(deviceTarDistance) );
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem, false);
    gpuErrchk( cudaFree(deviceScattDistance) );
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem, false);
    gpuErrchk( cudaFree(deviceScatterers) );
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem, false);
    gpuErrchk( cudaFree(deviceReceivedReal) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem, false);
    gpuErrchk( cudaFree(deviceReceivedImag) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem, false);
    return 0;
}
I get a different result, but also one showing the same phenomena:
Allocating memory.
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864
Free'ing memory.
Memory avaliable: Free: 189546496, Total: 265027584
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864
Which suggests that the behaviour is hardware/host OS dependent as well.
这篇关于为什么 CudaFree 似乎没有释放内存?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!