我是ArrayFire和CUDA开发的新手,几天前我在惨败使用Thrust后才开始使用ArrayFire。
我正在构建一个基于ArrayFire的算法,该算法应该在存储在设备内存中的数十万个32x32帧的数据库中搜索单个32x32像素帧。
首先,我初始化一个矩阵,该矩阵具有1024 + 1个像素行(我需要一个额外的像素来保留帧组ID)和预定义数量的帧(在这种情况下为1000个),以coloumn为索引。

如果我取消注释“ pixels_uint32 = device_frame_ptr [pixel_group_idx];”,这是执行搜索的功能。程序崩溃。该指针似乎是有效的,所以我不明白为什么会发生这种情况。关于以这种方式访问​​设备内存,也许有些我不知道的东西?

#include <iostream>
#include <stdio.h>
#include <sys/types.h>
#include <arrayfire.h>

#include "utils.h"

using namespace af;
using namespace std;

/////////////////////////// CUDA settings ////////////////////////////////
#define TEST_DEBUG false
#define MAX_NUMBER_OF_FRAMES  1000 // maximum (2499999 frames) X (1024 + 1 pixels per frame) x (2 bytes per pixel) = 5.124.997.950 bytes (~ 5GB)
#define BLOB_FINGERPRINT_SIZE 1024 //32x32

//percentage of macroblocks that should match: 0.9 means 90%
#define MACROBLOCK_COMPARISON_OVERALL_THRESHOLD 768 //1024 * 0.75
//////////////////////// End of CUDA settings ////////////////////////////

array search_frame(array d_db_vec)
{
    try {
        uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;

        // create one-element array to hold the result of the computation
        array frame_found(1,MAX_NUMBER_OF_FRAMES, u32);
        frame_found = 0;

        gfor (array frame_idx, MAX_NUMBER_OF_FRAMES) {

            // get the blob id it's the last coloumn of the matrix
            array blob_id = d_db_vec(number_of_uint32_for_frame, frame_idx);  // addressing with (pixel_idx, frame_idx)

            // define some hardcoded pixel to search for
            uint8_t searched_r = 0x0;
            uint8_t searched_g = 0x3F;
            uint8_t searched_b = 0x0;

            uint8_t b1 = 0;
            uint8_t g1 = 0;
            uint8_t r1 = 0;

            uint8_t b2 = 0;
            uint8_t g2 = 0;
            uint8_t r2 = 0;

            uint32_t sum1 = 0;
            uint32_t sum2 = 0;

            uint32_t *device_frame_ptr   = NULL;
            uint32_t pixels_uint32       = 0;

            uint pixel_match_counter = 0;

            //uint pixel_match_counter = 0;
            array frame = d_db_vec(span, frame_idx);
            device_frame_ptr = frame.device<uint32_t>();

            for (uint pixel_group_idx = 0; pixel_group_idx < number_of_uint32_for_frame; pixel_group_idx++) {
                // test to see if the whole matrix is traversed
                // d_db_vec(pixel_group_idx, frame_idx) = 0;

                /////////////////////////////// PROBLEMATIC CODE ///////////////////////////////////
                pixels_uint32 = 0x7E007E0;
                //pixels_uint32 = device_frame_ptr[pixel_group_idx]; //why does this crash the program?
                // if I uncomment the above line the program tries to copy the u32 frame into the pixels_uint32 variable
                // something goes wrong, since the pointer device_frame_ptr is not NULL and the elements should be there judging by the lines above
                ////////////////////////////////////////////////////////////////////////////////////

                // splitting the first pixel into its components
                b1 = (pixels_uint32 & 0xF8000000) >> 27;   //(input & 11111000000000000000000000000000)
                g1 = (pixels_uint32 & 0x07E00000) >> 21;   //(input & 00000111111000000000000000000000)
                r1 = (pixels_uint32 & 0x001F0000) >> 16;   //(input & 00000000000111110000000000000000)

                // splitting the second pixel into its components
                b2 = (pixels_uint32 & 0xF800) >> 11;       //(input & 00000000000000001111100000000000)
                g2 = (pixels_uint32 & 0x07E0) >> 5;        //(input & 00000000000000000000011111100000)
                r2 = (pixels_uint32 & 0x001F);             //(input & 00000000000000000000000000011111)

                // checking if they are a match
                sum1 = abs(searched_r - r1) + abs(searched_g - g1) + abs(searched_b - b1);
                sum2 = abs(searched_r - r2) + abs(searched_g - g2) + abs(searched_b - b2);

                // if they match, increment the local counter
                pixel_match_counter = (sum1 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
                pixel_match_counter = (sum2 <= 16) ? pixel_match_counter + 1 : pixel_match_counter;
            }

            bool is_found = pixel_match_counter > MACROBLOCK_COMPARISON_OVERALL_THRESHOLD;
            // write down if the frame is a match or not
            frame_found(0,frame_idx) = is_found ? frame_found(0,frame_idx) : blob_id;
        }

       // test to see if the whole matrix is traversed - this has to print zeroes
    if (TEST_DEBUG)
            print(d_db_vec);

        // return the matches array
    return frame_found;

    } catch (af::exception& e) {
        fprintf(stderr, "%s\n", e.what());
        throw;
    }
}

// make 2 green pixels
uint32_t make_test_pixel_group() {
    uint32_t b1 = 0x0;        //11111000000000000000000000000000
    uint32_t g1 = 0x7E00000;  //00000111111000000000000000000000
    uint32_t r1 = 0x0;        //00000000000111110000000000000000

    uint32_t b2 = 0x0;        //00000000000000001111100000000000
    uint32_t g2 = 0x7E0;      //00000000000000000000011111100000
    uint32_t r2 = 0x0;        //00000000000000000000000000011111

    uint32_t green_pix = b1 | g1 | r1 | b2 | g2 | r2;

    return green_pix;
}

int main(int argc, char ** argv)
{
    info();

    /////////////////////////////////////// CREATE THE DATABASE ///////////////////////////////////////
    uint number_of_uint32_for_frame = BLOB_FINGERPRINT_SIZE / 2;

    array d_db_vec(number_of_uint32_for_frame + 1,   // fingerprint size + 1 extra u32 for blob id
                   MAX_NUMBER_OF_FRAMES,             // number of frames
                   u32);                             // type of elements is 32-bit unsigned integer (unsigned) with the configuration RGBRGB (565565)

    if (TEST_DEBUG == true) {
        for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
            for (uint pix_idx = 0; pix_idx < number_of_uint32_for_frame; pix_idx++) {
                d_db_vec(pix_idx, frame_idx) = make_test_pixel_group();  // fill everything with green :D
            }
        }
    } else {
        d_db_vec = rand(number_of_uint32_for_frame + 1, MAX_NUMBER_OF_FRAMES);
    }

    cout << "Setting blob ids. \n\n";
    for (uint frame_idx = 0; frame_idx < MAX_NUMBER_OF_FRAMES; frame_idx++) {
        // set the blob id to 123456
        d_db_vec(number_of_uint32_for_frame, frame_idx) = 123456;  // blob_id = 123456
    }

    if (TEST_DEBUG)
        print(d_db_vec);

    cout << "Done setting blob ids. \n\n";

    //////////////////////////////////// CREATE THE SEARCHED FRAME ///////////////////////////////////

    // to be done, for now we use the hardcoded values at line 37-39 to simulate the searched pixel:
    //37        uint8_t searched_r = 0x0;
    //38        uint8_t searched_g = 0x3F;
    //39        uint8_t searched_b = 0x0;

    ///////////////////////////////////////////// SEARCH /////////////////////////////////////////////
    clock_t timer = startTimer();
    for (int i = 0; i< 1000; i++) {
        array frame_found = search_frame(d_db_vec);

        if (TEST_DEBUG)
            print(frame_found);
    }
    stopTimer(timer);

    return 0;
}


这是带有注释行的控制台输出:

arrayfire / examples / helloworld $ ./helloworld

ArrayFire v1.9.1(64位Linux,内部版本9af23ea)

许可证:服务器([email protected]

CUDA工具包5.0,驱动程序304.54

GPU0 Tesla C2075,5376 MB,计算2.0

内存使用:5312 MB可用空间(总计5376 MB)

设置Blob ID。

完成设置Blob ID。

时间:0.03秒。



这是控制台输出,其行未注释:

arrayfire / examples / helloworld $ ./helloworld

ArrayFire v1.9.1(64位Linux,内部版本9af23ea)

许可证:服务器([email protected]

CUDA工具包5.0,驱动程序304.54

GPU0 Tesla C2075,5376 MB,计算2.0

内存使用:5312 MB可用空间(总计5376 MB)

设置Blob ID。

完成设置Blob ID。

分段故障



在此先感谢您的任何帮助。我真的尝试了一切,但没有成功。

最佳答案

免责声明:我是arrayfire的首席开发人员。我看到您也有posted on AccelerEyes forums,但我将在此发布以清除代码中的一些常见问题。


不要在gfor循环中使用.device()、. host()、. scalar()。这将导致GFOR循环内部出现分歧,而GFOR并不是为此而设计的。
您不能索引到设备指针。指针指向GPU上的位置。当您执行device_frame_ptr[pixel_group_idx];时,系统正在寻找CPU上的等效位置。这是您的细分错误的原因。
使用向量化代码。例如,您不需要gfor的内部for循环。您可以执行b1 = (pixels_uint32 & 0xF8000000) >> 27;,而不是在for循环中执行array B1 = (frame & 0xF800000000) >> 27;。也就是说,您不是在将数据返回到CPU并使用for循环,而是在GPU内部进行了整个操作。
不要在GFOR中使用if-else或三元运算符。这些再次引起分歧。例如,pixel_match_counter = sum(sum1 <= 16) + sum(sum2 < 16);found(0, found_idx) = is_found * found(0, found_idx) + (1 - is_found) * blob_id


我已经回答了您面临的特定问题。如果您有任何后续问题,请在我们的论坛和/或我们的支持电子邮件中跟进。 Stackoverflow非常适合提出一个特定的问题,但不能调试整个程序。

10-07 15:43