下面是在堆上使用指向指针的指针分配多维数组的常用方法。

typedef struct ArrayInt {
    int *array;
    int length;
} ArrayInt;

static void ArrayIntCreate(ArrayInt *array, int length) {
    array->array = MjMalloc(length * sizeof(int));
    array->length = length;
}

static void ArrayIntDelete(ArrayInt *array) {
    free(array->array);
}

typedef struct ArrayArrayInt {
    ArrayInt *array;
    int length;
} ArrayArrayInt;

static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
    array->array = MjMalloc(length * sizeof(ArrayInt));
    array->length = length;
    for (int i = 0; i < length; i += 1) {
        ArrayIntCreate(&array->array[i], length2);
    }
}

static void ArrayArrayIntDelete(ArrayArrayInt *array) {
    for (int i = 0; i < array->length; i += 1) {
        ArrayIntDelete(&array->array[i]);
    }
    free(array->array);
}

但我决定做一个版本,只分配一个内存,并通过乘法对索引值进行元素访问。
typedef struct ArrayArrayInt2 {
    int *array;
    int length;
    int length2;
} ArrayArrayInt2;

static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
    array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
    array->length = length;
    array->length2 = length2;
}

static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
    free(array->array);
}

#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])

当运行下面的测试代码时,第二个版本的运行速度大约快20%。可能的原因是什么,这是一种普遍适用的优化技术吗?是否有一些库定义这种类型的数组类型以用于优化目的?
在编辑之前,我在测试代码中犯了一个巨大的错误。第一个版本运行得比较慢,因为它的分配和解除分配保持在for循环内,而第二个版本在进入循环之前只执行了一次。请参阅下面测试代码中的注释。在使两个测试相等之后,我发现第一个版本可以运行得更快,特别是在优化之后。我将更复杂的操作和各种副本放入测试代码中,我发现第一个总是运行得更快一些。在我的机器中,索引的乘法似乎慢了吗?不过,我不确定原因。
static double ElapsedTime(clock_t startTime, clock_t endTime) {
    return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}

#define N 2000

int main() {
    ArrayArrayInt aai;
    ArrayArrayInt2 aai2;
    long long int sum;
    clock_t startTime, endTime;

    startTime = clock();
    sum = 0;
    for (int k = 0; k < N; k += 1) {
        ArrayArrayIntCreate(&aai, N, N);
        for (int i = 0; i < aai.length; i += 1) {
            int j = 0;
            for (; j < aai.array[i].length; j += 1) {
                aai.array[i].array[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai.array[i].array[j] - i + 1;
            }
        }
        ArrayArrayIntDelete(&aai);
    }
    endTime = clock();
    printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    startTime = clock();
    sum = 0;
    ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
    for (int k = 0; k < N; k += 1) {
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At(aai2, i)[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At(aai2, i)[j] - i + 1;
            }
        }
    }
    ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
    endTime = clock();
    printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    return 0;
}

最佳答案

是的,使用算术和单基指针是编译器在内部为非动态分配的二维(n维)数组所做的工作。
因为只有一个计算和索引查找,所以获得的性能最高。在显示2D数组的情况下,每个数组访问有两个指针查找和两个索引计算(一个索引计算和查找用于访问右数组,另一个索引计算和查找用于访问右数组中的元素)。对于一个3D数组,将有三个索引计算和三个查找。
你也分配更少的内存,需要更少的内存分配,但这些都是二阶效应。
此外,正如WhozCraigcomment中指出的,但我没有提到,与多个较小的块(比单个大块的内存加起来要多)相比,使用单个大块内存可以获得更好的引用局部性和更智能的预取潜力。
我在Mac OS X 10.10.2 Yosemite上测试了这个用GCC4.9.1编译的文件(sim2d.c)。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void *MjMalloc(size_t nbytes)
{
    void *rv = malloc(nbytes);
    if (rv == 0)
    {
        fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
        exit(1);
    }
    return rv;
}

/* Mechanism 1 */
typedef struct ArrayInt {
    int *array;
    int length;
} ArrayInt;

static void ArrayIntCreate(ArrayInt *array, int length) {
    array->array = MjMalloc(length * sizeof(int));
    array->length = length;
}

static void ArrayIntDelete(ArrayInt *array) {
    free(array->array);
}

typedef struct ArrayArrayInt {
    ArrayInt *array;
    int length;
} ArrayArrayInt;

static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
    array->array = MjMalloc(length * sizeof(ArrayInt));
    array->length = length;
    for (int i = 0; i < length; i += 1) {
        ArrayIntCreate(&array->array[i], length2);
    }
}

static void ArrayArrayIntDelete(ArrayArrayInt *array) {
    for (int i = 0; i < array->length; i += 1) {
        ArrayIntDelete(&array->array[i]);
    }
    free(array->array);
}

/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
    int *array;
    int length;
    int length2;
} ArrayArrayInt2;

static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
    array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
    array->length = length;
    array->length2 = length2;
}

static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
    free(array->array);
}

#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])

/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
    return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}

#define N 2000
#define N_CYCLES    1000

static void one_test_cycle(void)
{
    ArrayArrayInt aai;
    ArrayArrayInt2 aai2;
    long long int sum;
    clock_t startTime, endTime;

    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayIntCreate(&aai, N, N);
        for (int i = 0; i < aai.length; i += 1) {
            int j = 0;
            for (; j < aai.array[i].length; j += 1) {
                aai.array[i].array[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai.array[i].array[j] - i + 1;
            }
        }
        ArrayArrayIntDelete(&aai);
    }
    endTime = clock();
    printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayInt2Create(&aai2, N, N);
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At(aai2, i)[j] = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At(aai2, i)[j] - i + 1;
            }
        }
        ArrayArrayInt2Delete(&aai2);
    }
    endTime = clock();
    printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));

    startTime = clock();
    sum = 0;
    for (int k = 0; k < N_CYCLES; k += 1) {
        ArrayArrayInt2Create(&aai2, N, N);
        for (int i = 0; i < aai2.length; i += 1) {
            int j = 0;
            for (; j < aai2.length2; j += 1) {
                aai2At2(aai2, i, j) = i;
            }
            while ((j -= 1) >= 0) {
                sum += aai2At2(aai2, i, j) - i + 1;
            }
        }
        ArrayArrayInt2Delete(&aai2);
    }
    endTime = clock();
    printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}

static void print_now(const char *tag)
{
    time_t now = time(0);
    struct tm *lt = localtime(&now);
    char buffer[32];
    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
    printf("%s: %s\n", tag, buffer);
}

int main(void)
{
    print_now("Started");
    for (int i = 0; i < 3; i++)
        one_test_cycle();
    print_now("Finished");
    return 0;
}

访问aai2数据有两种稍微不同的方式。我还将数组大小(N=2000)与单个测试中的循环数(N_cycles=1000)分开。我得到的计时结果是:
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38

我得到了类似的模式(N_CYCLE=2000),但它需要两倍的时间来运行-惊喜,惊喜。
我看到单个分配代码有一个很小但值得注意的好处(大约减少了13%),但是两个“aai2”测试的计时之间没有显著差异。
基本统计:
# All data
# Count    = 9
# Mean     =  6.250000e+00
# Std Dev  =  3.807230e-01

# aai1 only:
# Count    = 3
# Mean     =  6.756667e+00
# Std Dev  =  4.041452e-02

# aai2 and aai3:
# Count    = 6
# Mean     =  5.996667e+00
# Std Dev  =  1.505545e-02

# aai2 only:
# Count    = 3
# Mean     =  6.006667e+00
# Std Dev  =  1.527525e-02

# aai3 only:
# Count    = 3
# Mean     =  5.986667e+00
# Std Dev  =  5.773503e-03

显然,正式地确保机器被卸载,并且运行更多的测试迭代,以及类似的基准测试步骤可能会改进数据,但是单分配aai2机制在这台机器上比多分配aai机制执行得更好。(旁白:为什么当人们有两个或更多版本的代码时,不在他们的第一个版本上加上后缀1?)
硬件:17“Mac Book Pro,2011年初,2.3 GHz Intel Core i7,16 GiB 1333 MHz DDR3 RAM。

关于c - 这是优化堆上多维数组的一种可行方法吗?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/29484112/

10-11 22:07
查看更多