下面是在堆上使用指向指针的指针分配多维数组的常用方法。
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
但我决定做一个版本,只分配一个内存,并通过乘法对索引值进行元素访问。
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])
当运行下面的测试代码时,第二个版本的运行速度大约快20%。可能的原因是什么,这是一种普遍适用的优化技术吗?是否有一些库定义这种类型的数组类型以用于优化目的?
在编辑之前,我在测试代码中犯了一个巨大的错误。第一个版本运行得比较慢,因为它的分配和解除分配保持在for循环内,而第二个版本在进入循环之前只执行了一次。请参阅下面测试代码中的注释。在使两个测试相等之后,我发现第一个版本可以运行得更快,特别是在优化之后。我将更复杂的操作和各种副本放入测试代码中,我发现第一个总是运行得更快一些。在我的机器中,索引的乘法似乎慢了吗?不过,我不确定原因。
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
int main() {
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
for (int k = 0; k < N; k += 1) {
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
}
ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
return 0;
}
最佳答案
是的,使用算术和单基指针是编译器在内部为非动态分配的二维(n维)数组所做的工作。
因为只有一个计算和索引查找,所以获得的性能最高。在显示2D数组的情况下,每个数组访问有两个指针查找和两个索引计算(一个索引计算和查找用于访问右数组,另一个索引计算和查找用于访问右数组中的元素)。对于一个3D数组,将有三个索引计算和三个查找。
你也分配更少的内存,需要更少的内存分配,但这些都是二阶效应。
此外,正如WhozCraig在comment中指出的,但我没有提到,与多个较小的块(比单个大块的内存加起来要多)相比,使用单个大块内存可以获得更好的引用局部性和更智能的预取潜力。
我在Mac OS X 10.10.2 Yosemite上测试了这个用GCC4.9.1编译的文件(sim2d.c
)。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static void *MjMalloc(size_t nbytes)
{
void *rv = malloc(nbytes);
if (rv == 0)
{
fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
exit(1);
}
return rv;
}
/* Mechanism 1 */
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])
/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
#define N_CYCLES 1000
static void one_test_cycle(void)
{
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At2(aai2, i, j) = i;
}
while ((j -= 1) >= 0) {
sum += aai2At2(aai2, i, j) - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}
static void print_now(const char *tag)
{
time_t now = time(0);
struct tm *lt = localtime(&now);
char buffer[32];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
printf("%s: %s\n", tag, buffer);
}
int main(void)
{
print_now("Started");
for (int i = 0; i < 3; i++)
one_test_cycle();
print_now("Finished");
return 0;
}
访问
aai2
数据有两种稍微不同的方式。我还将数组大小(N=2000)与单个测试中的循环数(N_cycles=1000)分开。我得到的计时结果是:Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
我得到了类似的模式(N_CYCLE=2000),但它需要两倍的时间来运行-惊喜,惊喜。
我看到单个分配代码有一个很小但值得注意的好处(大约减少了13%),但是两个“aai2”测试的计时之间没有显著差异。
基本统计:
# All data
# Count = 9
# Mean = 6.250000e+00
# Std Dev = 3.807230e-01
# aai1 only:
# Count = 3
# Mean = 6.756667e+00
# Std Dev = 4.041452e-02
# aai2 and aai3:
# Count = 6
# Mean = 5.996667e+00
# Std Dev = 1.505545e-02
# aai2 only:
# Count = 3
# Mean = 6.006667e+00
# Std Dev = 1.527525e-02
# aai3 only:
# Count = 3
# Mean = 5.986667e+00
# Std Dev = 5.773503e-03
显然,正式地确保机器被卸载,并且运行更多的测试迭代,以及类似的基准测试步骤可能会改进数据,但是单分配
aai2
机制在这台机器上比多分配aai
机制执行得更好。(旁白:为什么当人们有两个或更多版本的代码时,不在他们的第一个版本上加上后缀1?)硬件:17“Mac Book Pro,2011年初,2.3 GHz Intel Core i7,16 GiB 1333 MHz DDR3 RAM。
关于c - 这是优化堆上多维数组的一种可行方法吗?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/29484112/