我想获得在 Linux 机器上运行的 C/C++ 程序(foo)中某个特定函数的缓存命中率。我使用 gcc 编译,并且没有开启编译器优化。通过 perf,我可以使用以下命令获得整个程序的命中率。
但我只对计算核心函数(kernel)foo 感兴趣。
有没有办法使用 perf 或其他任何工具,仅针对函数 foo 获得命中率?
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#define NI 192
#define NJ NI
#ifndef DATA_TYPE
#define DATA_TYPE float
#endif
/*
 * Allocate `num` bytes on a 32-byte boundary with posix_memalign().
 *
 * The function only returns on success. If posix_memalign() reports an
 * error, or leaves the pointer null, a message is written to stderr and
 * the process exits with status 1.
 */
static
void* xmalloc(size_t num)
{
  void *block = NULL;
  const int status = posix_memalign(&block, 32, num);

  /* Success path first: zero status and a usable pointer. */
  if (status == 0 && block != NULL)
    return block;

  fprintf(stderr, "Can not allocate Memory\n");
  exit(1);
}
/*
 * Allocate storage for `n` elements of `elt_size` bytes each via xmalloc().
 *
 * The request is validated before the byte count is computed: a negative
 * element size, an element count that does not fit in size_t, or a product
 * that would wrap around size_t is reported on stderr and terminates the
 * process with status 1 instead of silently allocating a too-small buffer.
 *
 * Returns the pointer obtained from xmalloc(); the caller releases it
 * with free().
 */
void* alloc_data(unsigned long long int n, int elt_size)
{
  const size_t size_max = (size_t)-1;

  if (elt_size < 0
      || n > size_max
      || (elt_size > 0 && n > size_max / (size_t)elt_size))
  {
    fprintf(stderr, "alloc_data: requested size is invalid or overflows size_t\n");
    exit(1);
  }

  return xmalloc((size_t)n * (size_t)elt_size);
}
/*
 * Array initialization.
 *
 * Fills the matrices with values derived only from the indices:
 *   A[i][j] = (i * j)       / ni   for 0 <= i < ni, 0 <= j < nj
 *   Q[i][j] = (i * (j + 1)) / nj   for the same index range
 *   R[i][j] = (i * (j + 2)) / nj   for 0 <= i < nj, 0 <= j < nj
 * The row index is cast to DATA_TYPE before the multiplication, so the
 * product and the division are carried out in DATA_TYPE.
 */
static
void init_array(int ni, int nj,
                DATA_TYPE A[NI][NJ],
                DATA_TYPE R[NJ][NJ],
                DATA_TYPE Q[NI][NJ])
{
  for (int row = 0; row < ni; ++row) {
    DATA_TYPE *a_row = A[row];
    DATA_TYPE *q_row = Q[row];

    for (int col = 0; col < nj; ++col) {
      a_row[col] = ((DATA_TYPE) row * col) / ni;
      q_row[col] = ((DATA_TYPE) row * (col + 1)) / nj;
    }
  }

  for (int row = 0; row < nj; ++row) {
    DATA_TYPE *r_row = R[row];

    for (int col = 0; col < nj; ++col) {
      r_row[col] = ((DATA_TYPE) row * (col + 2)) / nj;
    }
  }
}
/*
 * Main computational kernel.
 *
 * For every column k in [0, nj), in order:
 *   1. R[k][k] = sqrt(sum over i of A[i][k] * A[i][k])
 *   2. Q[i][k] = A[i][k] / R[k][k]                    for 0 <= i < ni
 *   3. for each later column j in (k, nj):
 *        R[k][j]  = sum over i of Q[i][k] * A[i][j]
 *        A[i][j] -= Q[i][k] * R[k][j]                 for 0 <= i < ni
 *
 * A is updated in place, Q is overwritten column by column, and only the
 * entries R[k][j] with j >= k are written.
 * NOTE(review): the steps look like a Gram-Schmidt-style column
 * orthogonalization -- confirm before relying on that name.
 * There is no guard against R[k][k] being zero before the division.
 */
static
void foo(int ni, int nj,
         DATA_TYPE A[NI][NJ],
         DATA_TYPE R[NJ][NJ],
         DATA_TYPE Q[NI][NJ])
{
  for (int col = 0; col < nj; ++col) {
    /* Sum of squares of column `col` of A. */
    DATA_TYPE sum_sq = 0;
    for (int row = 0; row < ni; ++row) {
      sum_sq += A[row][col] * A[row][col];
    }
    R[col][col] = sqrt(sum_sq);

    /* Scale column `col` of A by the diagonal entry into Q. */
    for (int row = 0; row < ni; ++row) {
      Q[row][col] = A[row][col] / R[col][col];
    }

    /* Update every column to the right of `col`. */
    for (int next = col + 1; next < nj; ++next) {
      R[col][next] = 0;
      for (int row = 0; row < ni; ++row) {
        R[col][next] += Q[row][col] * A[row][next];
      }
      for (int row = 0; row < ni; ++row) {
        A[row][next] -= Q[row][col] * R[col][next];
      }
    }
  }
}
/*
 * Driver: allocates the three matrices, fills them with init_array(),
 * runs the kernel foo() once, and releases the storage.
 * Command-line arguments are not used.
 */
int main(int argc, char** argv)
{
  (void)argc;
  (void)argv;

  /* Retrieve problem size. */
  int ni = NI;
  int nj = NJ;

  /* Variable declaration/allocation.
     R is NJ x NJ, matching the parameter type of init_array() and foo(),
     which index it up to [nj-1][nj-1]; sizing it NI x NJ would be too
     small whenever NJ > NI. Element counts are computed in
     unsigned long long so the product cannot overflow int. */
  DATA_TYPE (*A)[NI][NJ];
  DATA_TYPE (*R)[NJ][NJ];
  DATA_TYPE (*Q)[NI][NJ];
  A = (DATA_TYPE (*)[NI][NJ])alloc_data((unsigned long long)NI * NJ,
                                        (int)sizeof(DATA_TYPE));
  R = (DATA_TYPE (*)[NJ][NJ])alloc_data((unsigned long long)NJ * NJ,
                                        (int)sizeof(DATA_TYPE));
  Q = (DATA_TYPE (*)[NI][NJ])alloc_data((unsigned long long)NI * NJ,
                                        (int)sizeof(DATA_TYPE));

  /* Initialize array(s). */
  init_array(ni, nj, *A, *R, *Q);

  /* Run kernel. */
  foo(ni, nj, *A, *R, *Q);

  /* Be clean. */
  free(A);
  free(R);
  free(Q);
  return 0;
}
lscpu 命令的输出是:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 63
Model name: Intel(R) Core(TM) i7-5960X CPU @ 3.00GHz
Stepping: 2
CPU max MHz: 3500.0000
CPU min MHz: 1200.0000
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 20480K
NUMA node0 CPU(s): 0-15
最佳答案
您也可以使用 Likwid 及其 Marker-API。它可以非常方便地对代码中的特定区域进行插桩(instrument)测量。在 Haswell 架构上,您可以使用预定义的性能组 ICACHE 来计算 L1 指令缓存的未命中率,或者为 L1 命中率定义您自己的性能组。
#include likwid.h
LIKWID_MARKER_INIT;
LIKWID_MARKER_START("region foo");
foo();
LIKWID_MARKER_STOP("region foo");
LIKWID_MARKER_CLOSE;
运行应用程序:./likwid-perfctr -g ICACHE -m <your application>
编译时请确保加上 -DLIKWID_PERFMON,并添加 Likwid 的头文件路径和库路径、链接 Likwid 库:-L$LIKWID_LIB -I$LIKWID_INCLUDE -llikwid。这些内容在 github wiki 上都有详细的文档说明。
关于c - 分析C程序函数的缓存命中率,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/64586132/