We have recently purchased some new servers and are experiencing poor memcpy performance. The memcpy performance is 3x slower on the servers compared to our laptops.
Server Specs
- Chassis and Mobo: SUPER MICRO 1027GR-TRF
- CPU: 2x Intel Xeon E5-2680 @ 2.70 GHz
- Memory: 8x 16GB DDR3 1600MHz
Edit: I am also testing on another server with slightly higher specs and seeing the same results as the above server
Server 2 Specs
- Chassis and Mobo: SUPER MICRO 10227GR-TRFT
- CPU: 2x Intel Xeon E5-2650 v2 @ 2.6 GHz
- Memory: 8x 16GB DDR3 1866MHz
Laptop Specs
- Chassis: Lenovo W530
- CPU: 1x Intel Core i7-3720QM @ 2.6GHz
- Memory: 4x 4GB DDR3 1600MHz
Operating System
$ cat /etc/redhat-release
Scientific Linux release 6.5 (Carbon)
$ uname -a
Linux r113 2.6.32-431.1.2.el6.x86_64 #1 SMP Thu Dec 12 13:59:19 CST 2013 x86_64 x86_64 x86_64 GNU/Linux
Compiler (on all systems)
$ gcc --version
gcc (GCC) 4.6.1
Also tested with gcc 4.8.2 based on a suggestion from @stefan. There was no performance difference between compilers.
Test Code
The test code below is a canned test to duplicate the problem I am seeing in our production code. I know this benchmark is simplistic, but it was able to exploit and identify our problem. The code creates two 1GB buffers and memcpys between them, timing the memcpy call. You can specify alternate buffer sizes on the command line using: ./big_memcpy_test [SIZE_BYTES]
#include <chrono>
#include <cstring>
#include <iostream>
#include <cstdint>
#include <cstdio>   // snprintf
#include <cstdlib>  // malloc, free
#include <string>   // std::string, std::stoull
class Timer
{
public:
Timer()
: mStart(),
mStop()
{
update();
}
void update()
{
mStart = std::chrono::high_resolution_clock::now();
mStop = mStart;
}
double elapsedMs()
{
mStop = std::chrono::high_resolution_clock::now();
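// note: duration_cast to std::chrono::milliseconds truncates toward zero,
// so elapsedMs() has whole-millisecond resolution, which is enough for the
// copies timed here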
std::chrono::milliseconds elapsed_ms =
std::chrono::duration_cast<std::chrono::milliseconds>(mStop - mStart);
return elapsed_ms.count();
}
private:
std::chrono::high_resolution_clock::time_point mStart;
std::chrono::high_resolution_clock::time_point mStop;
};
std::string formatBytes(std::uint64_t bytes)
{
static const int num_suffix = 5;
static const char* suffix[num_suffix] = { "B", "KB", "MB", "GB", "TB" };
double dbl_s_byte = bytes;
int i = 0;
for (; (int)(bytes / 1024.) > 0 && i < num_suffix;
++i, bytes /= 1024.)
{
dbl_s_byte = bytes / 1024.0;
}
const int buf_len = 64;
char buf[buf_len];
// use snprintf so there is no buffer overrun
int res = snprintf(buf, buf_len, "%0.2f%s", dbl_s_byte, suffix[i]);
// snprintf returns number of characters that would have been written if n had
// been sufficiently large, not counting the terminating null character.
// if an encoding error occurs, a negative number is returned.
if (res >= 0)
{
return std::string(buf);
}
return std::string();
}
void doMemmove(void* pDest, const void* pSource, std::size_t sizeBytes)
{
memmove(pDest, pSource, sizeBytes);
}
int main(int argc, char* argv[])
{
std::uint64_t SIZE_BYTES = 1073741824; // 1GB
if (argc > 1)
{
SIZE_BYTES = std::stoull(argv[1]);
std::cout << "Using buffer size from command line: " << formatBytes(SIZE_BYTES)
<< std::endl;
}
else
{
std::cout << "To specify a custom buffer size: big_memcpy_test [SIZE_BYTES] \n"
<< "Using built in buffer size: " << formatBytes(SIZE_BYTES)
<< std::endl;
}
// big array to use for testing
char* p_big_array = NULL;
/////////////
// malloc
{
Timer timer;
p_big_array = (char*)malloc(SIZE_BYTES * sizeof(char));
if (p_big_array == NULL)
{
std::cerr << "ERROR: malloc of " << SIZE_BYTES << " returned NULL!"
<< std::endl;
return 1;
}
std::cout << "malloc for " << formatBytes(SIZE_BYTES) << " took "
<< timer.elapsedMs() << "ms"
<< std::endl;
}
/////////////
// memset
{
Timer timer;
// set all data in p_big_array to 0xF
memset(p_big_array, 0xF, SIZE_BYTES * sizeof(char));
double elapsed_ms = timer.elapsedMs();
std::cout << "memset for " << formatBytes(SIZE_BYTES) << " took "
<< elapsed_ms << "ms "
<< "(" << formatBytes(SIZE_BYTES / (elapsed_ms / 1.0e3)) << " bytes/sec)"
<< std::endl;
}
/////////////
// memcpy
{
char* p_dest_array = (char*)malloc(SIZE_BYTES);
if (p_dest_array == NULL)
{
std::cerr << "ERROR: malloc of " << SIZE_BYTES << " for memcpy test"
<< " returned NULL!"
<< std::endl;
return 1;
}
memset(p_dest_array, 0xF, SIZE_BYTES * sizeof(char));
// time only the memcpy FROM p_big_array TO p_dest_array
Timer timer;
memcpy(p_dest_array, p_big_array, SIZE_BYTES * sizeof(char));
double elapsed_ms = timer.elapsedMs();
std::cout << "memcpy for " << formatBytes(SIZE_BYTES) << " took "
<< elapsed_ms << "ms "
<< "(" << formatBytes(SIZE_BYTES / (elapsed_ms / 1.0e3)) << " bytes/sec)"
<< std::endl;
// cleanup p_dest_array
free(p_dest_array);
p_dest_array = NULL;
}
/////////////
// memmove
{
char* p_dest_array = (char*)malloc(SIZE_BYTES);
if (p_dest_array == NULL)
{
std::cerr << "ERROR: malloc of " << SIZE_BYTES << " for memmove test"
<< " returned NULL!"
<< std::endl;
return 1;
}
memset(p_dest_array, 0xF, SIZE_BYTES * sizeof(char));
// time only the memmove FROM p_big_array TO p_dest_array
Timer timer;
// memmove(p_dest_array, p_big_array, SIZE_BYTES * sizeof(char));
doMemmove(p_dest_array, p_big_array, SIZE_BYTES * sizeof(char));
double elapsed_ms = timer.elapsedMs();
std::cout << "memmove for " << formatBytes(SIZE_BYTES) << " took "
<< elapsed_ms << "ms "
<< "(" << formatBytes(SIZE_BYTES / (elapsed_ms / 1.0e3)) << " bytes/sec)"
<< std::endl;
// cleanup p_dest_array
free(p_dest_array);
p_dest_array = NULL;
}
// cleanup
free(p_big_array);
p_big_array = NULL;
return 0;
}
CMake File to Build
project(big_memcpy_test)
cmake_minimum_required(VERSION 2.4.0)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# create verbose makefiles that show each command line as it is issued
set( CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "Verbose" FORCE )
# release mode
set( CMAKE_BUILD_TYPE Release )
# grab in CXXFLAGS environment variable and append C++11 and -Wall options
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall -march=native -mtune=native" )
message( INFO "CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}" )
# sources to build
set(big_memcpy_test_SRCS
main.cpp
)
# create an executable file named "big_memcpy_test" from
# the source files in the variable "big_memcpy_test_SRCS".
add_executable(big_memcpy_test ${big_memcpy_test_SRCS})
Test Results
Buffer Size: 1GB | malloc (ms) | memset (ms) | memcpy (ms) | NUMA nodes (numactl --hardware)
---------------------------------------------------------------------------------------------
Laptop 1 | 0 | 127 | 113 | 1
Laptop 2 | 0 | 180 | 120 | 1
Server 1 | 0 | 306 | 301 | 2
Server 2 | 0 | 352 | 325 | 2
As you can see, the memcpys and memsets on our servers are much slower than the memcpys and memsets on our laptops.
Varying buffer sizes
I have tried buffers from 100MB to 5GB, all with similar results (the servers are slower than the laptop).
NUMA Affinity
I read about people having performance issues with NUMA, so I tried setting CPU and memory affinity using numactl, but the results remained the same.
Server NUMA Hardware
$ numactl --hardware
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23
node 0 size: 65501 MB
node 0 free: 62608 MB
node 1 cpus: 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31
node 1 size: 65536 MB
node 1 free: 63837 MB
node distances:
node 0 1
0: 10 21
1: 21 10
Laptop NUMA Hardware
$ numactl --hardware
available: 1 nodes (0)
node 0 cpus: 0 1 2 3 4 5 6 7
node 0 size: 16018 MB
node 0 free: 6622 MB
node distances:
node 0
0: 10
Setting NUMA Affinity
$ numactl --cpunodebind=0 --membind=0 ./big_memcpy_test
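For reference, the same binding can also be requested from inside the program with libnuma instead of wrapping the run in numactl. This is only a sketch of that alternative (it assumes numa.h is installed and the test is linked with -lnuma); I did not use it for the timed results below.

#include <cstddef>
#include <iostream>
#include <numa.h> // libnuma; link with -lnuma

// Allocate sizeBytes of memory backed by NUMA node 0, mirroring
// "numactl --membind=0". Returns NULL if libnuma is unavailable.
char* allocOnNode0(std::size_t sizeBytes)
{
    if (numa_available() < 0)
    {
        std::cerr << "ERROR: libnuma is not available on this system" << std::endl;
        return NULL;
    }
    numa_run_on_node(0); // mirror "numactl --cpunodebind=0" for the calling thread
    return static_cast<char*>(numa_alloc_onnode(sizeBytes, 0)); // release with numa_free(ptr, sizeBytes)
}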
Any help resolving this is greatly appreciated.
Edit: GCC Options
Based on comments, I have tried compiling with different GCC options:
Compiling with -march and -mtune set to native
g++ -std=c++0x -Wall -march=native -mtune=native -O3 -DNDEBUG -o big_memcpy_test main.cpp
Result: Exact same performance (no improvement)
Compiling with -O2 instead of -O3
g++ -std=c++0x -Wall -march=native -mtune=native -O2 -DNDEBUG -o big_memcpy_test main.cpp
Result: Exact same performance (no improvement)
Edit: Changed memset to write 0xF instead of 0 to avoid NULL page (@SteveCox)
No improvement when memsetting with a value other than 0 (used 0xF in this case).
Edit: Cachebench results
In order to rule out that my test program is too simplistic, I downloaded a real benchmarking program, LLCacheBench (http://icl.cs.utk.edu/projects/llcbench/cachebench.html).
I built the benchmark on each machine separately to avoid architecture issues. Below are my results.
Notice the VERY large difference in performance at the larger buffer sizes. The last size tested (16777216) performed at 18849.29 MB/sec on the laptop and 6710.40 MB/sec on the server. That's about a 3x difference in performance. You can also see that the server's performance dropoff is much steeper than the laptop's.
Edit: memmove() is 2x FASTER than memcpy() on the server
Based on some experimentation, I have tried using memmove() instead of memcpy() in my test case and found a 2x improvement on the server. On the laptop, memmove() runs slower than memcpy(), but oddly enough it runs at the same speed as memmove() on the server. This begs the question: why is memcpy so slow?
Updated the code to test memmove along with memcpy. I had to wrap the memmove() inside a function because if I left it inline, GCC optimized it and it performed exactly the same as memcpy() (I assume GCC optimized it to memcpy because it knew the locations didn't overlap).
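An equivalent way to keep the call out of line is GCC's noinline attribute. This is just a sketch of that alternative (I used the plain doMemmove() wrapper above in the actual runs):

#include <cstddef>
#include <cstring>

__attribute__((noinline))
void doMemmoveNoInline(void* pDest, const void* pSource, std::size_t sizeBytes)
{
    // keeping this out of line hides the fact that the two malloc'd buffers
    // never overlap, so the optimizer cannot substitute memcpy() at the call site
    memmove(pDest, pSource, sizeBytes);
}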
Updated Results
Buffer Size: 1GB | malloc (ms) | memset (ms) | memcpy (ms) | memmove() | NUMA nodes (numactl --hardware)
---------------------------------------------------------------------------------------------------------
Laptop 1 | 0 | 127 | 113 | 161 | 1
Laptop 2 | 0 | 180 | 120 | 160 | 1
Server 1 | 0 | 306 | 301 | 159 | 2
Server 2 | 0 | 352 | 325 | 159 | 2
Edit: Naive Memcpy
Based on a suggestion from @Salgar, I have implemented my own naive memcpy function and tested it.
Naive Memcpy Source
void naiveMemcpy(void* pDest, const void* pSource, std::size_t sizeBytes)
{
char* p_dest = (char*)pDest;
const char* p_source = (const char*)pSource;
for (std::size_t i = 0; i < sizeBytes; ++i)
{
*p_dest++ = *p_source++;
}
}
Naive Memcpy Results Compared to memcpy()
Buffer Size: 1GB | memcpy (ms) | memmove(ms) | naiveMemcpy()
------------------------------------------------------------
Laptop 1 | 113 | 161 | 160
Server 1 | 301 | 159 | 159
Server 2 | 325 | 159 | 159
Edit: Assembly Output
Simple memcpy source
#include <cstring>
#include <cstdlib>
int main(int argc, char* argv[])
{
size_t SIZE_BYTES = 1073741824; // 1GB
char* p_big_array = (char*)malloc(SIZE_BYTES * sizeof(char));
char* p_dest_array = (char*)malloc(SIZE_BYTES * sizeof(char));
memset(p_big_array, 0xA, SIZE_BYTES * sizeof(char));
memset(p_dest_array, 0xF, SIZE_BYTES * sizeof(char));
memcpy(p_dest_array, p_big_array, SIZE_BYTES * sizeof(char));
free(p_dest_array);
free(p_big_array);
return 0;
}
Assembly Output: This is exactly the same on both the server and the laptop. I'm saving space by not pasting both.
.file "main_memcpy.cpp"
.section .text.startup,"ax",@progbits
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB25:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $1073741824, %edi
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
subq $8, %rsp
.cfi_def_cfa_offset 32
call malloc
movl $1073741824, %edi
movq %rax, %rbx
call malloc
movl $1073741824, %edx
movq %rax, %rbp
movl $10, %esi
movq %rbx, %rdi
call memset
movl $1073741824, %edx
movl $15, %esi
movq %rbp, %rdi
call memset
movl $1073741824, %edx
movq %rbx, %rsi
movq %rbp, %rdi
call memcpy
movq %rbp, %rdi
call free
movq %rbx, %rdi
call free
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE25:
.size main, .-main
.ident "GCC: (GNU) 4.6.1"
.section .note.GNU-stack,"",@progbits
PROGRESS!!!! asmlib
Based on a suggestion from @tbenson, I tried running with the asmlib version of memcpy. My results were initially poor, but after changing SetMemcpyCacheLimit() to 1GB (the size of my buffer) I was running at a speed on par with my naive for loop!
The bad news is that the asmlib version of memmove is slower than the glibc version; it now runs at the 300ms mark (on par with the glibc version of memcpy). The weird thing is that on the laptop, when I set SetMemcpyCacheLimit() to a large number, it hurts performance...
In the results below, the lines marked SetCache have SetMemcpyCacheLimit set to 1073741824. The results without SetCache do not call SetMemcpyCacheLimit().
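In case it helps anyone reproduce this, here is a sketch of how the SetCache rows were produced (the A_memcpy and SetMemcpyCacheLimit names follow the asmlib manual; treat the exact signatures as my assumption):

#include <cstddef>
#include "asmlib.h" // Agner Fog's asmlib, assumed to be on the include/link path

// Raise asmlib's non-temporal-store threshold to at least the buffer size so
// A_memcpy stays on its cached copy path, then copy as usual.
void asmlibCopyWithCacheLimit(void* pDest, const void* pSource, std::size_t sizeBytes)
{
    SetMemcpyCacheLimit(1073741824); // 1GB, matching the buffer size used above
    A_memcpy(pDest, pSource, sizeBytes);
}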
Results using functions from asmlib:
Buffer Size: 1GB | memcpy (ms) | memmove(ms) | naiveMemcpy()
------------------------------------------------------------
Laptop | 136 | 132 | 161
Laptop SetCache | 182 | 137 | 161
Server 1 | 305 | 302 | 164
Server 1 SetCache | 162 | 303 | 164
Server 2 | 300 | 299 | 166
Server 2 SetCache | 166 | 301 | 166
Starting to lean towards cache issue, but what would cause this?
[I would make this a comment, but do not have enough reputation to do so.]
I have a similar system and see similar results, but can add a few data points:
- If you reverse the direction of your naive memcpy (i.e. convert it to *p_dest-- = *p_src--), then you may get much worse performance than for the forward direction (~637 ms for me). There was a change in memcpy() in glibc 2.12 that exposed several bugs for calling memcpy on overlapping buffers (http://lwn.net/Articles/414467/), and I believe the issue was caused by switching to a version of memcpy that operates backwards. So, backward versus forward copies may explain the memcpy()/memmove() disparity.
- It seems to be better not to use non-temporal stores. Many optimized memcpy() implementations switch to non-temporal stores (which are not cached) for large buffers (i.e. larger than the last level cache); a minimal illustration of such a streaming copy appears after this list. I tested Agner Fog's version of memcpy (http://www.agner.org/optimize/#asmlib) and found that it was approximately the same speed as the version in glibc. However, asmlib has a function (SetMemcpyCacheLimit) that allows setting the threshold above which non-temporal stores are used. Setting that limit to 8GiB (or just larger than the 1 GiB buffer) to avoid the non-temporal stores doubled performance in my case (time down to 176ms). Of course, that only matched the forward-direction naive performance, so it is not stellar.
- The BIOS on those systems allows four different hardware prefetchers to be enabled/disabled (MLC Streamer Prefetcher, MLC Spatial Prefetcher, DCU Streamer Prefetcher, and DCU IP Prefetcher). I tried disabling each, but doing so at best maintained performance parity and reduced performance for a few of the settings.
- Disabling the running average power limit (RAPL) DRAM mode has no impact.
- I have access to other Supermicro systems running Fedora 19 (glibc 2.17). With a Supermicro X9DRG-HF board, Fedora 19, and Xeon E5-2670 CPUs, I see similar performance as above. On a Supermicro X10SLM-F single socket board running a Xeon E3-1275 v3 (Haswell) and Fedora 19, I see 9.6 GB/s for memcpy (104ms). The RAM on the Haswell system is DDR3-1600 (same as the other systems).
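For readers who have not met the term, this is roughly what a non-temporal copy loop looks like (an illustrative sketch of the technique, not the code glibc or asmlib actually use); the _mm_stream_si128 stores bypass the cache hierarchy:

#include <cstddef>
#include <emmintrin.h> // SSE2 intrinsics

// Illustrative non-temporal (streaming) copy. Assumes both pointers are
// 16-byte aligned and sizeBytes is a multiple of 16.
void streamingCopy(void* pDest, const void* pSource, std::size_t sizeBytes)
{
    __m128i* dst = static_cast<__m128i*>(pDest);
    const __m128i* src = static_cast<const __m128i*>(pSource);
    for (std::size_t i = 0; i < sizeBytes / sizeof(__m128i); ++i)
    {
        _mm_stream_si128(dst + i, _mm_load_si128(src + i));
    }
    _mm_sfence(); // make the streaming stores globally visible before returning
}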
UPDATES
- I set the CPU power management to Max Performance and disabled hyperthreading in the BIOS. Based on /proc/cpuinfo, the cores were then clocked at 3 GHz. However, this oddly decreased memory performance by around 10%.
- memtest86+ 4.10 reports bandwidth to main memory of 9091 MB/s. I could not find if this corresponds to read, write, or copy.
- The STREAM benchmark reports 13422 MB/s for copy, but they count bytes as both read and written, so that corresponds to ~6.5 GB/s if we want to compare to the above results.