我发现,当使用
std::vector<T>::iterator
而不是使用变量(i)来计数时,遍历 std::vector 的速度要快得多。
多亏了一些评论,这里有一些其他信息:(1)
我使用Visual Studio C++编译器; (2)我在 Release模式下进行了编译,并使用了-O2优化:)
Image of the console
如果变量i递增,则迭代将花费
5875ms:
// Excerpt: index-based pass over a std::vector<Data>, whose elements are
// stored by value inside the vector's own buffer.
// The vector is constructed before start(), so only the write loop is timed.
std::vector<Data> vec(MAX_DATA);
stopWatch.start();
// Every access goes through operator[] with an `unsigned` index.
for (unsigned i = 0U; i < MAX_DATA; ++i) {
vec[i].x = 0;
vec[i].y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data are stored in memory next to each other");
或5723毫秒:
// Excerpt: index-based pass over a std::vector<Data*>; each Data object is a
// separate `new` allocation, so the vector only holds pointers to them.
std::vector<Data*> vec2;
// Allocation happens before start() and is therefore not part of the timing.
for (unsigned i = 0U; i < MAX_DATA; ++i)
vec2.push_back(new Data());
stopWatch.start();
// Timed part: one pointer load via operator[] per member write.
for (unsigned i = 0U; i < MAX_DATA; ++i) {
vec2[i]->x = 0;
vec2[i]->y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data is in memory at a random position");
如果使用
std::vector<Data>::iterator
进行迭代,则迭代将花费29毫秒:
// Excerpt: the same contiguous std::vector<Data> as the indexed variant, but
// traversed with a range-based for loop (i.e. via the vector's iterators).
std::vector<Data> vec(MAX_DATA);
stopWatch.start();
// `it` is a reference to the current element, not an iterator object.
for (auto& it : vec) {
it.x = 0;
it.y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data are stored in memory next to each other");
或110毫秒:
// Excerpt: range-based for loop over a std::vector<Data*> whose elements point
// to individually allocated Data objects.
std::vector<Data*> vec2;
// Allocation happens before start() and is therefore not part of the timing.
for (unsigned i = 0U; i < MAX_DATA; ++i)
vec2.push_back(new Data());
stopWatch.start();
// `it` is a reference to the stored pointer; the writes go through it.
for (auto& it : vec2) {
it->x = 0;
it->y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data is in memory at a random position");
为什么另一个迭代这么快?
令我惊讶的是,使用变量i遍历分散在内存不同位置的数据(5723毫秒),竟然与使用变量i遍历在内存中连续存放的数据(5875毫秒)一样快。
数据在内存中彼此相邻这一事实应当能减少高速缓存未命中,这在使用
std::vector<Data>::iterator
进行迭代时确实有效,为什么在另一种方式(使用变量i)下却不起作用?还是说我弄错了,29毫秒与110毫秒之间的差距并不是由高速缓存未命中造成的?
整个程序如下所示:
#include <iostream>
#include <chrono>
#include <vector>
#include <string>
/// Minimal stopwatch used by the benchmark: records two time points and
/// prints the elapsed interval in whole milliseconds.
///
/// A single global instance, `stopWatch`, is defined together with the class.
class StopWatch
{
public:
    /// Records the starting time point of the measured interval.
    void start() {
        t1 = Clock::now();
    }

    /// Records the end time point and caches the interval elapsed since start().
    void stop() {
        t2 = Clock::now();
        diff = t2 - t1;
    }

    /// Writes "<startText>: <N>ms" to std::cout, followed by a newline and a flush.
    ///
    /// @param startText Label printed in front of the measured value.
    void printSpanAsMs(const std::string& startText = "time span") const {
        // Keep the duration's own representation type: `long` is only 32 bits
        // wide on some targets (e.g. 64-bit Windows) and would narrow count().
        const auto diffAsMs =
            std::chrono::duration_cast<std::chrono::milliseconds>(diff).count();
        // std::endl is kept on purpose so each result is flushed immediately.
        std::cout << startText << ": " << diffAsMs << "ms" << std::endl;
    }

private:
    // steady_clock is monotonic, which is what interval measurement needs;
    // high_resolution_clock may alias a clock that can be adjusted.
    using Clock = std::chrono::steady_clock;

    // Value-initialised so that printing before start()/stop() reports 0ms
    // for every instance, not only for objects with static storage.
    Clock::time_point t1{};
    Clock::time_point t2{};
    Clock::duration diff{};
} stopWatch;
// Payload type written by the benchmark loops: a plain aggregate of two ints.
// No default member initialisers, so it stays trivially constructible.
struct Data {
    int x;
    int y;
};
// Number of Data elements each benchmark case creates and overwrites.
constexpr unsigned long MAX_DATA = 20000000UL;
void test1()
{
std::cout << "1. Test \n Use i to iterate through the vector" <<
std::endl;
std::vector<Data> vec(MAX_DATA);
stopWatch.start();
for (unsigned i = 0U; i < MAX_DATA; ++i) {
vec[i].x = 0;
vec[i].y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data are stored in memory next to each
other");
//////////////////////////////////////////////////
std::vector<Data*> vec2;
for (unsigned i = 0U; i < MAX_DATA; ++i)
vec2.push_back(new Data());
stopWatch.start();
for (unsigned i = 0U; i < MAX_DATA; ++i) {
vec2[i]->x = 0;
vec2[i]->y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data is in memory at a random position");
for (unsigned i = 0U; i < MAX_DATA; ++i) {
delete vec2[i];
vec2[i] = nullptr;
}
}
void test2()
{
std::cout << "2. Test \n Use std::vector<T>::iteraror to iterate through
the vector" << std::endl;
std::vector<Data> vec(MAX_DATA);
stopWatch.start();
for (auto& it : vec) {
it.x = 0;
it.y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data are stored in memory next to each
other");
//////////////////////////////////////////////////
std::vector<Data*> vec2;
for (unsigned i = 0U; i < MAX_DATA; ++i)
vec2.push_back(new Data());
stopWatch.start();
for (auto& it : vec2) {
it->x = 0;
it->y = 0;
}
stopWatch.stop();
stopWatch.printSpanAsMs("The data is in memory at a random position");
for (auto& it : vec2) {
delete it;
it = nullptr;
}
}
// Entry point: runs the index-based benchmark first, then the range-for one.
int main()
{
    test1();
    test2();

    // Hands "PAUSE" to the host command processor; presumably meant to keep
    // a Windows console window open after the results are printed.
    system("PAUSE");

    // Reaching the end of main is equivalent to `return 0;`.
}
最佳答案
原因是MSVC 2017无法正确优化它。
在第一种情况下,它完全无法优化循环:
// Excerpt of the indexed loop from the question: each member write goes
// through vec[i], i.e. operator[] with an `unsigned` index.
for (unsigned i = 0U; i < MAX_DATA; ++i) {
vec[i].x = 0;
vec[i].y = 0;
}
生成的代码(live demo):
; Listing for the indexed loop. Each pass through the loop body performs four
; 4-byte stores (x and y of two consecutive elements) and advances by 16 bytes.
; r9d = 0 is the value being stored; rax starts at 0 and is used as a byte offset.
xor r9d, r9d
mov eax, r9d
$LL4@test1:
; A pointer is loaded from [rcx] and used as the base address of the store.
; The same load is repeated before every single store below.
; NOTE(review): [rcx] is presumably the vector's data pointer - not shown here.
mov rdx, QWORD PTR [rcx]
; Advance the byte offset by 16; the stores compensate with -16 .. -4.
lea rax, QWORD PTR [rax+16]
mov DWORD PTR [rax+rdx-16], r9d
mov rdx, QWORD PTR [rcx]
mov DWORD PTR [rax+rdx-12], r9d
mov rdx, QWORD PTR [rcx]
mov DWORD PTR [rax+rdx-8], r9d
mov rdx, QWORD PTR [rcx]
mov DWORD PTR [rax+rdx-4], r9d
; r8 is the remaining-iterations counter; its initialisation is outside this excerpt.
sub r8, 1
jne SHORT $LL4@test1
将
unsigned i
替换为 size_t i,
或者把索引访问提取到一个引用中,都无济于事(demo)。唯一有用的是使用迭代器,就像您已经发现的那样:
// Excerpt of the range-based loop from the question: `it` is a reference to
// the current element, so no indexing through the vector takes place.
for (auto& it : vec) {
it.x = 0;
it.y = 0;
}
生成的代码(live demo):
; Listing for the range-based loop. Each pass performs one 8-byte store and
; advances the pointer by 8 bytes; nothing is reloaded from memory in the loop.
; rcx = 0 is the value being stored.
xor ecx, ecx
; NOTE(review): npad appears to be the listing's padding macro (alignment before the loop label).
npad 2
$LL4@test2:
; A single QWORD store writes 8 zero bytes at [rax] (both 4-byte members at once).
mov QWORD PTR [rax], rcx
add rax, 8
; Loop until the running pointer in rax equals rdx (presumably the end pointer).
cmp rax, rdx
jne SHORT $LL4@test2
在这两种情况下,clang只会调用
memset
。故事的寓意:如果您关心性能,请查看生成的代码。向供应商报告问题。
关于c++ - 迭代时的性能(缓存未命中),我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/47018162/