当使用可变范围时，循环不被矢量化

本文介绍了当使用可变范围时，循环不被矢量化的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

A版代码不向量化，而版本B的代码被量化。

如何让A版本矢量并保持可变范围（不使用文字程度）

嵌套的循环是与广播乘法为Python和MATLAB的numpy的库。在numpy的图书馆广播说明是。

版本A代码（无std :: vector，无向量化）

这只使用 imull在 .L169 中，不是SIMD指令。

 
 
（％rsi），％edx

的

  #include< iostream> 
 #include< stdint.h> 
 typedef int32_t DATA_TYPE; 
 template< size_t N> 
 size_t cal_size（size_t（& Ashape）[N]）{
 size_t size = 1; 
 for（size_t i = 0; i  return size; 
} 
 template< size_t N> 
 size_t * cal_stride（size_t（& Ashape）[N]）{
 size_t size = cal_size（Ashape）; 
 
 size_t * Astride = new size_t [N]; 
 Astride [0] = size / Ashape [0]; 
 for（size_t i = 1; i  Astride [i] = Astride [i-1] / Ashape [i]; 
} 
 return Astride; 
} 
 template< size_t N> 
 DATA_TYPE * init_data（size_t（& Ashape）[N]）{
 size_t size = cal_size（Ashape）; 
 DATA_TYPE * data = new DATA_TYPE [size]; 
 for（size_t i = 0; i  data [i] = i + 1; 
} 
返回数据; 
} 
 
 template< size_t N> 
 void print_data（DATA_TYPE * Adata，size_t（& Ashape）[N]）{
 size_t size = cal_size（Ashape）; 
 for（size_t i = 0; i  std :: cout< Adata [i]< ，; 
} 
 std :: cout<< std :: endl; 
} 
 
 int main（void）{
 constexpr size_t nd = 3; 
 size_t Ashape [] = {20,3,4}; 
 size_t Bshape [] = {1,3,1}; 
 auto Astride = cal_stride（Ashape）; 
 auto Bstride = cal_stride（Bshape）; 
 
 auto Adata = init_data（Ashape）; 
 auto Bdata = init_data（Bshape）; 
 
 size_t c [nd] = {0,0,0}; 
 /// counter 
 size_t hint [nd] = {0,2,0}; 
 //提示告诉哪些是广播轴。 
 size_t A_i，B_i; 
 for（c [0] = 0; c [0]  for（c [ 1] = 0; c [1]  for（c [2] = 0; c [2] ]< Ashape [2]; ++ c [2]）{// Asape as hint [2] = 0 
 A_i = c [0] * Astride [0] + c [1] * Astride [1] ] + c [2] * Astride [2]; 
 B_i = c [1] * Bstride [1]; 
 Adata [A_i] * = Bdata [B_i]; 
} 
} 
} 
 print_data（Adata，Ashape）; 
}

版本B代码（无std :: vector。文字扩展，）

这使用 .L20 中的 pmulld％xmm3，％xmm2 $ C>，这是一个SIMD指令。

 
 
 的 
  #include< iostream> 
 #include< stdint.h> 
 typedef int32_t DATA_TYPE; 
 
 
无效print_data（DATA_TYPE *威刚，为size_t大小）{
为（为size_t i = 0; I<大小; ++ I）{
标准： ：cout< Adata [i]< ，; 
} 
 std :: cout<< std :: endl; 
} 
 
 int main（void）{
 int32_t Adata [240]; 
 int32_t Bdata [3]; 
 size_t A_i，B_i，i，j，k; 
 for（i = 0; i  for（j = 0; j  for（k = 0; k  A_i = i * 12 + j * 4 + k * 1; 
 B_i = j * 1; 
 Adata [A_i] * = Bdata [B_i]; 
} 
} 
} 
 print_data（Adata，240）; 
} 
  
 boost multiarray vectorize但为什么？ 
不确定是否使用simd对齐内存。
 
 
   
  #includeboost / multi_array .hpp
 #include< iostream> 
 
 int 
 main（）{
 //创建一个3D数组，为3 x 4 x 2 
 int d1，d2，d3; 
 typedef boost :: multi_array< int，3> array_type; 
 typedef array_type :: index index; 
 array_type A（boost :: extend [d1] [d2] [d3]）; 
 array_type B（boost :: extents [1] [d2] [1]）; 
 //为元素赋值
 
 for（index i = 0; i！= d1; ++ i）
 for（index j = 0; j！= d2 ; ++ j）
 for（index k = 0; k！= d3; ++ k）
 A [i] [j] [k] * = B [0] [j] [0 ]; 
 
 for（index i = 0; i！= d1; ++ i）
 for（index j = 0; j！= d2; ++ j）
 for索引k = 0; k！= d3; ++ k）
 std :: cout< A [i] [j] [k]; 
 return 0; 
} 
  
 在gcc.gnu.org，描述一些gcc的循环优化。我希望Symbolic Chrecs（对应于未分析的变量）意味着gcc可以融合具有可变范围的嵌套循环。
 
 
 最后的手段是实现带有meta-编程。
解决方案
为了测试循环迭代内和跨环循环之间的内存访问之间的依赖关系，数组索引编译），应该具有 i * c0 + j * c1 + k * c2 + ... 的形式，其中 i，j，k .. 。是循环计数器， c0，c1，c2 ... 是整数常量。
 
 
  c [2] * Astride [2]

，因为

 Astride [2]   
 > A_i = c [0] * Astride [0] + c [1] * Astride [1] + c [2] 
  
允许向量化循环。
 
 
   
 
 
 （编辑：请注意， X = Adata + c [0] * Astride [0] + c [1] * Astride [1] 是循环不变，因此在最内循环中，我们只有数组 X 和索引 c [0] ）。
 
 
 现在，问题是如何在 AStride [2] 的地方涓滴一个常数。幸运的是，从你的计算看来， cal_stride ，最后一个总是1.所以我们应该做。
 
Version A code is not vectorized while version B code is vectorized.
How to make version A vectorize and keep the variable extents (without using literal extents)?
The nested loop is for multiplication with broadcasting as in numpy library of python and matlab. Description of broadcasting in numpy library is here.
Version A code (no std::vector. no vectorization.)
This only uses imull   (%rsi), %edx in .L169, which is not a SIMD instruction.
gcc godbolt
#include <iostream>
#include <stdint.h>
typedef int32_t DATA_TYPE;
template <size_t N>
size_t cal_size(size_t (&Ashape)[N]){
    size_t size = 1;
    for(size_t i = 0; i < N; ++i) size *= Ashape[i];
    return size;
}
template <size_t N>
size_t * cal_stride(size_t (&Ashape)[N] ) {
    size_t size = cal_size(Ashape);

    size_t * Astride = new size_t[N];
    Astride[0] = size/Ashape[0];
    for(size_t i = 1; i < N; ++i){
        Astride[i] = Astride[i-1]/Ashape[i];
    }
    return Astride;
}
template <size_t N>
DATA_TYPE * init_data( size_t (&Ashape)[N] ){
    size_t size = cal_size(Ashape);
    DATA_TYPE * data = new DATA_TYPE[size];
    for(size_t i = 0; i < size; ++i){
        data[i] = i + 1;
    }
    return data;
}

template <size_t N>
void print_data(DATA_TYPE * Adata, size_t (&Ashape)[N] ){
    size_t size = cal_size(Ashape);
    for(size_t i = 0; i < size; ++i){
        std::cout << Adata[i] << ", ";
    }
    std::cout << std::endl;
}

int main(void){
    constexpr size_t nd = 3;
    size_t Ashape[] = {20,3,4};
    size_t Bshape[] = {1,3,1};
    auto Astride = cal_stride(Ashape);
    auto Bstride = cal_stride(Bshape);

    auto Adata = init_data(Ashape);
    auto Bdata = init_data(Bshape);

    size_t c[nd] = {0,0,0};
        ///counter
    size_t hint[nd] = {0,2,0};
        //hint tells which are the broadcasting axes.
    size_t A_i, B_i;
    for(c[0] = 0; c[0] < Ashape[0]; ++c[0]){ // Ashape as hint[0] = 0
        for(c[1] = 0; c[1] < Bshape[1]; ++c[1]){ //Bshape as hint[1] = 2
            for(c[2] = 0; c[2] < Ashape[2];++c[2]){ //Asape as hint[2] = 0
                A_i = c[0]*Astride[0] + c[1]*Astride[1] + c[2]*Astride[2];
                B_i = c[1]*Bstride[1];
                Adata[A_i] *= Bdata[B_i];
            }
        }
    }
    print_data(Adata, Ashape);
}
Version B Code (no std::vector. literal extents, and this vectorizes)
This uses pmulld  %xmm3, %xmm2 in .L20, which is a SIMD instruction.
gcc godbolt
#include <iostream>
#include <stdint.h>
typedef int32_t DATA_TYPE;


void print_data(DATA_TYPE * Adata, size_t size ){
    for(size_t i = 0; i < size; ++i){
        std::cout << Adata[i] << ", ";
    }
    std::cout << std::endl;
}

int main(void){
    int32_t Adata[240];
    int32_t Bdata[3];
    size_t A_i, B_i, i,j,k;
    for(i = 0; i < 20; ++i){
        for(j = 0; j < 3; ++j){
            for(k = 0; k < 4;++k){
                A_i = i*12 + j*4 + k*1;
                B_i = j*1;
                Adata[A_i] *= Bdata[B_i];
            }
        }
    }
    print_data(Adata, 240);
}
boost multiarray vectorize but why?Not sure if it use simd alignment for memory.
gcc godbolt
#include "boost/multi_array.hpp"
#include <iostream>

int
main () {
  // Create a 3D array that is 3 x 4 x 2
  int d1,d2,d3;
  typedef boost::multi_array<int, 3> array_type;
  typedef array_type::index index;
  array_type A(boost::extents[d1][d2][d3]);
  array_type B(boost::extents[1][d2][1]);
  // Assign values to the elements

  for(index i = 0; i != d1; ++i)
    for(index j = 0; j != d2; ++j)
      for(index k = 0; k != d3; ++k)
        A[i][j][k] *= B[0][j][0];

  for(index i = 0; i != d1; ++i)
    for(index j = 0; j != d2; ++j)
      for(index k = 0; k != d3; ++k)
        std::cout << A[i][j][k];
  return 0;
}
2004 pdf at gcc.gnu.org that describes some loop optimization of gcc. I hope the "Symbolic Chrecs" (which corresponds to unanalyzed variables) means gcc can fuse nested loop with variable extents.
The last resort is to implement loop fusion with meta-programming.
 解决方案 
In order to test dependencies between memory accesses within and across loop iterations, the array indices (in the polyhedral model of compilation), should have the form i*c0 + j*c1 + k*c2 + ..., where i, j, k ... are loop counters and c0, c1, c2 ... are integer constants.
In your example, apparently the problem is with c[2] * Astride[2], because Astride[2] is not a constant.
Indeed, leaving the expression 
A_i = c[0]*Astride[0] + c[1]*Astride[1] + c[2];
allows the loop to be vectorised.
https://godbolt.org/g/gOUVfE
(edit: Note that X = Adata + c[0]*Astride[0] + c[1]*Astride[1] is a loop invariant, so within the innermost loop we have just array X and index c[0]).
Now, the question is how to trickle down a constant in the place of AStride[2]. Fortunately, it seems from your calculation in cal_stride, the last one is always 1. So we should be done.
                        这篇关于当使用可变范围时，循环不被矢量化的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持！