在数值计算中,通常需要将数字缩放到安全范围内。

例如,计算欧几里得距离:sqrt(a^2+b^2)。在此,如果ab的大小太小/太大,则可能发生下溢/上溢。

解决此问题的常用方法是将数字除以最大数量级数。但是,此解决方案是:

  • 慢(除法慢)
  • 会引起一些额外的误差

  • 因此,我认为不要将其除以最大的数量级,而应将其乘以2的幂的倒数。这似乎是一个更好的解决方案,如下所示:
  • 乘法比除法
  • 快得多
  • 精度更高,因为乘以2的幂可以精确地表示

  • 因此,我想创建一个小的实用程序函数,它具有如下所示的逻辑(通过^,我的意思是幂运算):
    void getScaler(double value, double &scaler, double &scalerReciprocal) {
        int e = <exponent of value>;
        if (e<-1022) { scaler=2^-1022; scalerReciprocal = 2^1022; }
        } else if (e>1022) { scaler=2^1022; scalerReciprocal = 2^-1022; }
        } else { scaler=2^e; scalerReciprocal = 2^(2046-e); }
    }
    

    此函数应返回规范化的scalerscalerReciprocal,它们都是2的幂,其中scaler接近value,而scalerReciprocalscaler的倒数。
    scaler/scaleReciprocal允许的最大指数是-1022..1022(我不想使用次普通scaler,因为次普通数字可能很慢)。

    什么是快速方法做到这一点?可以使用纯浮点运算来完成此操作吗?还是应该从value提取指数,并使用简单的if进行逻辑处理?是否有某种技巧可以快速与(-)1022进行比较(因为范围是对称的)?

    注意:scaler不必是最接近的2的幂。如果某些逻辑需要它,scaler可以与最接近的值相差很小的2的幂。

    最佳答案

    函数s = get_scale(z)计算“2的关闭幂”。由于s的小数位
    如果为零,则s的倒数只是一个(廉价的)整数减法:请参见函数inv_of_scale

    在x86上,get_scaleinv_of_scale可以编译为使用clang的高效汇编程序。
    编译器clang将三元运算符转换为minsdmaxsd
    另请参阅彼得·科德斯(Peter Cordes)的comment
    使用gcc时,
    将这些功能转换为x86内在函数
    代码(get_scale_x86inv_of_scale_x86),see Godbolt

    注意C explicitly permits type-punningthrough a union, whereas C++ (c++11) has no such permission
    尽管gcc 8.2和clang 7.0并没有提示 union ,但是您可以改进
    C++通过using the memcpy trick而不是
    union 把戏。对代码的这种修改应该是微不足道的。
    该代码应正确处理次常态。

    #include<stdio.h>
    #include<stdint.h>
    #include<immintrin.h>
    /* gcc -Wall -m64 -O3 -march=sandybridge dbl_scale.c */
    
    union dbl_int64{
        double d;
        uint64_t i;
    };
    
    double get_scale(double t){
        union dbl_int64 x;
        union dbl_int64 x_min;
        union dbl_int64 x_max;
        uint64_t mask_i;
               /* 0xFEDCBA9876543210 */
        x_min.i = 0x0010000000000000ull;
        x_max.i = 0x7FD0000000000000ull;
        mask_i =  0x7FF0000000000000ull;
        x.d = t;
        x.i = x.i & mask_i;                    /* Set fraction bits to zero, take absolute value */
        x.d = (x.d < x_min.d) ? x_min.d : x.d; /* If subnormal: set exponent to 1                */
        x.d = (x.d > x_max.d) ? x_max.d : x.d; /* If exponent is very large: set exponent to 7FD, otherwise the inverse is a subnormal */
        return x.d;
    }
    
    double get_scale_x86(double t){
        __m128d x = _mm_set_sd(t);
        __m128d x_min = _mm_castsi128_pd(_mm_set1_epi64x(0x0010000000000000ull));
        __m128d x_max = _mm_castsi128_pd(_mm_set1_epi64x(0x7FD0000000000000ull));
        __m128d mask  = _mm_castsi128_pd(_mm_set1_epi64x(0x7FF0000000000000ull));
                x     = _mm_and_pd(x, mask);
                x     = _mm_max_sd(x, x_min);
                x     = _mm_min_sd(x, x_max);
        return _mm_cvtsd_f64(x);
    }
    
    /* Compute the inverse 1/t of a double t with all zero fraction bits     */
    /* and exponent between the limits of function get_scale                 */
    /* A single integer subtraction is much less expensive than a            */
    /* floating point division.                                               */
    double inv_of_scale(double t){
        union dbl_int64 x;
                         /* 0xFEDCBA9876543210 */
        uint64_t inv_mask = 0x7FE0000000000000ull;
        x.d = t;
        x.i = inv_mask - x.i;
        return x.d;
    }
    
    double inv_of_scale_x86(double t){
        __m128i inv_mask = _mm_set1_epi64x(0x7FE0000000000000ull);
        __m128d x        = _mm_set_sd(t);
        __m128i x_i      = _mm_sub_epi64(inv_mask, _mm_castpd_si128(x));
        return _mm_cvtsd_f64(_mm_castsi128_pd(x_i));
    }
    
    
    int main(){
        int n = 14;
        int i;
        /* Several example values, 4.94e-324 is the smallest subnormal */
        double y[14] = { 4.94e-324, 1.1e-320,  1.1e-300,  1.1e-5,  0.7,  1.7,  123.1, 1.1e300,
                         1.79e308, -1.1e-320,    -0.7, -1.7, -123.1,  -1.1e307};
        double z, s, u;
    
        printf("Portable code:\n");
        printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
        for (i = 0; i < n; i++){
            z = y[i];
            s = get_scale(z);
            u = inv_of_scale(s);
            printf("%14e %14e %14e %14e %14e\n", z, s, u, s*u, z*u);
        }
    
        printf("\nx86 specific SSE code:\n");
        printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
        for (i = 0; i < n; i++){
            z = y[i];
            s = get_scale_x86(z);
            u = inv_of_scale_x86(s);
            printf("%14e %14e %14e %14e %14e\n", z, s, u, s*u, z*u);
        }
    
        return 0;
    }
    

    输出看起来不错:
    Portable code:
                 x       pow_of_2        inverse       pow2*inv      x*inverse
     4.940656e-324  2.225074e-308  4.494233e+307   1.000000e+00   2.220446e-16
     1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00   4.942713e-13
     1.100000e-300  7.466109e-301  1.339386e+300   1.000000e+00   1.473324e+00
      1.100000e-05   7.629395e-06   1.310720e+05   1.000000e+00   1.441792e+00
      7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00   1.400000e+00
      1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00   1.700000e+00
      1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00   1.923437e+00
     1.100000e+300  6.696929e+299  1.493222e-300   1.000000e+00   1.642544e+00
     1.790000e+308  4.494233e+307  2.225074e-308   1.000000e+00   3.982882e+00
    -1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00  -4.942713e-13
     -7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00  -1.400000e+00
     -1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00  -1.700000e+00
     -1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00  -1.923437e+00
    -1.100000e+307  5.617791e+306  1.780059e-307   1.000000e+00  -1.958065e+00
    
    x86 specific SSE code:
                 x       pow_of_2        inverse       pow2*inv      x*inverse
     4.940656e-324  2.225074e-308  4.494233e+307   1.000000e+00   2.220446e-16
     1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00   4.942713e-13
     1.100000e-300  7.466109e-301  1.339386e+300   1.000000e+00   1.473324e+00
      1.100000e-05   7.629395e-06   1.310720e+05   1.000000e+00   1.441792e+00
      7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00   1.400000e+00
      1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00   1.700000e+00
      1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00   1.923437e+00
     1.100000e+300  6.696929e+299  1.493222e-300   1.000000e+00   1.642544e+00
     1.790000e+308  4.494233e+307  2.225074e-308   1.000000e+00   3.982882e+00
    -1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00  -4.942713e-13
     -7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00  -1.400000e+00
     -1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00  -1.700000e+00
     -1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00  -1.923437e+00
    -1.100000e+307  5.617791e+306  1.780059e-307   1.000000e+00  -1.958065e+00
    

    向量化

    函数get_scale应该使用支持自动矢量化的编译器进行矢量化。以下的
    代码vectorizes very well with clang(无需编写SSE/AVX内部函数代码)。
    /* Test how well get_scale vectorizes: */
    void get_scale_vec(double * __restrict__ t, double * __restrict__ x){
        int n = 1024;
        int i;
        for (i = 0; i < n; i++){
            x[i] = get_scale(t[i]);
        }
    }
    

    不幸的是,gcc找不到vmaxpdvminpd指令。

    10-07 15:20