c# - 在int中对单个字节进行操作的最快方法

我发现我的应用程序花了其25％的时间来循环执行此操作:

private static int Diff (int c0, int c1)
{
    unsafe {
        byte* pc0 = (byte*) &c0;
        byte* pc1 = (byte*) &c1;
        int d0 = pc0[0] - pc1[0];
        int d1 = pc0[1] - pc1[1];
        int d2 = pc0[2] - pc1[2];
        int d3 = pc0[3] - pc1[3];
        d0 *= d0;
        d1 *= d1;
        d2 *= d2;
        d3 *= d3;
        return d0 + d1 + d2 + d3;
    }
}

如何改善这种方法的性能？到目前为止，我的想法是:

最明显的是，这将从SIMD中受益，但是让我们假设我不想去那里，因为这有点麻烦。

低级内容也是如此(调用C库，在GPGPU上执行)

多线程-我将使用它。

编辑:为了方便起见，一些反射(reflect)真实环境和用例的测试代码。 (实际上，甚至涉及更多的数据，并且不是在单个大块中比较数据，而是在每个几个kb的许多块中比较数据。)

public static class ByteCompare
{
    private static void Main ()
    {
        const int n = 1024 * 1024 * 20;
        const int repeat = 20;
        var rnd = new Random (0);

        Console.Write ("Generating test data... ");
        var t0 = Enumerable.Range (1, n)
            .Select (x => rnd.Next (int.MinValue, int.MaxValue))
            .ToArray ();
        var t1 = Enumerable.Range (1, n)
            .Select (x => rnd.Next (int.MinValue, int.MaxValue))
            .ToArray ();
        Console.WriteLine ("complete.");
        GC.Collect (2, GCCollectionMode.Forced);
        Console.WriteLine ("GCs: " + GC.CollectionCount (0));

        {
            var sw = Stopwatch.StartNew ();
            long res = 0;
            for (int reps = 0; reps < repeat; reps++) {
                for (int i = 0; i < n; i++) {
                    int c0 = t0[i];
                    int c1 = t1[i];
                    res += ByteDiff_REGULAR (c0, c1);
                }
            }
            sw.Stop ();
            Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_REGULAR");
        }
        {
            var sw = Stopwatch.StartNew ();
            long res = 0;
            for (int reps = 0; reps < repeat; reps++) {
                for (int i = 0; i < n; i++) {
                    int c0 = t0[i];
                    int c1 = t1[i];
                    res += ByteDiff_UNSAFE (c0, c1);
                }
            }
            sw.Stop ();
            Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_UNSAFE_PTR");
        }

        Console.WriteLine ("GCs: " + GC.CollectionCount (0));
        Console.WriteLine ("Test complete.");
        Console.ReadKey (true);
    }

    public static int ByteDiff_REGULAR (int c0, int c1)
    {
        var c00 = (byte) (c0 >> (8 * 0));
        var c01 = (byte) (c0 >> (8 * 1));
        var c02 = (byte) (c0 >> (8 * 2));
        var c03 = (byte) (c0 >> (8 * 3));
        var c10 = (byte) (c1 >> (8 * 0));
        var c11 = (byte) (c1 >> (8 * 1));
        var c12 = (byte) (c1 >> (8 * 2));
        var c13 = (byte) (c1 >> (8 * 3));
        var d0 = (c00 - c10);
        var d1 = (c01 - c11);
        var d2 = (c02 - c12);
        var d3 = (c03 - c13);
        d0 *= d0;
        d1 *= d1;
        d2 *= d2;
        d3 *= d3;
        return d0 + d1 + d2 + d3;
    }

    private static int ByteDiff_UNSAFE (int c0, int c1)
    {
        unsafe {
            byte* pc0 = (byte*) &c0;
            byte* pc1 = (byte*) &c1;
            int d0 = pc0[0] - pc1[0];
            int d1 = pc0[1] - pc1[1];
            int d2 = pc0[2] - pc1[2];
            int d3 = pc0[3] - pc1[3];
            d0 *= d0;
            d1 *= d1;
            d2 *= d2;
            d3 *= d3;
            return d0 + d1 + d2 + d3;
        }
    }
}

这对我有帮助(在i5上以x64 Release运行):

Generating test data... complete.
GCs: 8
res=18324555528140, t=1.46s - ByteDiff_REGULAR
res=18324555528140, t=1.15s - ByteDiff_UNSAFE
res=18324555528140, t=1.73s - Diff_Alex1
res=18324555528140, t=1.63s - Diff_Alex2
res=18324555528140, t=3.59s - Diff_Alex3
res=18325828513740, t=3.90s - Diff_Alex4
GCs: 8
Test complete.

最佳答案

如果需要，请避免使用它，但是实际上C#直接为它提供了很好的支持。如果没有较大的算法适合SIMD处理，那么我不希望将其卸载到GPU上，这将是迄今为止最大的性能赢家。

http://www.drdobbs.com/architecture-and-design/simd-enabled-vector-types-with-c/240168888

当然，每个CPU内核使用一个线程。您还可以使用Parallel.For之类的结构，并让.NET整理出要使用的线程数。这样做很不错，但是由于您知道这肯定是受CPU限制的，因此您可以(也可以不)通过自己管理线程来获得更好的结果。

至于加快实际代码块的速度，使用位掩码和位移位以使各个值可以工作而不是使用指针可能会更快。这样做还有一个好处，就是您不需要不安全的代码块，例如

byte b0_leftmost = (c0 & 0xff000000) >> 24;

关于c# - 在int中对单个字节进行操作的最快方法，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/30008214/

在I

c# - 在int中对单个字节进行操作的最快方法