java - 通过JMH在sun.misc.Unsafe.compareAndSwap测量中的奇怪行为

我决定使用不同的锁定策略并使用JMH来测量增量。
我正在使用JMH来检查吞吐量和平均时间，以及用于检查正确性的简单自定义测试。
有六种策略:

原子计数

ReadWrite锁定计数

与 Volatile

同步

没有易失

的同步块

sun.misc.Unsafe.compareAndSwap

sun.misc.Unsafe.getAndAdd

不同步计数

基准代码:

@State(Scope.Benchmark)
@BenchmarkMode({Mode.Throughput, Mode.AverageTime})
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Fork(1)
@Warmup(iterations = 5)
@Measurement(iterations = 5)
public class UnsafeCounter_Benchmark {
    public Counter unsync, syncNoV, syncV, lock, atomic, unsafe, unsafeGA;

    @Setup(Level.Iteration)
    public void prepare() {
        unsync = new UnsyncCounter();
        syncNoV = new SyncNoVolatileCounter();
        syncV = new SyncVolatileCounter();
        lock = new LockCounter();
        atomic = new AtomicCounter();
        unsafe = new UnsafeCASCounter();
        unsafeGA = new UnsafeGACounter();
    }

    @Benchmark
    public void unsyncCount() {
        unsyncCounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void unsyncCounter() {
        unsync.increment();
    }

    @Benchmark
    public void syncNoVCount() {
        syncNoVCounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void syncNoVCounter() {
        syncNoV.increment();
    }

    @Benchmark
    public void syncVCount() {
        syncVCounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void syncVCounter() {
        syncV.increment();
    }

    @Benchmark
    public void lockCount() {
        lockCounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void lockCounter() {
        lock.increment();
    }

    @Benchmark
    public void atomicCount() {
        atomicCounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void atomicCounter() {
        atomic.increment();
    }

    @Benchmark
    public void unsafeCount() {
        unsafeCounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void unsafeCounter() {
        unsafe.increment();
    }

    @Benchmark
    public void unsafeGACount() {
        unsafeGACounter();
    }

    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void unsafeGACounter() {
        unsafeGA.increment();
    }

    public static void main(String[] args) throws RunnerException {
        Options baseOpts = new OptionsBuilder()
                .include(UnsafeCounter_Benchmark.class.getSimpleName())
                .threads(100)
                .jvmArgs("-ea")
                .build();

        new Runner(baseOpts).run();
    }
}

和板凳的结果:
JDK 8u20

Benchmark                                         Mode  Samples   Score    Error   Units
o.k.u.u.UnsafeCounter_Benchmark.atomicCount      thrpt        5  42.178 ± 17.643  ops/us
o.k.u.u.UnsafeCounter_Benchmark.lockCount        thrpt        5  24.044 ±  2.264  ops/us
o.k.u.u.UnsafeCounter_Benchmark.syncNoVCount     thrpt        5  22.849 ±  1.344  ops/us
o.k.u.u.UnsafeCounter_Benchmark.syncVCount       thrpt        5  20.235 ±  2.027  ops/us
o.k.u.u.UnsafeCounter_Benchmark.unsafeCount      thrpt        5  12.460 ±  1.326  ops/us
o.k.u.u.UnsafeCounter_Benchmark.unsafeGACount    thrpt        5  39.106 ±  2.966  ops/us
o.k.u.u.UnsafeCounter_Benchmark.unsyncCount      thrpt        5  93.076 ±  9.674  ops/us
o.k.u.u.UnsafeCounter_Benchmark.atomicCount       avgt        5   2.604 ±  0.133   us/op
o.k.u.u.UnsafeCounter_Benchmark.lockCount         avgt        5   4.161 ±  0.546   us/op
o.k.u.u.UnsafeCounter_Benchmark.syncNoVCount      avgt        5   4.440 ±  0.523   us/op
o.k.u.u.UnsafeCounter_Benchmark.syncVCount        avgt        5   5.073 ±  0.439   us/op
o.k.u.u.UnsafeCounter_Benchmark.unsafeCount       avgt        5   9.088 ±  5.964   us/op
o.k.u.u.UnsafeCounter_Benchmark.unsafeGACount     avgt        5   2.611 ±  0.164   us/op
o.k.u.u.UnsafeCounter_Benchmark.unsyncCount       avgt        5   1.047 ±  0.050   us/op

正如我所期望的那样，除了UnsafeCounter_Benchmark.unsafeCount与sun.misc.Unsafe.compareAndSwapLong循环一起使用while之外，大多数测量都是如此。它是最慢的锁定。

public void increment() {
    long before = counter;
    while (!unsafe.compareAndSwapLong(this, offset, before, before + 1L)) {
        before = counter;
    }
}

我建议性能低下是由于while循环和JMH引起更高的争用，但是当我通过Executors检查正确性时，得到的数字与我期望的一样:

Counter result: UnsyncCounter 97538676
Time passed in ms:259
Counter result: AtomicCounter 100000000
Time passed in ms:1805
Counter result: LockCounter 100000000
Time passed in ms:3904
Counter result: SyncNoVolatileCounter 100000000
Time passed in ms:14227
Counter result: SyncVolatileCounter 100000000
Time passed in ms:19224
Counter result: UnsafeCASCounter 100000000
Time passed in ms:8077
Counter result: UnsafeGACounter 100000000
Time passed in ms:2549

正确性测试代码:

public class UnsafeCounter_Test {
    static class CounterClient implements Runnable {
        private Counter c;
        private int num;

        public CounterClient(Counter c, int num) {
            this.c = c;
            this.num = num;
        }

        @Override
        public void run() {
            for (int i = 0; i < num; i++) {
                c.increment();
            }
        }
    }

    public static void makeTest(Counter counter) throws InterruptedException {
        int NUM_OF_THREADS = 1000;
        int NUM_OF_INCREMENTS = 100000;
        ExecutorService service = Executors.newFixedThreadPool(NUM_OF_THREADS);
        long before = System.currentTimeMillis();
        for (int i = 0; i < NUM_OF_THREADS; i++) {
            service.submit(new CounterClient(counter, NUM_OF_INCREMENTS));
        }
        service.shutdown();
        service.awaitTermination(1, TimeUnit.MINUTES);
        long after = System.currentTimeMillis();
        System.out.println("Counter result: " + counter.getClass().getSimpleName() + " " + counter.getCounter());
        System.out.println("Time passed in ms:" + (after - before));
    }

    public static void main(String[] args) throws InterruptedException {
        makeTest(new UnsyncCounter());
        makeTest(new AtomicCounter());
        makeTest(new LockCounter());
        makeTest(new SyncNoVolatileCounter());
        makeTest(new SyncVolatileCounter());
        makeTest(new UnsafeCASCounter());
        makeTest(new UnsafeGACounter());
    }
}

我知道这是一个非常糟糕的测试，但是在这种情况下，不安全的CAS比Sync变体快两倍，并且一切都按预期进行。
有人可以澄清所描述的行为吗？
有关更多信息，请参见GitHub存储库:Bench，Unsafe CAS counter

最佳答案

大声思考:人们每隔90％的乏味工作会多么频繁，而将10％(从乐趣开始的地方)留给别人，这是很了不起的!好吧，我正在享受所有的乐趣!

让我首先在我的i7-4790K 8u40 EA上重复实验:

Benchmark                                 Mode  Samples    Score    Error   Units
UnsafeCounter_Benchmark.atomicCount      thrpt        5   47.669 ± 18.440  ops/us
UnsafeCounter_Benchmark.lockCount        thrpt        5   14.497 ±  7.815  ops/us
UnsafeCounter_Benchmark.syncNoVCount     thrpt        5   11.618 ±  2.130  ops/us
UnsafeCounter_Benchmark.syncVCount       thrpt        5   11.337 ±  4.532  ops/us
UnsafeCounter_Benchmark.unsafeCount      thrpt        5    7.452 ±  1.042  ops/us
UnsafeCounter_Benchmark.unsafeGACount    thrpt        5   43.332 ±  3.435  ops/us
UnsafeCounter_Benchmark.unsyncCount      thrpt        5  102.773 ± 11.943  ops/us

的确，有关unsafeCount测试的事情似乎有些可疑。确实，您必须先验证所有数据，然后才能对其进行验证。对于nanobenchmark，您必须验证生成的代码，以查看是否实际测量了要测量的东西。在JMH中，可以使用-prof perfasm非常快速地实现和。实际上，如果您查看那里最热的unsafeCount区域，您会发现一些有趣的事情:

  0.12%    0.04%    0x00007fb45518e7d1: mov    0x10(%r10),%rax
 17.03%   23.44%    0x00007fb45518e7d5: test   %eax,0x17318825(%rip)
  0.21%    0.07%    0x00007fb45518e7db: mov    0x18(%r10),%r11    ; getfield offset
 30.33%   10.77%    0x00007fb45518e7df: mov    %rax,%r8
  0.00%             0x00007fb45518e7e2: add    $0x1,%r8
  0.01%             0x00007fb45518e7e6: cmp    0xc(%r10),%r12d    ; typecheck
                    0x00007fb45518e7ea: je     0x00007fb45518e80b ; bail to v-call
  0.83%    0.48%    0x00007fb45518e7ec: lock cmpxchg %r8,(%r10,%r11,1)
 33.27%   25.52%    0x00007fb45518e7f2: sete   %r8b
  0.12%    0.01%    0x00007fb45518e7f6: movzbl %r8b,%r8d
  0.03%    0.04%    0x00007fb45518e7fa: test   %r8d,%r8d
                    0x00007fb45518e7fd: je     0x00007fb45518e7d1 ; back branch

翻译:a)offset字段在每次迭代时都会重新读取-因为CAS内存影响意味着 Volatile 读取，因此需要悲观地重新读取该字段； b)有趣的是，出于相同的原因，也正在重新读取unsafe字段以进行类型检查。

这就是为什么高性能代码应如下所示的原因:

--- a/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
+++ b/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
@@ -5,13 +5,13 @@ import sun.misc.Unsafe;

 public class UnsafeCASCounter implements Counter {
     private volatile long counter = 0;
-    private final Unsafe unsafe = UnsafeHelper.unsafe;
-    private long offset;
-    {
+    private static final Unsafe unsafe = UnsafeHelper.unsafe;
+    private static final long offset;
+    static {
         try {
             offset = unsafe.objectFieldOffset(UnsafeCASCounter.class.getDeclaredField("counter"));
         } catch (NoSuchFieldException e) {
-            e.printStackTrace();
+            throw new IllegalStateException("Whoops!");
         }
     }

如果这样做，unsafeCount性能将立即提高:

Benchmark                              Mode  Samples   Score    Error   Units
UnsafeCounter_Benchmark.unsafeCount    thrpt        5  9.733 ± 0.673  ops/us

给定误差范围，它现在非常接近同步测试。如果现在看-prof perfasm，这是一个unsafeCount循环:

  0.08%    0.02%    0x00007f7575191900: mov    0x10(%r10),%rax
 28.09%   28.64%    0x00007f7575191904: test   %eax,0x161286f6(%rip)
  0.23%    0.08%    0x00007f757519190a: mov    %rax,%r11
                    0x00007f757519190d: add    $0x1,%r11
                    0x00007f7575191911: lock cmpxchg %r11,0x10(%r10)
 47.27%   23.48%    0x00007f7575191917: sete   %r8b
  0.10%             0x00007f757519191b: movzbl %r8b,%r8d
  0.02%             0x00007f757519191f: test   %r8d,%r8d
                    0x00007f7575191922: je     0x00007f7575191900

这个循环非常紧密，似乎没有什么可以使它运行得更快。我们花费大部分时间来加载“更新的”值并实际对其进行CAS-ing。但是我们竞争很多!为了弄清楚争用是否是主要原因，让我们添加退避:

--- a/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
+++ b/utils bench/src/main/java/org/kirmit/utils/unsafe/concurrency/UnsafeCASCounter.java
@@ -20,6 +21,7 @@ public class UnsafeCASCounter implements Counter {
         long before = counter;
         while (!unsafe.compareAndSwapLong(this, offset, before, before + 1L)) {
             before = counter;
+            Blackhole.consumeCPU(1000);
         }
     }

...运行中:

Benchmark                                 Mode  Samples    Score    Error   Units
UnsafeCounter_Benchmark.unsafeCount      thrpt        5   99.869 ± 107.933  ops/us

瞧我们在循环中需要做更多的工作，但是这使我们免于竞争。我之前曾尝试在"Nanotrusting the Nanotime"中对此进行解释，可能最好回到那里并进一步了解基准测试方法，尤其是在测量重量级操作时。这不仅突出了unsafeCount，还突出了整个实验的陷阱。

OP和感兴趣的读者的练习:解释为什么unsafeGACount和atomicCount的执行速度比其他测试快得多。您现在有了工具。

附言在具有C(C
P.P.S.时间检查:10分钟进行性能分析和其他实验，20分钟进行编写。您浪费了多少时间手动复制结果？ ;)