Agner's CPU blog

Software optimization resources | E-mail subscription to this blog | www.agner.org

Instruction Throughput on Skylake
Author: Nathan Kurz Date: 2016-07-11 22:21
OK, here's my cleaned up test code.

// gcc -g -Wall -O2 fusion.c -o fusion -DLIKWID -llikwid [may also need -lm -lpthread]
// likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

/*
 * Measurement harness.
 *
 * MEASURE_INIT()      - set up the LIKWID marker API (no-op without LIKWID).
 * MEASURE_FINI()      - close the LIKWID marker API (no-op without LIKWID).
 * MEASURE(name, code) - zero sum1 and sum2, run `code` (inside a LIKWID
 *                       marker region called `name` when built with
 *                       -DLIKWID), then print both sums.
 *
 * MEASURE is not hygienic: it assigns to and prints variables named `sum1`
 * and `sum2` that must be in scope at the call site and must be uint64_t,
 * because they are printed with PRIu64.  `name` must be a C string; it is
 * evaluated more than once.
 *
 * All three macros expand to a single statement and need a trailing ';'.
 */
#ifdef LIKWID
#include <likwid.h>

#define MEASURE_INIT()                                              \
    do {                                                            \
        likwid_markerInit();                                        \
        likwid_markerThreadInit();                                  \
    } while (0)

#define MEASURE_FINI()                                              \
    do {                                                            \
        likwid_markerClose();                                       \
    } while (0)

#define MEASURE(name, code)                                         \
    do {                                                            \
        sum1 = sum2 = 0;                                            \
        likwid_markerStartRegion(name);                             \
        code;                                                       \
        likwid_markerStopRegion(name);                              \
        printf("%s: sum1=%" PRIu64 ", sum2=%" PRIu64 "\n",          \
               name, sum1, sum2);                                   \
    } while (0)

#else // not LIKWID

#define MEASURE_INIT()                                              \
    do {                                                            \
    } while (0)

#define MEASURE_FINI()                                              \
    do {                                                            \
    } while (0)

#define MEASURE(name, code)                                         \
    do {                                                            \
        sum1 = sum2 = 0;                                            \
        code;                                                       \
        printf("%s: sum1=%" PRIu64 ", sum2=%" PRIu64 "\n",          \
               name, sum1, sum2);                                   \
    } while (0)

#endif // not LIKWID

/*
 * Benchmark loop: both additions use a memory source operand
 * (`add (mem), reg`) and each cmp is immediately followed by its
 * conditional jump.
 *
 *   in1, in2   - pointers to the 64-bit addends (read on every iteration)
 *   sum1, sum2 - uint64_t lvalues, read and updated in place
 *   max        - unsigned 64-bit bound
 *
 * Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max
 * (unsigned); otherwise it adds *in2 to sum2 and repeats while sum2 < max.
 *
 * The "memory" clobber tells the compiler that the asm reads *in1 and *in2:
 * they are reached only through register operands, so without it the
 * compiler is not told those objects are accessed.  "cc" records that
 * add/cmp modify the flags.
 */
#define ASM_TWO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max)          \
    __asm volatile ("1:\n"                                          \
                    "add (%[IN1]), %[SUM1]\n"                       \
                    "cmp %[MAX], %[SUM1]\n"                         \
                    "jae 2f\n"                                      \
                    "add (%[IN2]), %[SUM2]\n"                       \
                    "cmp %[MAX], %[SUM2]\n"                         \
                    "jb 1b\n"                                       \
                    "2:"                                            \
                    : [SUM1] "+&r" (sum1),                          \
                      [SUM2] "+&r" (sum2)                           \
                    : [IN1] "r" (in1),                              \
                      [IN2] "r" (in2),                              \
                      [MAX] "r" (max)                               \
                    : "cc", "memory")

/*
 * Benchmark loop: both additions are split into a separate `mov (mem), reg`
 * load plus a register-register add, while each cmp is still immediately
 * followed by its conditional jump.
 *
 *   in1, in2   - pointers to the 64-bit addends (read on every iteration)
 *   sum1, sum2 - uint64_t lvalues, read and updated in place
 *   max        - unsigned 64-bit bound
 *   tmp1, tmp2 - uint64_t lvalues used as scratch registers; their values
 *                on exit are whatever was last loaded
 *
 * Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max
 * (unsigned); otherwise it adds *in2 to sum2 and repeats while sum2 < max.
 *
 * The "memory" clobber tells the compiler that the asm reads *in1 and *in2:
 * they are reached only through register operands, so without it the
 * compiler is not told those objects are accessed.  "cc" records that
 * add/cmp modify the flags.
 */
#define ASM_NO_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2) \
    __asm volatile ("1:\n"                                          \
                    "mov (%[IN1]), %[TMP1]\n"                       \
                    "add %[TMP1], %[SUM1]\n"                        \
                    "cmp %[MAX], %[SUM1]\n"                         \
                    "jae 2f\n"                                      \
                    "mov (%[IN2]), %[TMP2]\n"                       \
                    "add %[TMP2], %[SUM2]\n"                        \
                    "cmp %[MAX], %[SUM2]\n"                         \
                    "jb 1b\n"                                       \
                    "2:"                                            \
                    : [TMP1] "=&r" (tmp1),                          \
                      [TMP2] "=&r" (tmp2),                          \
                      [SUM1] "+&r" (sum1),                          \
                      [SUM2] "+&r" (sum2)                           \
                    : [IN1] "r" (in1),                              \
                      [IN2] "r" (in2),                              \
                      [MAX] "r" (max)                               \
                    : "cc", "memory")

/*
 * Benchmark loop: the first addition uses a memory source operand
 * (`add (mem), reg`), the second is split into a separate load plus a
 * register-register add; each cmp is immediately followed by its
 * conditional jump.
 *
 *   in1, in2   - pointers to the 64-bit addends (read on every iteration)
 *   sum1, sum2 - uint64_t lvalues, read and updated in place
 *   max        - unsigned 64-bit bound
 *   tmp        - uint64_t lvalue used as a scratch register
 *
 * Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max
 * (unsigned); otherwise it adds *in2 to sum2 and repeats while sum2 < max.
 *
 * The "memory" clobber tells the compiler that the asm reads *in1 and *in2:
 * they are reached only through register operands, so without it the
 * compiler is not told those objects are accessed.  "cc" records that
 * add/cmp modify the flags.
 */
#define ASM_ONE_MICRO_TWO_MACRO(in1, sum1, in2, sum2, max, tmp)     \
    __asm volatile ("1:\n"                                          \
                    "add (%[IN1]), %[SUM1]\n"                       \
                    "cmp %[MAX], %[SUM1]\n"                         \
                    "jae 2f\n"                                      \
                    "mov (%[IN2]), %[TMP]\n"                        \
                    "add %[TMP], %[SUM2]\n"                         \
                    "cmp %[MAX], %[SUM2]\n"                         \
                    "jb 1b\n"                                       \
                    "2:"                                            \
                    : [TMP] "=&r" (tmp),                            \
                      [SUM1] "+&r" (sum1),                          \
                      [SUM2] "+&r" (sum2)                           \
                    : [IN1] "r" (in1),                              \
                      [IN2] "r" (in2),                              \
                      [MAX] "r" (max)                               \
                    : "cc", "memory")

/*
 * Benchmark loop: the first addition uses a memory source operand
 * (`add (mem), reg`); the load for the second addition is placed between
 * the first cmp and its jae, so only the second cmp is immediately followed
 * by its conditional jump.  (mov does not modify the flags, so jae still
 * tests the result of the first cmp.)
 *
 *   in1, in2   - pointers to the 64-bit addends (read on every iteration)
 *   sum1, sum2 - uint64_t lvalues, read and updated in place
 *   max        - unsigned 64-bit bound
 *   tmp        - uint64_t lvalue used as a scratch register
 *
 * Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max
 * (unsigned); otherwise it adds *in2 to sum2 and repeats while sum2 < max.
 *
 * The second addend is loaded from in2, matching the other ASM_* variants;
 * loading it from in1 would leave the in2 operand unused and give a
 * different sum2 whenever *in1 != *in2.
 *
 * The "memory" clobber tells the compiler that the asm reads *in1 and *in2:
 * they are reached only through register operands, so without it the
 * compiler is not told those objects are accessed.  "cc" records that
 * add/cmp modify the flags.
 */
#define ASM_ONE_MICRO_ONE_MACRO(in1, sum1, in2, sum2, max, tmp)     \
    __asm volatile ("1:\n"                                          \
                    "add (%[IN1]), %[SUM1]\n"                       \
                    "cmp %[MAX], %[SUM1]\n"                         \
                    "mov (%[IN2]), %[TMP]\n"                        \
                    "jae 2f\n"                                      \
                    "add %[TMP], %[SUM2]\n"                         \
                    "cmp %[MAX], %[SUM2]\n"                         \
                    "jb 1b\n"                                       \
                    "2:"                                            \
                    : [TMP] "=&r" (tmp),                            \
                      [SUM1] "+&r" (sum1),                          \
                      [SUM2] "+&r" (sum2)                           \
                    : [IN1] "r" (in1),                              \
                      [IN2] "r" (in2),                              \
                      [MAX] "r" (max)                               \
                    : "cc", "memory")

/*
 * Two separate loads and adds, two non-fused cmp then jcc.
 *
 * Both additions are split into a `mov (mem), reg` load plus a
 * register-register add, and a load is placed between every cmp and its
 * conditional jump.  The load of *in1 for the next iteration is therefore
 * issued at the bottom of the loop (and once before the loop is entered).
 * mov does not modify the flags, so each jump still tests the preceding cmp.
 *
 *   in1, in2   - pointers to the 64-bit addends (read on every iteration)
 *   sum1, sum2 - uint64_t lvalues, read and updated in place
 *   max        - unsigned 64-bit bound
 *   tmp1, tmp2 - uint64_t lvalues used as scratch registers
 *
 * Each iteration adds *in1 to sum1 and leaves the loop if sum1 >= max
 * (unsigned); otherwise it adds *in2 to sum2 and repeats while sum2 < max.
 *
 * The "memory" clobber tells the compiler that the asm reads *in1 and *in2:
 * they are reached only through register operands, so without it the
 * compiler is not told those objects are accessed.  "cc" records that
 * add/cmp modify the flags.
 */
#define ASM_NO_MICRO_NO_MACRO(in1, sum1, in2, sum2, max, tmp1, tmp2) \
    __asm volatile ("mov (%[IN1]), %[TMP1]\n"                       \
                    "1:\n"                                          \
                    "add %[TMP1], %[SUM1]\n"                        \
                    "cmp %[MAX], %[SUM1]\n"                         \
                    "mov (%[IN2]), %[TMP2]\n"                       \
                    "jae 2f\n"                                      \
                    "add %[TMP2], %[SUM2]\n"                        \
                    "cmp %[MAX], %[SUM2]\n"                         \
                    "mov (%[IN1]), %[TMP1]\n"                       \
                    "jb 1b\n"                                       \
                    "2:"                                            \
                    : [TMP1] "=&r" (tmp1),                          \
                      [TMP2] "=&r" (tmp2),                          \
                      [SUM1] "+&r" (sum1),                          \
                      [SUM2] "+&r" (sum2)                           \
                    : [IN1] "r" (in1),                              \
                      [IN2] "r" (in2),                              \
                      [MAX] "r" (max)                               \
                    : "cc", "memory")

/*
 * Runs each loop variant once under MEASURE, so that every variant is
 * reported (and, with -DLIKWID, counted) as its own named region.
 *
 * Both addends are 1 and the bound is 10000000, so every variant is expected
 * to finish with sum1 == 10000000 and sum2 == 9999999.
 *
 * Returns 0.
 */
int main(void)
{
    uint64_t tmp, tmp1, tmp2;   /* scratch registers for the asm loops */
    uint64_t sum1, sum2;        /* reset to 0 by MEASURE before each run */
    uint64_t in1 = 1;
    uint64_t in2 = 1;
    uint64_t max = 10000000;

    MEASURE_INIT();

    MEASURE("two_micro_two_macro",
            ASM_TWO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max));

    MEASURE("one_micro_two_macro",
            ASM_ONE_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp));

    MEASURE("one_micro_one_macro",
            ASM_ONE_MICRO_ONE_MACRO(&in1, sum1, &in2, sum2, max, tmp));

    MEASURE("no_micro_two_macro",
            ASM_NO_MICRO_TWO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));

    MEASURE("no_micro_no_macro",
            ASM_NO_MICRO_NO_MACRO(&in1, sum1, &in2, sum2, max, tmp1, tmp2));

    MEASURE_FINI();

    return 0;
}

And here's what I see on Skylake:

nate@skylake:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
CPU name:	Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
CPU type:	Intel Skylake processor
CPU clock:	3.41 GHz
--------------------------------------------------------------------------------
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
no_micro_two_macro: sum1=10000000, sum2=9999999
no_micro_no_macro: sum1=10000000, sum2=9999999
--------------------------------------------------------------------------------
================================================================================
Group 1 Custom: Region two_micro_two_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 4.000816e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 6.000806e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 6.000724e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000056e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 6.000540e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.001363e+07 |
================================================================================
Group 1 Custom: Region one_micro_two_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 5.000502e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 6.000506e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 6.000471e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000040e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 7.000316e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.334216e+07 |
================================================================================
Group 1 Custom: Region one_micro_one_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 6.000435e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 7.000444e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 7.000445e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000039e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 7.000310e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.672351e+07 |
================================================================================
Group 1 Custom: Region no_micro_two_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 6.000429e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 6.000438e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 6.000438e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000039e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 8.000307e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 1.500636e+07 |
================================================================================
Group 1 Custom: Region no_micro_no_macro
================================================================================
|       UOPS_ISSUED_ANY      |   PMC0  | 8.000476e+07 |
|     UOPS_EXECUTED_CORE     |   PMC1  | 8.000483e+07 |
|      UOPS_RETIRED_ALL      |   PMC2  | 8.000466e+07 |
| BR_INST_RETIRED_NEAR_TAKEN |   PMC3  | 1.000039e+07 |
|      INSTR_RETIRED_ANY     |  FIXC0  | 8.000312e+07 |
|    CPU_CLK_UNHALTED_CORE   |  FIXC1  | 2.000775e+07 |

And on Haswell:

nate@haswell:~/src$ likwid-perfctr -m -g UOPS_ISSUED_ANY:PMC0,UOPS_EXECUTED_CORE:PMC1,UOPS_RETIRED_ALL:PMC2,BR_INST_RETIRED_NEAR_TAKEN:PMC3 -C 1 fusion
-------------------------------------------------------------
-------------------------------------------------------------
CPU type:	Intel Core Haswell processor
CPU clock:	3.39 GHz
-------------------------------------------------------------
fusion
two_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_two_macro: sum1=10000000, sum2=9999999
one_micro_one_macro: sum1=10000000, sum2=9999999
no_micro_two_macro: sum1=10000000, sum2=9999999
no_micro_no_macro: sum1=10000000, sum2=9999999
=====================
Region: two_micro_two_macro
=====================
|      UOPS_ISSUED_ANY       | 4.00061e+07 |
|     UOPS_EXECUTED_CORE     | 6.00062e+07 |
|      UOPS_RETIRED_ALL      | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 6.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.7392e+07  |
=====================
Region: one_micro_two_macro
=====================
+----------------------------+-------------+
|           Event            |   core 1    |
+----------------------------+-------------+
|      UOPS_ISSUED_ANY       | 5.00062e+07 |
|     UOPS_EXECUTED_CORE     | 6.00062e+07 |
|      UOPS_RETIRED_ALL      | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 7.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.4247e+07  |
=====================
Region: one_micro_one_macro
=====================
+----------------------------+-------------+
|           Event            |   core 1    |
+----------------------------+-------------+
|      UOPS_ISSUED_ANY       | 6.00065e+07 |
|     UOPS_EXECUTED_CORE     | 7.00065e+07 |
|      UOPS_RETIRED_ALL      | 7.00048e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 7.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.69403e+07 |
=====================
Region: no_micro_two_macro
=====================
+----------------------------+-------------+
|           Event            |   core 1    |
+----------------------------+-------------+
|      UOPS_ISSUED_ANY       | 6.00062e+07 |
|     UOPS_EXECUTED_CORE     | 6.00062e+07 |
|      UOPS_RETIRED_ALL      | 6.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 8.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 1.57365e+07 |
=====================
Region: no_micro_no_macro
=====================
|      UOPS_ISSUED_ANY       | 8.00062e+07 |
|     UOPS_EXECUTED_CORE     | 8.00062e+07 |
|      UOPS_RETIRED_ALL      | 8.00046e+07 |
| BR_INST_RETIRED_NEAR_TAKEN | 1.00002e+07 |
|     INSTR_RETIRED_ANY      | 8.00013e+07 |
|   CPU_CLK_UNHALTED_CORE    | 2.0043e+07  |
+----------------------------+-------------+

The main thing to notice is that on Skylake the "two_micro_two_macro" variant is fastest and executes at 1 cycle per iteration, while on Haswell it is slower than a couple of options with less fusion. BR_INST_RETIRED_NEAR_TAKEN shows the number of loop iterations. Run time in cycles is shown by CPU_CLK_UNHALTED_CORE. The difference between INSTR_RETIRED_ANY and UOPS_RETIRED_ALL shows the effect of macro-fusion of CMP/JCC. The difference between UOPS_ISSUED_ANY and UOPS_EXECUTED_CORE shows the effect of micro-fusion of LOAD/ADD. UOPS_EXECUTED_CORE and UOPS_RETIRED_ALL are the same on both machines, showing that there is no branch misprediction occurring.

 
thread Test results for Broadwell and Skylake new - Agner - 2015-12-26
replythread Sustained 64B loads per cycle on Haswell & Sky new - Nathan Kurz - 2015-12-26
last replythread Sustained 64B loads per cycle on Haswell & Sky new - Agner - 2015-12-27
last replythread Sustained 64B loads per cycle on Haswell & Sky new - Nathan Kurz - 2015-12-27
reply Sustained 64B loads per cycle on Haswell & Sky new - John D. McCalpin - 2016-01-04
reply Sustained 64B loads per cycle on Haswell & Sky new - T - 2016-06-18
last reply Sustained 64B loads per cycle on Haswell & Sky new - Jens Nurmann - 2017-01-12
replythread Test results for Broadwell and Skylake new - Peter Cordes - 2015-12-28
last reply Test results for Broadwell and Skylake new - Agner - 2015-12-29
replythread Test results for Broadwell and Skylake new - Tacit Murky - 2016-01-04
last replythread Test results for Broadwell and Skylake new - Agner - 2016-01-05
last replythread Test results for Broadwell and Skylake new - Tacit Murky - 2016-03-09
last reply Test results for Broadwell and Skylake new - Tacit Murky - 2016-06-05
replythread Minor bug in the microarchitecture manual new - SHK - 2016-01-10
last reply Minor bug in the microarchitecture manual new - Agner - 2016-01-16
replythread Test results for Broadwell and Skylake new - John D. McCalpin - 2016-01-12
last replythread Test results for Broadwell and Skylake new - Jess - 2016-02-11
last reply Description of discrepancy new - Nathan Kurz - 2016-03-13
reply Test results for Broadwell and Skylake new - Russell Van Zandt - 2016-02-22
replythread Instruction Throughput on Skylake new - Nathan Kurz - 2016-04-23
last replythread Instruction Throughput on Skylake new - Agner - 2016-04-24
last replythread Instruction Throughput on Skylake new - Nathan Kurz - 2016-04-26
last replythread Instruction Throughput on Skylake new - Agner - 2016-04-27
last replythread Instruction Throughput on Skylake new - T - 2016-06-18
reply Instruction Throughput on Skylake new - Agner - 2016-06-19
last replythread Instruction Throughput on Skylake new - Nathan Kurz - 2016-07-08
last replythread Instruction Throughput on Skylake - Nathan Kurz - 2016-07-11
replythread Instruction Throughput on Skylake new - Tacit Murky - 2016-07-17
last replythread Haswell register renaming / unfused limits new - Peter Cordes - 2017-05-11
reply Haswell register renaming / unfused limits new - Tacit Murky - 2017-05-11
last reply Haswell register renaming / unfused limits new - Peter Cordes - 2017-05-12
last reply Instruction Throughput on Skylake new - T - 2016-08-08
reply Unlamination of micro-fused ops in SKL and earlier new - Travis - 2016-09-09
replythread 32B store-forwarding is slower than 16B new - Peter Cordes - 2017-05-11
last replythread 32B store-forwarding is slower than 16B new - Fabian Giesen - 2017-06-28
last reply 32B store-forwarding is slower than 16B new - Agner - 2017-06-28
reply SHL/SHR r,cl latency is lower than throughput new - Peter Cordes - 2017-05-27
replythread Test results for Broadwell and Skylake new - Bulat Ziganshin - 2017-05-30
last replythread Test results for Broadwell and Skylake new - Agner - 2017-05-30
last replythread Test results for Broadwell and Skylake new - Bulat Ziganshin - 2017-05-30
last replythread Test results for Broadwell and Skylake new - - - 2017-06-19
replythread Test results for Broadwell and Skylake new - Jorcy Neto - 2017-06-20
last reply Test results for Broadwell and Skylake new - Jorcy Neto - 2017-06-20
replythread Test results for Broadwell and Skylake new - Bulat Ziganshin - 2017-06-21
reply Test results for Broadwell and Skylake new - Jorcy Neto - 2017-06-26
last replythread Test results for Broadwell and Skylake new - - - 2017-07-05
last replythread Test results for Broadwell and Skylake new - - - 2017-07-12
last reply Test results for Broadwell and Skylake new - Jorcy Neto - 2017-07-19
last replythread Test results for Broadwell and Skylake new - Xing Liu - 2017-06-28
last replythread Test results for Broadwell and Skylake new - Travis - 2017-06-29
last replythread Test results for Broadwell and Skylake new - Xing Liu - 2017-06-30
last reply Test results for Broadwell and Skylake new - Travis - 2017-07-13
reply Official information about uOps and latency SNB+ new - SEt - 2017-07-17
last replythread Test results for Broadwell and Skylake new - Armand Behroozi - 2020-10-07
last reply Test results for Broadwell and Skylake new - Agner - 2020-10-11