Created
June 3, 2026 21:01
-
-
Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for Zen4 AMD 16C/32T 7950X CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX512_VNNI.vsqrtpd | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f | |
| Output: | |
| hermann@7950x:~$ ./$f | |
| Starting hardware-bound benchmark using 16 threads... | |
| ... [AVX512F] vsqrtpd(mm512d,mm512d) completed | |
| https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png | |
| ------------------------------------------- | |
| Execution Time: 4.75232 seconds | |
| Counter: 25,600,000,000 | |
| Total Compute: 204.8 double sqrt GFLOPS (counter * 8) | |
| Performance: 43.0947 GFLOPS | |
| ------------------------------------------- | |
| hermann@7950x:~$ | |
| */ | |
#include <inttypes.h>
#include <omp.h>
#include <chrono>  // NOLINT [build/c++11]
#include <iostream>
#include <locale>
#include <stdexcept>
/**
 * Benchmark of packed double-precision square root throughput.
 *
 * Every OpenMP thread executes the same loop of eight `vsqrtpd` instructions,
 * one per register zmm0..zmm7, `iterations` times. The wall-clock time of the
 * parallel region is then converted into a "double sqrt GFLOPS" figure, where
 * one `vsqrtpd` on a 512-bit register counts as 8 double sqrt operations.
 *
 * Command line arguments are ignored.
 *
 * @return 0 always.
 */
int main(int, char**) {
  const int iterations = 200000000;  // 2*10^8
  const int sqrt_instr_per_iteration = 8;  // one vsqrtpd for each of zmm0..zmm7
  const int doubles_per_zmm = 8;           // 512-bit register / 64-bit double

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // Number of threads that really executed the loop. omp_get_max_threads() is
  // only an upper bound for the team size, so the team is counted instead:
  // every thread contributes 1 to the reduction.
  int threads_used = 0;

  const auto start_time = std::chrono::high_resolution_clock::now();
  #pragma omp parallel reduction(+:threads_used)
  {
    threads_used += 1;
    for (int i = 0; i < iterations; ++i) {
      // Each statement reads and writes only its own register, so none of the
      // eight square roots consumes the result of another one.
      // NOTE(review): no input operands are declared, so the values being
      // square-rooted are whatever the registers happen to contain.
      asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0");
      asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1");
      asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2");
      asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3");
      asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4");
      asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5");
      asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6");
      asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7");
    }
  }
  // Stop the clock before any console output so that I/O is not part of the
  // measured interval.
  const auto end_time = std::chrono::high_resolution_clock::now();
  const std::chrono::duration<double> duration = end_time - start_time;

  std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n";
  std::cout <<
    "https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n";

  // Widen to 64 bit before multiplying by `iterations`; the totals do not fit
  // into a 32-bit int.
  const int64_t instr_per_iteration =
      static_cast<int64_t>(sqrt_instr_per_iteration) * threads_used;
  const int64_t instruction_count = instr_per_iteration * iterations;
  const int64_t total_ops = instruction_count * doubles_per_zmm;
  const double giga_ops = static_cast<double>(total_ops) / 1e9;
  const double performance = giga_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";

  // The user's locale is wanted for digit grouping of the numbers below.
  // std::locale("") throws if the environment names a locale that is not
  // installed; in that case keep the stream's current locale instead of
  // aborting and losing the measurement.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Fall through: output stays ungrouped.
  }

  // "Counter" is the number of executed vsqrtpd instructions; "Total Compute"
  // is the total number of double sqrt operations in units of 10^9.
  std::cout << "Counter: " << instruction_count << "\n";
  std::cout << "Total Compute: " << giga_ops
            << " double sqrt GFLOPS (counter * 8)\n";
  std::cout << "Performance: " << performance << " GFLOPS\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Author
Main loop in C++ with inline assembly (which gives control over the 512-bit register names):
hermann@7950x:~$ sed -n "/pragma/,/^ }/p" $f.cpp
#pragma omp parallel
{
for (int i = 0; i < iterations; ++i) {
asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0");
asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1");
asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2");
asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3");
asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4");
asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5");
asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6");
asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7");
}
}
hermann@7950x:~$
Same loop with objdump:
hermann@7950x:~$ objdump -d AVX512_VNNI.vsqrtpd | sed -n "/vsqrtpd /,/jne/p"
1570: 62 f1 fd 48 51 c0 vsqrtpd %zmm0,%zmm0
1576: 62 f1 fd 48 51 c9 vsqrtpd %zmm1,%zmm1
157c: 62 f1 fd 48 51 d2 vsqrtpd %zmm2,%zmm2
1582: 62 f1 fd 48 51 db vsqrtpd %zmm3,%zmm3
1588: 62 f1 fd 48 51 e4 vsqrtpd %zmm4,%zmm4
158e: 62 f1 fd 48 51 ed vsqrtpd %zmm5,%zmm5
1594: 62 f1 fd 48 51 f6 vsqrtpd %zmm6,%zmm6
159a: 62 f1 fd 48 51 ff vsqrtpd %zmm7,%zmm7
15a0: 83 e8 01 sub $0x1,%eax
15a3: 75 cb jne 1570 <main._omp_fn.0+0x10>
hermann@7950x:~$
Author
The real-world OpenMP program mona-lisa100K.cpp computes the TSP tour length of the 100,000 cities in mona-lisa100K.tsp 500,000 times. That takes 50*10^9 double sqrt operations (plus other work), and the program reaches 50*10^9/1.28413s = 38.94 GFLOPS, very close to the peak of 43 double sqrt GFLOPS!
C++ loop:
#pragma omp declare reduction(v512_add : __m512i : \
omp_out = _mm512_add_epi32(omp_out, omp_in)) \
initializer(omp_priv = _mm512_setzero_si512())
...
#pragma omp parallel for reduction(v512_add:acc)
for (int i = 0; i < 2*N; i+=32) {
__m512i a = _mm512_load_si512((const __m512i*)(xy_even+i));
__m512i b = _mm512_load_si512((const __m512i*)(xy_odd+i));
__m512i dxy = _mm512_sub_epi16(a, b);
__m512i aux = _mm512_dpwssd_epi32(_mm512_setzero_si512(), dxy, dxy);
__m512d low_doubles = _mm512_cvtepi32_pd(_mm512_castsi512_si256(aux));
__m256i high_lanes = _mm512_extracti64x4_epi64(aux, 1);
__m512d high_doubles = _mm512_cvtepi32_pd(high_lanes);
__m512d sqrt_low = _mm512_sqrt_pd(low_doubles); <---
__m512d sqrt_high = _mm512_sqrt_pd(high_doubles); <---
__m512d res_low = _mm512_add_pd(sqrt_low, half_pd);
__m512d res_high = _mm512_add_pd(sqrt_high, half_pd);
__m256i int_low = _mm512_mask_cvtt_roundpd_epi32
(_mm256_undefined_si256(), 0xFF, res_low,
(_MM_FROUND_NO_EXC));
__m256i int_high = _mm512_mask_cvtt_roundpd_epi32
(_mm256_undefined_si256(), 0xFF, res_high,
(_MM_FROUND_NO_EXC));
__m512i euc_2d = _mm512_inserti64x4(_mm512_castsi256_si512(int_low),
int_high, 1);
acc = _mm512_add_epi32(acc, euc_2d);
}
objdump loop:
14c0: 62 f1 fd 48 6f 2c 01 vmovdqa64 (%rcx,%rax,1),%zmm5
14c7: 62 f1 55 48 f9 04 02 vpsubw (%rdx,%rax,1),%zmm5,%zmm0
14ce: 62 f1 7d 48 6f cc vmovdqa32 %zmm4,%zmm1
14d4: 48 83 c0 40 add $0x40,%rax
14d8: 62 f2 7d 48 52 c8 vpdpwssd %zmm0,%zmm0,%zmm1
14de: 62 f1 7e 48 e6 c1 vcvtdq2pd %ymm1,%zmm0
14e4: 62 f3 fd 48 3b c9 01 vextracti64x4 $0x1,%zmm1,%ymm1
14eb: 62 f1 fd 48 51 c0 vsqrtpd %zmm0,%zmm0 <---
14f1: 62 f1 e5 48 58 c0 vaddpd %zmm0,%zmm3,%zmm0
14f7: 62 f1 7e 48 e6 c9 vcvtdq2pd %ymm1,%zmm1
14fd: 62 f1 fd 48 51 c9 vsqrtpd %zmm1,%zmm1 <---
1503: 62 f1 e5 48 58 c9 vaddpd %zmm1,%zmm3,%zmm1
1509: 62 f1 fd 18 e6 c0 vcvttpd2dq {sae},%zmm0,%ymm0
150f: 62 f1 fd 18 e6 c9 vcvttpd2dq {sae},%zmm1,%ymm1
1515: 62 f3 fd 48 3a c1 01 vinserti64x4 $0x1,%ymm1,%zmm0,%zmm0
151c: 62 f1 7d 48 fe c2 vpaddd %zmm2,%zmm0,%zmm0
1522: 62 f1 fd 48 6f d0 vmovdqa64 %zmm0,%zmm2
1528: 48 3d 80 1a 06 00 cmp $0x61a80,%rax
152e: 75 90 jne 14c0 <_Z6bench2v._omp_fn.0+0x70>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
One vsqrtpd does 8 double sqrt computations:
https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png