Skip to content

Instantly share code, notes, and snippets.

@Hermann-SW
Created June 3, 2026 21:01
Show Gist options
  • Select an option

  • Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.

Select an option

Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for Zen4 AMD 16C/32T 7950X CPUs
/*
f=AVX512_VNNI.vsqrtpd
g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f
cpplint --filter=-legal/copyright $f.cpp
cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config
echo off | sudo tee /sys/devices/system/cpu/smt/control
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f
Output:
hermann@7950x:~$ ./$f
Starting hardware-bound benchmark using 16 threads...
... [AVX512F] vsqrtpd(mm512d,mm512d) completed
https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png
-------------------------------------------
Execution Time: 4.75232 seconds
Counter: 25,600,000,000
Total Compute: 204.8 double sqrt GFLOPS (counter * 8)
Performance: 43.0947 GFLOPS
-------------------------------------------
hermann@7950x:~$
*/
#include <inttypes.h>
#include <omp.h>
#include <chrono> // NOLINT [build/c++11]
#include <iostream>
#include <locale>
#include <stdexcept>
/**
 * Micro-benchmark for 512-bit packed double-precision square root throughput.
 *
 * Every OpenMP thread runs `iterations` trips of a loop that issues eight
 * `vsqrtpd` instructions, one on each of zmm0..zmm7, so the eight
 * instructions carry no data dependency on one another.  A zmm register holds
 * eight doubles, so one loop trip accounts for 8 * 8 square roots per thread.
 *
 * The registers are not initialised by this function; the instructions
 * operate on whatever the registers happen to contain.
 *
 * Only the parallel region is timed.  The operation count is derived from
 * the number of threads that really executed the region rather than from
 * omp_get_max_threads(), which is only an upper bound.
 *
 * @return 0 always.
 */
int main(int, char**) {
  const int iterations = 200000000;  // 2*10^8 loop trips per thread

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // Incremented once by every thread of the team via the reduction below.
  int team_size = 0;

  // steady_clock is monotonic, which is what an interval measurement needs.
  const auto start_time = std::chrono::steady_clock::now();

#pragma omp parallel reduction(+ : team_size)
  {
    team_size += 1;
    for (int i = 0; i < iterations; ++i) {
      // volatile + clobber keep the compiler from dropping or merging these.
      asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0");
      asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1");
      asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2");
      asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3");
      asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4");
      asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5");
      asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6");
      asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7");
    }
  }

  // Stop the clock before any console output so I/O is not measured.
  const std::chrono::duration<double> duration =
      std::chrono::steady_clock::now() - start_time;

  std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n";
  std::cout <<
      "https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n";

  // 8 instructions per trip, 8 doubles per instruction, per thread.
  const int64_t ops_per_loop = int64_t{8} * 8 * team_size;
  const int64_t total_ops = ops_per_loop * iterations;
  const int64_t instruction_count = total_ops / 8;  // executed vsqrtpd count
  const double giga_ops = total_ops / 1e9;
  const double performance = giga_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";

  // The user's locale supplies thousands separators for the numbers below.
  // std::locale("") throws if the environment names an unavailable locale;
  // in that case keep the default locale instead of losing the results.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    // Fall through: output stays in the default locale without grouping.
  }

  std::cout << "Counter: " << instruction_count << "\n";
  std::cout << "Total Compute: " << giga_ops
            << " double sqrt GFLOPS (counter * 8)\n";
  std::cout << "Performance: " << performance << " GFLOPS\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
@Hermann-SW

Hermann-SW commented Jun 3, 2026

Copy link
Copy Markdown
Author

The real-world OpenMP program mona-lisa100K.cpp computes the TSP tour length of the 100,000-city instance mona-lisa100K.tsp 500,000 times. That amounts to 50*10^9 double sqrt operations (plus other work), and it reaches 50*10^9 / 1.28413 s = 38.94 GFLOPS, which is very close to the peak of 43 double sqrt GFLOPS!

C++ loop:

#pragma omp declare reduction(v512_add : __m512i : \
    omp_out = _mm512_add_epi32(omp_out, omp_in)) \
    initializer(omp_priv = _mm512_setzero_si512())
...
  #pragma omp parallel for reduction(v512_add:acc)
  for (int i = 0; i < 2*N; i+=32) {
    __m512i a = _mm512_load_si512((const __m512i*)(xy_even+i));
    __m512i b = _mm512_load_si512((const __m512i*)(xy_odd+i));

    __m512i dxy = _mm512_sub_epi16(a, b);

    __m512i aux = _mm512_dpwssd_epi32(_mm512_setzero_si512(), dxy, dxy);

    __m512d low_doubles  = _mm512_cvtepi32_pd(_mm512_castsi512_si256(aux));
    __m256i high_lanes   = _mm512_extracti64x4_epi64(aux, 1);
    __m512d high_doubles = _mm512_cvtepi32_pd(high_lanes);

    __m512d sqrt_low  = _mm512_sqrt_pd(low_doubles);                                 <---
    __m512d sqrt_high = _mm512_sqrt_pd(high_doubles);                                <---

    __m512d res_low  = _mm512_add_pd(sqrt_low, half_pd);
    __m512d res_high = _mm512_add_pd(sqrt_high, half_pd);

    __m256i int_low  = _mm512_mask_cvtt_roundpd_epi32
                         (_mm256_undefined_si256(), 0xFF, res_low,
                             (_MM_FROUND_NO_EXC));
    __m256i int_high = _mm512_mask_cvtt_roundpd_epi32
                         (_mm256_undefined_si256(), 0xFF, res_high,
                           (_MM_FROUND_NO_EXC));

    __m512i euc_2d = _mm512_inserti64x4(_mm512_castsi256_si512(int_low),
                                        int_high, 1);

    acc = _mm512_add_epi32(acc, euc_2d);
  }

objdump loop:

    14c0:       62 f1 fd 48 6f 2c 01    vmovdqa64 (%rcx,%rax,1),%zmm5
    14c7:       62 f1 55 48 f9 04 02    vpsubw (%rdx,%rax,1),%zmm5,%zmm0
    14ce:       62 f1 7d 48 6f cc       vmovdqa32 %zmm4,%zmm1
    14d4:       48 83 c0 40             add    $0x40,%rax
    14d8:       62 f2 7d 48 52 c8       vpdpwssd %zmm0,%zmm0,%zmm1
    14de:       62 f1 7e 48 e6 c1       vcvtdq2pd %ymm1,%zmm0
    14e4:       62 f3 fd 48 3b c9 01    vextracti64x4 $0x1,%zmm1,%ymm1
    14eb:       62 f1 fd 48 51 c0       vsqrtpd %zmm0,%zmm0                          <---
    14f1:       62 f1 e5 48 58 c0       vaddpd %zmm0,%zmm3,%zmm0
    14f7:       62 f1 7e 48 e6 c9       vcvtdq2pd %ymm1,%zmm1
    14fd:       62 f1 fd 48 51 c9       vsqrtpd %zmm1,%zmm1                          <---
    1503:       62 f1 e5 48 58 c9       vaddpd %zmm1,%zmm3,%zmm1
    1509:       62 f1 fd 18 e6 c0       vcvttpd2dq {sae},%zmm0,%ymm0
    150f:       62 f1 fd 18 e6 c9       vcvttpd2dq {sae},%zmm1,%ymm1
    1515:       62 f3 fd 48 3a c1 01    vinserti64x4 $0x1,%ymm1,%zmm0,%zmm0
    151c:       62 f1 7d 48 fe c2       vpaddd %zmm2,%zmm0,%zmm0
    1522:       62 f1 fd 48 6f d0       vmovdqa64 %zmm0,%zmm2
    1528:       48 3d 80 1a 06 00       cmp    $0x61a80,%rax
    152e:       75 90                   jne    14c0 <_Z6bench2v._omp_fn.0+0x70>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment