Created
June 3, 2026 21:01
-
-
Save Hermann-SW/c4e40e823d274d03094d5e6d5071017d to your computer and use it in GitHub Desktop.
Demonstrate maximal "double sqrt" GFLOPS performance for Zen4 AMD 16C/32T 7950X CPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| f=AVX512_VNNI.vsqrtpd | |
| g++ -O3 -fopenmp -Wall -Wextra -pedantic $f.cpp -o $f | |
| cpplint --filter=-legal/copyright $f.cpp | |
| cppcheck --enable=all --suppress=missingIncludeSystem $f.cpp --check-config | |
| echo off | sudo tee /sys/devices/system/cpu/smt/control | |
| echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid | |
| perf stat -a -e fp_ops_retired_by_width.pack_512_uops_retired,cycles,instructions,task-clock ./$f | |
| Output: | |
| hermann@7950x:~$ ./$f | |
| Starting hardware-bound benchmark using 16 threads... | |
| ... [AVX512F] vsqrtpd(mm512d,mm512d) completed | |
| https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png | |
| ------------------------------------------- | |
| Execution Time: 4.75232 seconds | |
| Counter: 25,600,000,000 | |
| Total Compute: 204.8 double sqrt GFLOPS (counter * 8) | |
| Performance: 43.0947 GFLOPS | |
| ------------------------------------------- | |
| hermann@7950x:~$ | |
| */ | |
#include <omp.h>
#include <inttypes.h>
#include <iostream>
#include <chrono>  // NOLINT [build/c++11]
#include <locale>
#include <stdexcept>
// Throughput benchmark for the AVX-512 packed double square root instruction.
//
// Every OpenMP thread executes `iterations` loop trips; each trip issues eight
// `vsqrtpd` instructions on eight different zmm registers (zmm0..zmm7), so the
// eight instructions within a trip do not depend on each other. One 512-bit
// vsqrtpd processes 8 doubles, hence a trip accounts for 8 * 8 = 64 double
// square roots per thread.
//
// Prints the wall-clock time of the parallel region, the total number of
// vsqrtpd instructions, the total number of double square roots (in units of
// 10^9) and the resulting rate per second. Always returns 0.
int main(int, char**) {
  const int iterations = 200000000;  // 2*10^8 loop trips per thread

  std::cout << "Starting hardware-bound benchmark using "
            << omp_get_max_threads() << " threads...\n";

  // omp_get_max_threads() is only an upper bound on the team size, so the
  // size of the team that really ran is recorded from inside the region.
  int threads_used = 0;

  // steady_clock is monotonic; high_resolution_clock may be an alias of the
  // adjustable system clock and is therefore unsuitable for intervals.
  const auto start_time = std::chrono::steady_clock::now();
  #pragma omp parallel
  {
    // Only thread 0 writes the shared variable; it is read after the
    // implicit barrier that ends the parallel region.
    if (omp_get_thread_num() == 0) {
      threads_used = omp_get_num_threads();
    }
    // NOTE(review): the asm statements have no input operands, so the zmm
    // registers are never initialized here; the square roots operate on
    // whatever values the registers happen to hold.
    for (int i = 0; i < iterations; ++i) {
      asm __volatile__ ("vsqrtpd %%zmm0, %%zmm0" : : : "zmm0");
      asm __volatile__ ("vsqrtpd %%zmm1, %%zmm1" : : : "zmm1");
      asm __volatile__ ("vsqrtpd %%zmm2, %%zmm2" : : : "zmm2");
      asm __volatile__ ("vsqrtpd %%zmm3, %%zmm3" : : : "zmm3");
      asm __volatile__ ("vsqrtpd %%zmm4, %%zmm4" : : : "zmm4");
      asm __volatile__ ("vsqrtpd %%zmm5, %%zmm5" : : : "zmm5");
      asm __volatile__ ("vsqrtpd %%zmm6, %%zmm6" : : : "zmm6");
      asm __volatile__ ("vsqrtpd %%zmm7, %%zmm7" : : : "zmm7");
    }
  }
  // Stop the clock before any console output so that I/O is not measured.
  const std::chrono::duration<double> duration =
      std::chrono::steady_clock::now() - start_time;

  std::cout << "... [AVX512F] vsqrtpd(mm512d,mm512d) completed\n";
  std::cout <<
      "https://www.officedaytime.com/simd512e/simdimg/unop_qword_3.png\n";

  // 8 instructions per trip * 8 doubles per instruction * threads.
  const int64_t ops_per_loop = int64_t{8} * 8 * threads_used;
  const int64_t total_ops = ops_per_loop * iterations;  // double sqrts
  const int64_t giga_cnt = total_ops / 8;  // number of vsqrtpd instructions
  const double giga_ops = total_ops / 1e9;  // double sqrts in units of 10^9
  const double performance = giga_ops / duration.count();

  std::cout << "-------------------------------------------\n";
  std::cout << "Execution Time: " << duration.count() << " seconds\n";

  // The user's locale is wanted only for digit grouping of the counter.
  // std::locale("") throws when the environment names a locale that is not
  // installed; the results must still be printed in that case.
  try {
    std::cout.imbue(std::locale(""));
  } catch (const std::runtime_error&) {
    std::cerr << "warning: environment locale unavailable, "
                 "printing numbers without grouping\n";
  }

  std::cout << "Counter: " << giga_cnt << "\n";
  std::cout << "Total Compute: " << giga_ops
            << " double sqrt GFLOPS (counter * 8)\n";
  std::cout << "Performance: " << performance << " GFLOPS\n";
  std::cout << "-------------------------------------------\n";
  return 0;
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The real-world OpenMP program mona-lisa100K.cpp, which computes the TSP tour length of the 100,000 cities in mona-lisa100K.tsp 500,000 times, performs 50*10^9 double sqrt operations (plus other work) and reaches 50*10^9/1.28413s = 38.94 GFLOPS, very close to the peak of 43 double sqrt GFLOPS!
C++ loop:
objdump loop: