Skip to content

Instantly share code, notes, and snippets.

@Hermann-SW
Created June 4, 2026 00:21
Show Gist options
  • Select an option

  • Save Hermann-SW/c58d77d3dca86fe56b547a01009a624f to your computer and use it in GitHub Desktop.

Select an option

Save Hermann-SW/c58d77d3dca86fe56b547a01009a624f to your computer and use it in GitHub Desktop.
gemini double sqrt benchmark demonstrating 383.6 double sqrt GFLOPS on Radeon vii GPU
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>
// Hard architectural parameters for gfx906 (Radeon VII / MI50)
#define THREADS_PER_BLOCK 256
// --- The Un-Optimizable Hardware Smasher Kernel ---
__global__ void genuine_sqrt_kernel(const double* __restrict__ d_in,
double* __restrict__ d_out,
int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
double val = d_in[idx];
// A deep, dependent instruction pipeline chain.
// Every single step depends strictly on the result of the previous step.
// The compiler has absolutely no choice but to issue 10 sequential v_sqrt_f64 commands.
val = __builtin_amdgcn_sqrt(val);
val = __builtin_amdgcn_sqrt(val + 1.0);
val = __builtin_amdgcn_sqrt(val + 2.0);
val = __builtin_amdgcn_sqrt(val + 3.0);
val = __builtin_amdgcn_sqrt(val + 4.0);
val = __builtin_amdgcn_sqrt(val + 5.0);
val = __builtin_amdgcn_sqrt(val + 6.0);
val = __builtin_amdgcn_sqrt(val + 7.0);
val = __builtin_amdgcn_sqrt(val + 8.0);
val = __builtin_amdgcn_sqrt(val + 9.0);
// Forcing a global memory write back means the loop can never be categorized as "Dead Code"
d_out[idx] = val;
}
}
int main() {
// 100 Million Elements = 1,000,000,000 (1 Billion) total double square roots executed!
const int N = 100000000;
const int SQRTS_PER_THREAD = 10;
size_t bytes = N * sizeof(double);
std::cout << "Allocating " << (bytes * 2) / (1024 * 1024) << " MB of VRAM..." << std::endl;
// Host memory pointers
std::vector<double> h_in(N);
std::vector<double> h_out(N);
// Initialize inputs with dynamic data so the compiler cannot treat inputs as zero/constants
for (int i = 0; i < N; ++i) {
h_in[i] = static_cast<double>(i) + 0.5;
}
// Device memory pointers
double *d_in, *d_out;
hipMalloc(&d_in, bytes);
hipMalloc(&d_out, bytes);
// Copy input data to Radeon VII
hipMemcpy(d_in, h_in.data(), bytes, hipMemcpyHostToDevice);
// Grid sizing parameters
int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
std::cout << "Launching Kernel across " << blocks << " thread blocks..." << std::endl;
std::cout << "--------------------------------------------------------" << std::endl;
// Create GPU timers for pristine hardware monitoring
hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
// Record hardware start timeline
hipEventRecord(start, nullptr);
// Execute the benchmark kernel
hipLaunchKernelGGL(genuine_sqrt_kernel, dim3(blocks), dim3(THREADS_PER_BLOCK), 0, nullptr, d_in, d_out, N);
// Record hardware stop timeline
hipEventRecord(stop, nullptr);
hipEventSynchronize(stop);
// Calculate execution runtime from GPU clocks
float milliseconds = 0;
hipEventElapsedTime(&milliseconds, start, stop);
double seconds = milliseconds / 1000.0;
// Fetch output data back to host to guarantee the operations are completed
hipMemcpy(h_out.data(), d_out, bytes, hipMemcpyDeviceToHost);
// Calculate actual operational metrics
double total_sqrts = static_cast<double>(N) * SQRTS_PER_THREAD;
double gflops = (total_sqrts / seconds) / 1e9;
std::cout << "Execution Completed Successfully." << std::endl;
std::cout << "Execution Time: " << seconds << " seconds" << std::endl;
std::cout << "Total Sqrt Operations: " << total_sqrts << std::endl;
std::cout << "Verification Check (Last Element): " << h_out[N-1] << std::endl;
std::cout << "\033[1;32m" << gflops << " Genuine double sqrt GFLOPS\033[0m" << std::endl;
// Clean up resources
hipEventDestroy(start);
hipEventDestroy(stop);
hipFree(d_in);
hipFree(d_out);
return 0;
}
@Hermann-SW

Hermann-SW commented Jun 4, 2026

Copy link
Copy Markdown
Author
hipcc -O3 --amdgpu-target=gfx906 benchmark_sqrt.cpp -o benchmark_sqrt

Reported value is too low ...

$ rocprofv2 -i counters.txt ./benchmark_sqrt
Allocating 1525 MB of VRAM...
ROCProfilerV2: Collecting the following counters:
- L2CacheHit
- VALUUtilization
Enabling Counter Collection
Launching Kernel across 390625 thread blocks...
--------------------------------------------------------
Execution Completed Successfully.
Execution Time: 0.00392576 seconds
Total Sqrt Operations: 1e+09
Verification Check (Last Element): 3.51285
254.728 Genuine double sqrt GFLOPS
Dispatch_ID(0), GPU_ID(1), Queue_ID(1), Process_ID(6192), Thread_ID(6192), Grid_Size(100000000), Workgroup_Size(256), LDS_Per_Workgroup(0), Scratch_Per_Workitem(0), Arch_VGPR(8), Accum_VGPR(0), SGPR(16), Wave_Size(64), Kernel_Name("genuine_sqrt_kernel(double const*, double*, int) (.kd)"), Begin_Timestamp(9883533233069), End_Timestamp(9883535839951), Correlation_ID(0), L2CacheHit(0.000735), VALUUtilization(100.000000)
$

... the profiler timestamps reveal only 9883535839951-9883533233069 = 2606882ns = 0.002606882s elapsed.
With that 383 double sqrt GFLOPS:

1000000000/(9883535839951-9883533233069) = 383.60002485728161075184

Wow:

VALUUtilization(100.000000)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment