A script to measure GPU TFLOPS using torch
import torch
import time
import platform


def get_system_info():
    props = torch.cuda.get_device_properties(0)
    return {
        "OS": f"{platform.system()} {platform.release()}",
        "CPU": platform.processor(),
        "GPU": props.name,
        "VRAM": f"{round(props.total_memory / (1024**3), 2)} GB",
        "Arch": f"Compute Cap {props.major}.{props.minor}",
        "Backend": f"Torch {torch.__version__} / {torch.version.cuda}"
    }
def benchmark_compute_cell(target_vram_gb, dtype):
    device = "cuda"
    matrix_size = 4096
    chunks, A, B = None, None, None

    # 1. Physical VRAM Allocation (Fix: differentiate between float and int)
    elem_size = torch.tensor([], dtype=dtype).element_size()
    chunk_elements = (256 * 1024 * 1024) // elem_size  # elements per 256 MiB chunk
    num_chunks = int((target_vram_gb * 1024**3) / (256 * 1024**2))
    chunks = []
    try:
        for _ in range(num_chunks):
            # FIX: Use randint for integer types, randn for floating points
            if dtype.is_floating_point:
                chunks.append(torch.randn(chunk_elements, device=device, dtype=dtype))
            else:
                # Fill with random integers in a safe range
                chunks.append(torch.randint(-10, 10, (chunk_elements,), device=device, dtype=dtype))
    except Exception:
        pass
    if not chunks:
        return 0.0

    # 2. Compute Performance Measurement
    try:
        # Prepare operands (Fix: randint for integers)
        if dtype.is_floating_point:
            A = torch.randn((matrix_size, matrix_size), device=device, dtype=dtype)
            B = torch.randn((matrix_size, matrix_size), device=device, dtype=dtype)
        else:
            A = torch.randint(-10, 10, (matrix_size, matrix_size), device=device, dtype=dtype)
            B = torch.randint(-10, 10, (matrix_size, matrix_size), device=device, dtype=dtype)
        res = None
        torch.cuda.synchronize()
        iters = 15
        start = time.perf_counter()
        for i in range(iters):
            if dtype.is_floating_point:
                if res is not None:
                    B[0, 0] += res.mean()
                res = torch.matmul(A, B)
                ops_per_iter = 2 * matrix_size**3
            else:
                if res is not None:
                    B[0, 0] += res[0, 0]
                res = (A * B) + A
                ops_per_iter = 2 * matrix_size**3 / 16  # Scaled for normalization
            # VRAM Dependency Injection: read a resident chunk back into the operand (twice per iteration)
            A += chunks[i % len(chunks)][:matrix_size * matrix_size].view(matrix_size, matrix_size)
            A += chunks[i % len(chunks)][:matrix_size * matrix_size].view(matrix_size, matrix_size)
        torch.cuda.synchronize()
        end = time.perf_counter()
        duration_sec = (end - start) / iters
        perf_score = (ops_per_iter / duration_sec) / 1e12
        return perf_score
    except Exception:
        return 0.0
    finally:
        if chunks is not None:
            del chunks
        if A is not None:
            del A
        if B is not None:
            del B
        torch.cuda.empty_cache()
# --- Execution ---
info = get_system_info()
print("=" * 80)
print("SYSTEM DIAGNOSTICS")
for k, v in info.items():
    print(f"{k:<12}: {v}")
print("=" * 80)

gpu_capacity = torch.cuda.get_device_properties(0).total_memory
available_vram_gb = gpu_capacity // (1024 ** 3)
test_steps = list(range(1, available_vram_gb + 1))
dtypes = {
    "INT8": torch.int8,
    "INT16": torch.int16,
    "INT32": torch.int32,
    "FP16": torch.float16,
    "BF16": torch.bfloat16,
    "FP32": torch.float32,
    "FP64": torch.float64
}

# Run Benchmarks
results_matrix = {}
for name, dt in dtypes.items():
    print(f"Benchmarking {name}...", end="\r")
    col_results = {}
    for gb in test_steps:
        col_results[gb] = benchmark_compute_cell(gb, dt)
    results_matrix[name] = col_results
print("All benchmarking done", end="\r")

# --- Final Table Output ---
print("\n" + "=" * 80)
print("COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better")
print("-" * 80)
header = f"{'VRAM GB':<10}"
for name in dtypes.keys():
    header += f" | {name:<12}"
print(header)
print("-" * 80)
for gb in test_steps:
    row = f"{gb:<10.1f}"
    for name in dtypes.keys():
        val = results_matrix[name].get(gb, 0.0)
        row += f" | {val:<12.4f}"
    print(row)
print("=" * 80)
@Lua12138 (Author):

The TFLOPS reported by this script will typically be significantly lower than your GPU's nominal TFLOPS; this is normal.

The main reason is that after each compute step the script reads hundreds of MB of data back from VRAM, so memory bandwidth strongly affects the measured result as well.
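
To get a feel for how large that bandwidth cost is, here is a rough sketch (not part of the script above; the 512 MB buffer size and iteration count are arbitrary choices) that estimates effective device-to-device copy bandwidth with torch:

    import time
    import torch

    def estimate_copy_bandwidth_gbps(size_mb=512, iters=20):
        n = size_mb * 1024 * 1024
        src = torch.empty(n, dtype=torch.uint8, device="cuda")
        dst = torch.empty_like(src)
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            dst.copy_(src)  # device-to-device copy
        torch.cuda.synchronize()
        elapsed = (time.perf_counter() - start) / iters
        return 2 * n / elapsed / 1e9  # factor of 2: one read plus one write per copy

    print(f"~{estimate_copy_bandwidth_gbps():.0f} GB/s effective copy bandwidth")

Comparing that number with the hundreds of MB read per iteration gives a rough idea of how much of each iteration is spent waiting on memory rather than on the matmul itself.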

I believe this is the more realistic scenario: there are very few real workloads that keep the GPU busy with pure compute without ever touching VRAM.
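
For comparison, a bare matmul loop that never reads back the resident chunks should land noticeably closer to the nominal figure; a minimal sketch assuming CUDA and FP16 (the matrix size and iteration count are arbitrary):

    import time
    import torch

    def pure_matmul_tflops(dtype=torch.float16, n=4096, iters=50):
        a = torch.randn(n, n, device="cuda", dtype=dtype)
        b = torch.randn(n, n, device="cuda", dtype=dtype)
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            c = a @ b  # compute only, no extra VRAM traffic between iterations
        torch.cuda.synchronize()
        elapsed = (time.perf_counter() - start) / iters
        return 2 * n**3 / elapsed / 1e12  # 2*n^3 FLOPs per matmul

    print(f"FP16 pure matmul: {pure_matmul_tflops():.2f} TFLOPS")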

Finally, feel free to comment here with your device's numbers; they can serve as a reference for others.

@Lua12138 (Author):

================================================================================
SYSTEM DIAGNOSTICS
OS          : Linux 6.6.113+
CPU         : x86_64
GPU         : Tesla T4
VRAM        : 14.56 GB
Arch        : Compute Cap 7.5
Backend     : Torch 2.10.0+cu128 / 12.8
================================================================================

================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB    | INT8         | INT16        | INT32        | FP16         | BF16         | FP32         | FP64        
--------------------------------------------------------------------------------
1.0        | 2.7133       | 6.0406       | 2.9837       | 23.7107      | 1.6805       | 3.4189       | 0.2415      
2.0        | 12.6171      | 6.1250       | 3.0266       | 17.5787      | 1.6406       | 3.4680       | 0.2418      
3.0        | 14.0303      | 6.2226       | 3.0294       | 18.4580      | 1.6437       | 3.3881       | 0.2418      
4.0        | 11.9880      | 6.0711       | 3.0123       | 17.9556      | 1.6492       | 3.4096       | 0.2418      
5.0        | 12.0174      | 6.0620       | 3.0008       | 17.3214      | 1.6634       | 3.3895       | 0.2418      
6.0        | 12.3675      | 6.0301       | 2.9975       | 17.0475      | 1.6799       | 3.3856       | 0.2415      
7.0        | 12.2805      | 6.0768       | 3.0047       | 17.5876      | 1.6917       | 3.3920       | 0.2416      
8.0        | 12.2435      | 6.0752       | 3.0025       | 23.3704      | 1.7222       | 3.3840       | 0.2416      
9.0        | 10.9346      | 6.0688       | 2.9992       | 21.2667      | 1.7388       | 3.4786       | 0.2417      
10.0       | 11.2239      | 6.0820       | 2.9867       | 21.8001      | 1.7918       | 3.4283       | 0.2416      
11.0       | 11.4281      | 6.0404       | 2.9991       | 23.2962      | 1.7944       | 3.4922       | 0.2417      
12.0       | 11.3640      | 6.0961       | 3.0034       | 22.3581      | 1.8077       | 3.4947       | 0.2417      
13.0       | 11.3720      | 6.0709       | 3.1963       | 21.1299      | 1.8239       | 3.5146       | 0.2419      
14.0       | 13.9182      | 6.8030       | 3.3584       | 21.0882      | 1.8286       | 3.5004       | 0.0000      
================================================================================

@Lua12138 (Author):

================================================================================
SYSTEM DIAGNOSTICS
OS          : Windows 11
CPU         : AMD64 Family 25 Model 33 Stepping 2, AuthenticAMD
GPU         : AMD Radeon RX 7900 XTX
VRAM        : 23.98 GB
Arch        : Compute Cap 11.0
Backend     : Torch 2.10.0a0+rocm7.10.0a20251120 / None
================================================================================
All benchmarking done
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB    | INT8         | INT16        | INT32        | FP16         | BF16         | FP32         | FP64
--------------------------------------------------------------------------------
1.0        | 1.3394       | 1.4338       | 2.5516       | 12.7965      | 8.4983       | 6.0509       | 1.0543
2.0        | 34.4811      | 4.5476       | 2.8527       | 48.1365      | 56.6924      | 19.1415      | 1.0249
3.0        | 45.0584      | 6.3482       | 2.4877       | 44.3068      | 58.1990      | 19.8741      | 1.0639
4.0        | 25.8562      | 6.4505       | 3.3395       | 52.3331      | 55.8807      | 20.3470      | 0.9779
5.0        | 27.3118      | 6.4253       | 3.2944       | 54.7542      | 59.4955      | 20.0175      | 1.0673
6.0        | 25.3211      | 6.4984       | 3.2779       | 55.7319      | 60.6414      | 19.7747      | 1.0582
7.0        | 26.0070      | 6.6061       | 3.1471       | 41.1660      | 57.5070      | 12.3532      | 1.0584
8.0        | 27.1570      | 6.3578       | 3.3463       | 56.3347      | 57.5408      | 20.4308      | 1.0619
9.0        | 22.6373      | 5.8721       | 3.2402       | 57.9432      | 57.6936      | 19.7200      | 1.0685
10.0       | 26.5400      | 6.4486       | 3.3798       | 57.1603      | 59.2644      | 20.4623      | 1.0661
11.0       | 25.4592      | 6.5189       | 3.3772       | 58.4769      | 59.9680      | 19.0941      | 1.0622
12.0       | 24.8455      | 6.3476       | 3.3154       | 57.3191      | 60.3427      | 20.4527      | 1.0644
13.0       | 26.1697      | 6.3580       | 3.3065       | 56.4202      | 57.0910      | 19.6925      | 1.0676
14.0       | 26.1580      | 6.3895       | 2.9189       | 57.9007      | 47.0302      | 20.4858      | 1.0636
15.0       | 19.9315      | 5.1354       | 3.4653       | 57.8984      | 61.4734      | 20.3091      | 1.0570
16.0       | 21.1530      | 6.8815       | 3.5538       | 57.1860      | 59.8424      | 19.7960      | 1.0664
17.0       | 23.4557      | 7.5250       | 3.4683       | 58.1642      | 61.3103      | 19.2114      | 1.0612
18.0       | 26.0285      | 7.2151       | 3.5146       | 46.8021      | 60.0445      | 20.8182      | 1.0697
19.0       | 24.6695      | 7.4401       | 3.4984       | 59.9399      | 60.7893      | 19.9111      | 1.0146
20.0       | 17.2417      | 7.3729       | 3.4683       | 49.9312      | 57.6158      | 20.8803      | 1.0548
21.0       | 12.8340      | 7.3103       | 3.5598       | 57.6464      | 50.1524      | 20.2402      | 1.0255
22.0       | 11.0178      | 7.2859       | 3.4966       | 57.1618      | 57.9797      | 19.9253      | 1.0221
23.0       | 11.5246      | 7.4358       | 3.5801       | 59.1279      | 61.5389      | 20.5735      | 1.0570
================================================================================

@Lua12138 (Author):

================================================================================
SYSTEM DIAGNOSTICS
OS          : Linux 6.8.0-106-generic
CPU         : x86_64
GPU         : NVIDIA A10
VRAM        : 22.06 GB
Arch        : Compute Cap 8.6
Backend     : Torch 2.11.0+cu130 / 13.0
================================================================================
All benchmarking done
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB    | INT8         | INT16        | INT32        | FP16         | BF16         | FP32         | FP64
--------------------------------------------------------------------------------
1.0        | 5.4735       | 10.3598      | 5.8467       | 13.9324      | 15.5496      | 9.9438       | 0.4150
2.0        | 16.0881      | 10.3806      | 5.8485       | 63.5509      | 71.1824      | 12.0698      | 0.4171
3.0        | 17.4021      | 10.3745      | 5.8525       | 66.9665      | 69.7174      | 12.4950      | 0.4171
4.0        | 17.4651      | 8.8665       | 5.8442       | 61.3879      | 71.1452      | 12.6691      | 0.4171
5.0        | 17.9677      | 10.3492      | 5.8419       | 68.8794      | 70.6128      | 12.3076      | 0.4171
6.0        | 16.0508      | 10.3695      | 5.8528       | 69.8288      | 72.0779      | 12.3220      | 0.4155
7.0        | 18.2522      | 10.3748      | 5.8579       | 69.5522      | 71.3750      | 12.4823      | 0.4171
8.0        | 16.8158      | 10.3611      | 5.8554       | 68.6237      | 71.0050      | 12.4840      | 0.4170
9.0        | 17.4409      | 10.3813      | 5.8580       | 68.7129      | 70.2430      | 12.7346      | 0.4051
10.0       | 17.4509      | 10.3785      | 5.8604       | 70.0807      | 72.0470      | 12.5288      | 0.4171
11.0       | 16.0735      | 10.3667      | 5.6664       | 69.8102      | 71.8628      | 12.4567      | 0.4170
12.0       | 17.4099      | 10.3536      | 5.8611       | 68.6358      | 71.9821      | 12.3800      | 0.4170
13.0       | 17.4339      | 10.3730      | 5.8552       | 69.1466      | 70.2406      | 12.4161      | 0.4170
14.0       | 18.2652      | 10.3720      | 5.8499       | 68.9871      | 71.8969      | 12.3675      | 0.4158
15.0       | 17.4301      | 10.3815      | 5.8537       | 69.4430      | 71.7875      | 5.8510       | 0.4171
16.0       | 17.4106      | 10.3597      | 5.8468       | 68.8270      | 71.9325      | 12.4432      | 0.4171
17.0       | 17.5228      | 10.3727      | 5.6733       | 69.0329      | 72.2553      | 12.4842      | 0.4171
18.0       | 17.4037      | 10.3492      | 5.6680       | 69.5787      | 70.6584      | 12.5389      | 0.4055
19.0       | 25.7002      | 10.3642      | 5.8478       | 69.6702      | 72.1023      | 12.6141      | 0.4171
20.0       | 17.4559      | 10.3525      | 5.6612       | 71.8157      | 76.1790      | 12.6965      | 0.4171
21.0       | 18.1936      | 13.3459      | 6.2544       | 69.9451      | 72.6117      | 12.7249      | 0.4124
22.0       | 0.0000       | 0.0000       | 0.0000       | 0.0000       | 75.0666      | 0.0000       | 0.0000
================================================================================
