Last active
April 28, 2026 12:19
-
-
Save Lua12138/fc4a9807b3c2f1e7b27bf60479db877a to your computer and use it in GitHub Desktop.
A script to detect GPU TFLOPS using torch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import time | |
| import platform | |
def get_system_info():
    """Return a dict of host/GPU details used for the diagnostics header.

    Queries CUDA device 0 via torch; requires a visible CUDA device.
    """
    gpu = torch.cuda.get_device_properties(0)
    vram_gb = round(gpu.total_memory / (1024 ** 3), 2)
    return {
        "OS": f"{platform.system()} {platform.release()}",
        "CPU": platform.processor(),
        "GPU": gpu.name,
        "VRAM": f"{vram_gb} GB",
        "Arch": f"Compute Cap {gpu.major}.{gpu.minor}",
        "Backend": f"Torch {torch.__version__} / {torch.version.cuda}",
    }
def benchmark_compute_cell(target_vram_gb: int, dtype: torch.dtype) -> float:
    """Fill roughly ``target_vram_gb`` GB of VRAM with 256 MB chunks of
    ``dtype``, then time a compute loop that depends on that resident data.

    Returns a TFLOPS-style score (ops/sec / 1e12), or 0.0 if no chunk could
    be allocated or the compute phase raised.
    """
    device = "cuda"
    matrix_size = 4096
    chunks, A, B = None, None, None
    # 1. Physical VRAM Allocation (Fix: differentiate between float and int)
    # Each chunk is a fixed 256 MB regardless of dtype: the element count
    # scales inversely with the per-element size.
    elem_size = torch.tensor([], dtype=dtype).element_size()
    chunk_elements = (256 * 1024 * 1024) // elem_size
    num_chunks = int((target_vram_gb * 1024**3) / (256 * 1024**2))
    chunks = []
    try:
        for _ in range(num_chunks):
            # FIX: Use randint for integer types, randn for floating points
            if dtype.is_floating_point:
                chunks.append(torch.randn(chunk_elements, device=device, dtype=dtype))
            else:
                # Fill with random integers in a safe range
                chunks.append(torch.randint(-10, 10, (chunk_elements,), device=device, dtype=dtype))
    except Exception:
        # Best-effort allocation: keep whatever chunks fit before the first
        # failure (typically CUDA OOM) and benchmark with those.
        pass
    if not chunks: return 0.0
    # 2. Compute Performance Measurement
    try:
        # Prepare operands (Fix: randint for integers)
        if dtype.is_floating_point:
            A = torch.randn((matrix_size, matrix_size), device=device, dtype=dtype)
            B = torch.randn((matrix_size, matrix_size), device=device, dtype=dtype)
        else:
            A = torch.randint(-10, 10, (matrix_size, matrix_size), device=device, dtype=dtype)
            B = torch.randint(-10, 10, (matrix_size, matrix_size), device=device, dtype=dtype)
        res = None
        torch.cuda.synchronize()
        iters = 15
        start = time.perf_counter()
        for i in range(iters):
            if dtype.is_floating_point:
                # Feed the previous result back into an operand so the
                # iterations form a dependency chain and cannot be elided.
                if res is not None:
                    B[0, 0] += res.mean()
                res = torch.matmul(A, B)
                # 2*N^3 multiply-adds per matmul.
                ops_per_iter = 2 * matrix_size**3
            else:
                if res is not None:
                    B[0, 0] += res[0, 0]
                # Elementwise multiply-add for integer dtypes (no int matmul).
                res = (A * B) + A
                ops_per_iter = 2 * matrix_size**3 / 16 # Scaled for normalization
            # VRAM Dependency Injection: read from a resident chunk each
            # iteration so the allocated VRAM actually participates.
            # NOTE(review): the next two lines are identical — looks like an
            # accidental duplication; confirm whether reading the chunk twice
            # per iteration (doubling the injected VRAM traffic) is intended.
            A += chunks[i % len(chunks)][:matrix_size*matrix_size].view(matrix_size, matrix_size)
            A += chunks[i % len(chunks)][:matrix_size*matrix_size].view(matrix_size, matrix_size)
        torch.cuda.synchronize()
        end = time.perf_counter()
        # Average wall time per iteration; score expressed in 1e12 ops/sec.
        duration_sec = (end - start) / iters
        perf_score = (ops_per_iter / duration_sec) / 1e12
        return perf_score
    except Exception:
        # Any compute-phase failure (e.g. OOM allocating operands) scores 0.0.
        return 0.0
    finally:
        # Drop references and release cached VRAM so the next cell starts clean.
        if chunks is not None: del chunks
        if A is not None: del A
        if B is not None: del B
        torch.cuda.empty_cache()
# --- Execution ---
SEP = "=" * 80

info = get_system_info()
print(SEP)
print("SYSTEM DIAGNOSTICS")
for key, value in info.items():
    print(f"{key:<12}: {value}")
print(SEP)

# Sweep VRAM targets from 1 GB up to the device's whole-GB capacity.
total_bytes = torch.cuda.get_device_properties(0).total_memory
available_vram_gb = total_bytes // (1024 ** 3)
test_steps = list(range(1, available_vram_gb + 1))

dtypes = {
    "INT8": torch.int8,
    "INT16": torch.int16,
    "INT32": torch.int32,
    "FP16": torch.float16,
    "BF16": torch.bfloat16,
    "FP32": torch.float32,
    "FP64": torch.float64,
}

# Run Benchmarks: one column per dtype, one cell per VRAM target.
results_matrix = {}
for name, dt in dtypes.items():
    print(f"Benchmarking {name}...", end="\r")
    results_matrix[name] = {gb: benchmark_compute_cell(gb, dt) for gb in test_steps}
print("All benchmarking done", end="\r")

# --- Final Table Output ---
print("\n" + SEP)
print("COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better")
print("-" * 80)
header = f"{'VRAM GB':<10}" + "".join(f" | {name:<12}" for name in dtypes)
print(header)
print("-" * 80)
for gb in test_steps:
    cells = "".join(
        f" | {results_matrix[name].get(gb, 0.0):<12.4f}" for name in dtypes
    )
    print(f"{gb:<10.1f}" + cells)
print(SEP)
Author
Author
================================================================================
SYSTEM DIAGNOSTICS
OS : Linux 6.6.113+
CPU : x86_64
GPU : Tesla T4
VRAM : 14.56 GB
Arch : Compute Cap 7.5
Backend : Torch 2.10.0+cu128 / 12.8
================================================================================
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB | INT8 | INT16 | INT32 | FP16 | BF16 | FP32 | FP64
--------------------------------------------------------------------------------
1.0 | 2.7133 | 6.0406 | 2.9837 | 23.7107 | 1.6805 | 3.4189 | 0.2415
2.0 | 12.6171 | 6.1250 | 3.0266 | 17.5787 | 1.6406 | 3.4680 | 0.2418
3.0 | 14.0303 | 6.2226 | 3.0294 | 18.4580 | 1.6437 | 3.3881 | 0.2418
4.0 | 11.9880 | 6.0711 | 3.0123 | 17.9556 | 1.6492 | 3.4096 | 0.2418
5.0 | 12.0174 | 6.0620 | 3.0008 | 17.3214 | 1.6634 | 3.3895 | 0.2418
6.0 | 12.3675 | 6.0301 | 2.9975 | 17.0475 | 1.6799 | 3.3856 | 0.2415
7.0 | 12.2805 | 6.0768 | 3.0047 | 17.5876 | 1.6917 | 3.3920 | 0.2416
8.0 | 12.2435 | 6.0752 | 3.0025 | 23.3704 | 1.7222 | 3.3840 | 0.2416
9.0 | 10.9346 | 6.0688 | 2.9992 | 21.2667 | 1.7388 | 3.4786 | 0.2417
10.0 | 11.2239 | 6.0820 | 2.9867 | 21.8001 | 1.7918 | 3.4283 | 0.2416
11.0 | 11.4281 | 6.0404 | 2.9991 | 23.2962 | 1.7944 | 3.4922 | 0.2417
12.0 | 11.3640 | 6.0961 | 3.0034 | 22.3581 | 1.8077 | 3.4947 | 0.2417
13.0 | 11.3720 | 6.0709 | 3.1963 | 21.1299 | 1.8239 | 3.5146 | 0.2419
14.0 | 13.9182 | 6.8030 | 3.3584 | 21.0882 | 1.8286 | 3.5004 | 0.0000
================================================================================
Author
================================================================================
SYSTEM DIAGNOSTICS
OS : Windows 11
CPU : AMD64 Family 25 Model 33 Stepping 2, AuthenticAMD
GPU : AMD Radeon RX 7900 XTX
VRAM : 23.98 GB
Arch : Compute Cap 11.0
Backend : Torch 2.10.0a0+rocm7.10.0a20251120 / None
================================================================================
All benchmarking done
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB | INT8 | INT16 | INT32 | FP16 | BF16 | FP32 | FP64
--------------------------------------------------------------------------------
1.0 | 1.3394 | 1.4338 | 2.5516 | 12.7965 | 8.4983 | 6.0509 | 1.0543
2.0 | 34.4811 | 4.5476 | 2.8527 | 48.1365 | 56.6924 | 19.1415 | 1.0249
3.0 | 45.0584 | 6.3482 | 2.4877 | 44.3068 | 58.1990 | 19.8741 | 1.0639
4.0 | 25.8562 | 6.4505 | 3.3395 | 52.3331 | 55.8807 | 20.3470 | 0.9779
5.0 | 27.3118 | 6.4253 | 3.2944 | 54.7542 | 59.4955 | 20.0175 | 1.0673
6.0 | 25.3211 | 6.4984 | 3.2779 | 55.7319 | 60.6414 | 19.7747 | 1.0582
7.0 | 26.0070 | 6.6061 | 3.1471 | 41.1660 | 57.5070 | 12.3532 | 1.0584
8.0 | 27.1570 | 6.3578 | 3.3463 | 56.3347 | 57.5408 | 20.4308 | 1.0619
9.0 | 22.6373 | 5.8721 | 3.2402 | 57.9432 | 57.6936 | 19.7200 | 1.0685
10.0 | 26.5400 | 6.4486 | 3.3798 | 57.1603 | 59.2644 | 20.4623 | 1.0661
11.0 | 25.4592 | 6.5189 | 3.3772 | 58.4769 | 59.9680 | 19.0941 | 1.0622
12.0 | 24.8455 | 6.3476 | 3.3154 | 57.3191 | 60.3427 | 20.4527 | 1.0644
13.0 | 26.1697 | 6.3580 | 3.3065 | 56.4202 | 57.0910 | 19.6925 | 1.0676
14.0 | 26.1580 | 6.3895 | 2.9189 | 57.9007 | 47.0302 | 20.4858 | 1.0636
15.0 | 19.9315 | 5.1354 | 3.4653 | 57.8984 | 61.4734 | 20.3091 | 1.0570
16.0 | 21.1530 | 6.8815 | 3.5538 | 57.1860 | 59.8424 | 19.7960 | 1.0664
17.0 | 23.4557 | 7.5250 | 3.4683 | 58.1642 | 61.3103 | 19.2114 | 1.0612
18.0 | 26.0285 | 7.2151 | 3.5146 | 46.8021 | 60.0445 | 20.8182 | 1.0697
19.0 | 24.6695 | 7.4401 | 3.4984 | 59.9399 | 60.7893 | 19.9111 | 1.0146
20.0 | 17.2417 | 7.3729 | 3.4683 | 49.9312 | 57.6158 | 20.8803 | 1.0548
21.0 | 12.8340 | 7.3103 | 3.5598 | 57.6464 | 50.1524 | 20.2402 | 1.0255
22.0 | 11.0178 | 7.2859 | 3.4966 | 57.1618 | 57.9797 | 19.9253 | 1.0221
23.0 | 11.5246 | 7.4358 | 3.5801 | 59.1279 | 61.5389 | 20.5735 | 1.0570
================================================================================
Author
================================================================================
SYSTEM DIAGNOSTICS
OS : Linux 6.8.0-106-generic
CPU : x86_64
GPU : NVIDIA A10
VRAM : 22.06 GB
Arch : Compute Cap 8.6
Backend : Torch 2.11.0+cu130 / 13.0
================================================================================
All benchmarking done
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB | INT8 | INT16 | INT32 | FP16 | BF16 | FP32 | FP64
--------------------------------------------------------------------------------
1.0 | 5.4735 | 10.3598 | 5.8467 | 13.9324 | 15.5496 | 9.9438 | 0.4150
2.0 | 16.0881 | 10.3806 | 5.8485 | 63.5509 | 71.1824 | 12.0698 | 0.4171
3.0 | 17.4021 | 10.3745 | 5.8525 | 66.9665 | 69.7174 | 12.4950 | 0.4171
4.0 | 17.4651 | 8.8665 | 5.8442 | 61.3879 | 71.1452 | 12.6691 | 0.4171
5.0 | 17.9677 | 10.3492 | 5.8419 | 68.8794 | 70.6128 | 12.3076 | 0.4171
6.0 | 16.0508 | 10.3695 | 5.8528 | 69.8288 | 72.0779 | 12.3220 | 0.4155
7.0 | 18.2522 | 10.3748 | 5.8579 | 69.5522 | 71.3750 | 12.4823 | 0.4171
8.0 | 16.8158 | 10.3611 | 5.8554 | 68.6237 | 71.0050 | 12.4840 | 0.4170
9.0 | 17.4409 | 10.3813 | 5.8580 | 68.7129 | 70.2430 | 12.7346 | 0.4051
10.0 | 17.4509 | 10.3785 | 5.8604 | 70.0807 | 72.0470 | 12.5288 | 0.4171
11.0 | 16.0735 | 10.3667 | 5.6664 | 69.8102 | 71.8628 | 12.4567 | 0.4170
12.0 | 17.4099 | 10.3536 | 5.8611 | 68.6358 | 71.9821 | 12.3800 | 0.4170
13.0 | 17.4339 | 10.3730 | 5.8552 | 69.1466 | 70.2406 | 12.4161 | 0.4170
14.0 | 18.2652 | 10.3720 | 5.8499 | 68.9871 | 71.8969 | 12.3675 | 0.4158
15.0 | 17.4301 | 10.3815 | 5.8537 | 69.4430 | 71.7875 | 5.8510 | 0.4171
16.0 | 17.4106 | 10.3597 | 5.8468 | 68.8270 | 71.9325 | 12.4432 | 0.4171
17.0 | 17.5228 | 10.3727 | 5.6733 | 69.0329 | 72.2553 | 12.4842 | 0.4171
18.0 | 17.4037 | 10.3492 | 5.6680 | 69.5787 | 70.6584 | 12.5389 | 0.4055
19.0 | 25.7002 | 10.3642 | 5.8478 | 69.6702 | 72.1023 | 12.6141 | 0.4171
20.0 | 17.4559 | 10.3525 | 5.6612 | 71.8157 | 76.1790 | 12.6965 | 0.4171
21.0 | 18.1936 | 13.3459 | 6.2544 | 69.9451 | 72.6117 | 12.7249 | 0.4124
22.0 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 75.0666 | 0.0000 | 0.0000
================================================================================
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The TFLOPS calculated by this script will typically be significantly lower than the nominal TFLOPS of your GPU, which is normal.
The main reason for this is that after each computation in the script, it reads hundreds of MB of data from VRAM. Therefore, the memory bandwidth also greatly affects the calculated result.
I believe this is a more realistic scenario, since GPUs are rarely used for sustained pure-compute workloads that never touch VRAM.
Finally, feel free to comment here with the performance of your device, which can provide a reference for others.