Last active
April 28, 2026 12:19
-
-
Save Lua12138/fc4a9807b3c2f1e7b27bf60479db877a to your computer and use it in GitHub Desktop.
A script to detect GPU TFLOPS using torch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import time | |
| import platform | |
def get_system_info():
    """Return a dict of host/GPU details used for the diagnostics header.

    Queries CUDA device 0 via torch; requires a visible CUDA device.
    """
    gpu = torch.cuda.get_device_properties(0)
    vram_gb = round(gpu.total_memory / (1024 ** 3), 2)
    return {
        "OS": f"{platform.system()} {platform.release()}",
        "CPU": platform.processor(),
        "GPU": gpu.name,
        "VRAM": f"{vram_gb} GB",
        "Arch": f"Compute Cap {gpu.major}.{gpu.minor}",
        "Backend": f"Torch {torch.__version__} / {torch.version.cuda}",
    }
def benchmark_compute_cell(target_vram_gb: int, dtype: torch.dtype) -> float:
    """Fill roughly ``target_vram_gb`` GB of VRAM with 256 MB chunks of
    ``dtype``, then time a compute loop that depends on that resident data.

    Returns a TFLOPS-style score (ops/sec / 1e12), or 0.0 if no chunk could
    be allocated or the compute phase raised.
    """
    device = "cuda"
    matrix_size = 4096
    chunks, A, B = None, None, None
    # 1. Physical VRAM Allocation (Fix: differentiate between float and int)
    # Each chunk is a fixed 256 MB regardless of dtype: the element count
    # scales inversely with the per-element size.
    elem_size = torch.tensor([], dtype=dtype).element_size()
    chunk_elements = (256 * 1024 * 1024) // elem_size
    num_chunks = int((target_vram_gb * 1024**3) / (256 * 1024**2))
    chunks = []
    try:
        for _ in range(num_chunks):
            # FIX: Use randint for integer types, randn for floating points
            if dtype.is_floating_point:
                chunks.append(torch.randn(chunk_elements, device=device, dtype=dtype))
            else:
                # Fill with random integers in a safe range
                chunks.append(torch.randint(-10, 10, (chunk_elements,), device=device, dtype=dtype))
    except Exception:
        # Best-effort allocation: keep whatever chunks fit before the first
        # failure (typically CUDA OOM) and benchmark with those.
        pass
    if not chunks: return 0.0
    # 2. Compute Performance Measurement
    try:
        # Prepare operands (Fix: randint for integers)
        if dtype.is_floating_point:
            A = torch.randn((matrix_size, matrix_size), device=device, dtype=dtype)
            B = torch.randn((matrix_size, matrix_size), device=device, dtype=dtype)
        else:
            A = torch.randint(-10, 10, (matrix_size, matrix_size), device=device, dtype=dtype)
            B = torch.randint(-10, 10, (matrix_size, matrix_size), device=device, dtype=dtype)
        res = None
        torch.cuda.synchronize()
        iters = 15
        start = time.perf_counter()
        for i in range(iters):
            if dtype.is_floating_point:
                # Feed the previous result back into an operand so the
                # iterations form a dependency chain and cannot be elided.
                if res is not None:
                    B[0, 0] += res.mean()
                res = torch.matmul(A, B)
                # 2*N^3 multiply-adds per matmul.
                ops_per_iter = 2 * matrix_size**3
            else:
                if res is not None:
                    B[0, 0] += res[0, 0]
                # Elementwise multiply-add for integer dtypes (no int matmul).
                res = (A * B) + A
                ops_per_iter = 2 * matrix_size**3 / 16 # Scaled for normalization
            # VRAM Dependency Injection: read from a resident chunk each
            # iteration so the allocated VRAM actually participates.
            # NOTE(review): the next two lines are identical — looks like an
            # accidental duplication; confirm whether reading the chunk twice
            # per iteration (doubling the injected VRAM traffic) is intended.
            A += chunks[i % len(chunks)][:matrix_size*matrix_size].view(matrix_size, matrix_size)
            A += chunks[i % len(chunks)][:matrix_size*matrix_size].view(matrix_size, matrix_size)
        torch.cuda.synchronize()
        end = time.perf_counter()
        # Average wall time per iteration; score expressed in 1e12 ops/sec.
        duration_sec = (end - start) / iters
        perf_score = (ops_per_iter / duration_sec) / 1e12
        return perf_score
    except Exception:
        # Any compute-phase failure (e.g. OOM allocating operands) scores 0.0.
        return 0.0
    finally:
        # Drop references and release cached VRAM so the next cell starts clean.
        if chunks is not None: del chunks
        if A is not None: del A
        if B is not None: del B
        torch.cuda.empty_cache()
# --- Execution ---
SEP = "=" * 80

info = get_system_info()
print(SEP)
print("SYSTEM DIAGNOSTICS")
for key, value in info.items():
    print(f"{key:<12}: {value}")
print(SEP)

# Sweep VRAM targets from 1 GB up to the device's whole-GB capacity.
total_bytes = torch.cuda.get_device_properties(0).total_memory
available_vram_gb = total_bytes // (1024 ** 3)
test_steps = list(range(1, available_vram_gb + 1))

dtypes = {
    "INT8": torch.int8,
    "INT16": torch.int16,
    "INT32": torch.int32,
    "FP16": torch.float16,
    "BF16": torch.bfloat16,
    "FP32": torch.float32,
    "FP64": torch.float64,
}

# Run Benchmarks: one column per dtype, one cell per VRAM target.
results_matrix = {}
for name, dt in dtypes.items():
    print(f"Benchmarking {name}...", end="\r")
    results_matrix[name] = {gb: benchmark_compute_cell(gb, dt) for gb in test_steps}
print("All benchmarking done", end="\r")

# --- Final Table Output ---
print("\n" + SEP)
print("COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better")
print("-" * 80)
header = f"{'VRAM GB':<10}" + "".join(f" | {name:<12}" for name in dtypes)
print(header)
print("-" * 80)
for gb in test_steps:
    cells = "".join(
        f" | {results_matrix[name].get(gb, 0.0):<12.4f}" for name in dtypes
    )
    print(f"{gb:<10.1f}" + cells)
print(SEP)
Author
Author
================================================================================
SYSTEM DIAGNOSTICS
OS : Linux 6.6.113+
CPU : x86_64
GPU : Tesla T4
VRAM : 14.56 GB
Arch : Compute Cap 7.5
Backend : Torch 2.10.0+cu128 / 12.8
================================================================================
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB | INT8 | INT16 | INT32 | FP16 | BF16 | FP32 | FP64
--------------------------------------------------------------------------------
1.0 | 2.7133 | 6.0406 | 2.9837 | 23.7107 | 1.6805 | 3.4189 | 0.2415
2.0 | 12.6171 | 6.1250 | 3.0266 | 17.5787 | 1.6406 | 3.4680 | 0.2418
3.0 | 14.0303 | 6.2226 | 3.0294 | 18.4580 | 1.6437 | 3.3881 | 0.2418
4.0 | 11.9880 | 6.0711 | 3.0123 | 17.9556 | 1.6492 | 3.4096 | 0.2418
5.0 | 12.0174 | 6.0620 | 3.0008 | 17.3214 | 1.6634 | 3.3895 | 0.2418
6.0 | 12.3675 | 6.0301 | 2.9975 | 17.0475 | 1.6799 | 3.3856 | 0.2415
7.0 | 12.2805 | 6.0768 | 3.0047 | 17.5876 | 1.6917 | 3.3920 | 0.2416
8.0 | 12.2435 | 6.0752 | 3.0025 | 23.3704 | 1.7222 | 3.3840 | 0.2416
9.0 | 10.9346 | 6.0688 | 2.9992 | 21.2667 | 1.7388 | 3.4786 | 0.2417
10.0 | 11.2239 | 6.0820 | 2.9867 | 21.8001 | 1.7918 | 3.4283 | 0.2416
11.0 | 11.4281 | 6.0404 | 2.9991 | 23.2962 | 1.7944 | 3.4922 | 0.2417
12.0 | 11.3640 | 6.0961 | 3.0034 | 22.3581 | 1.8077 | 3.4947 | 0.2417
13.0 | 11.3720 | 6.0709 | 3.1963 | 21.1299 | 1.8239 | 3.5146 | 0.2419
14.0 | 13.9182 | 6.8030 | 3.3584 | 21.0882 | 1.8286 | 3.5004 | 0.0000
================================================================================
Author
================================================================================
SYSTEM DIAGNOSTICS
OS : Windows 11
CPU : AMD64 Family 25 Model 33 Stepping 2, AuthenticAMD
GPU : AMD Radeon RX 7900 XTX
VRAM : 23.98 GB
Arch : Compute Cap 11.0
Backend : Torch 2.10.0a0+rocm7.10.0a20251120 / None
================================================================================
All benchmarking done
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB | INT8 | INT16 | INT32 | FP16 | BF16 | FP32 | FP64
--------------------------------------------------------------------------------
1.0 | 1.3394 | 1.4338 | 2.5516 | 12.7965 | 8.4983 | 6.0509 | 1.0543
2.0 | 34.4811 | 4.5476 | 2.8527 | 48.1365 | 56.6924 | 19.1415 | 1.0249
3.0 | 45.0584 | 6.3482 | 2.4877 | 44.3068 | 58.1990 | 19.8741 | 1.0639
4.0 | 25.8562 | 6.4505 | 3.3395 | 52.3331 | 55.8807 | 20.3470 | 0.9779
5.0 | 27.3118 | 6.4253 | 3.2944 | 54.7542 | 59.4955 | 20.0175 | 1.0673
6.0 | 25.3211 | 6.4984 | 3.2779 | 55.7319 | 60.6414 | 19.7747 | 1.0582
7.0 | 26.0070 | 6.6061 | 3.1471 | 41.1660 | 57.5070 | 12.3532 | 1.0584
8.0 | 27.1570 | 6.3578 | 3.3463 | 56.3347 | 57.5408 | 20.4308 | 1.0619
9.0 | 22.6373 | 5.8721 | 3.2402 | 57.9432 | 57.6936 | 19.7200 | 1.0685
10.0 | 26.5400 | 6.4486 | 3.3798 | 57.1603 | 59.2644 | 20.4623 | 1.0661
11.0 | 25.4592 | 6.5189 | 3.3772 | 58.4769 | 59.9680 | 19.0941 | 1.0622
12.0 | 24.8455 | 6.3476 | 3.3154 | 57.3191 | 60.3427 | 20.4527 | 1.0644
13.0 | 26.1697 | 6.3580 | 3.3065 | 56.4202 | 57.0910 | 19.6925 | 1.0676
14.0 | 26.1580 | 6.3895 | 2.9189 | 57.9007 | 47.0302 | 20.4858 | 1.0636
15.0 | 19.9315 | 5.1354 | 3.4653 | 57.8984 | 61.4734 | 20.3091 | 1.0570
16.0 | 21.1530 | 6.8815 | 3.5538 | 57.1860 | 59.8424 | 19.7960 | 1.0664
17.0 | 23.4557 | 7.5250 | 3.4683 | 58.1642 | 61.3103 | 19.2114 | 1.0612
18.0 | 26.0285 | 7.2151 | 3.5146 | 46.8021 | 60.0445 | 20.8182 | 1.0697
19.0 | 24.6695 | 7.4401 | 3.4984 | 59.9399 | 60.7893 | 19.9111 | 1.0146
20.0 | 17.2417 | 7.3729 | 3.4683 | 49.9312 | 57.6158 | 20.8803 | 1.0548
21.0 | 12.8340 | 7.3103 | 3.5598 | 57.6464 | 50.1524 | 20.2402 | 1.0255
22.0 | 11.0178 | 7.2859 | 3.4966 | 57.1618 | 57.9797 | 19.9253 | 1.0221
23.0 | 11.5246 | 7.4358 | 3.5801 | 59.1279 | 61.5389 | 20.5735 | 1.0570
================================================================================
Author
================================================================================
SYSTEM DIAGNOSTICS
OS : Linux 6.8.0-106-generic
CPU : x86_64
GPU : NVIDIA A10
VRAM : 22.06 GB
Arch : Compute Cap 8.6
Backend : Torch 2.11.0+cu130 / 13.0
================================================================================
All benchmarking done
================================================================================
COMPUTE PERFORMANCE REPORT (TFLOPS) - Higher is Better
--------------------------------------------------------------------------------
VRAM GB | INT8 | INT16 | INT32 | FP16 | BF16 | FP32 | FP64
--------------------------------------------------------------------------------
1.0 | 5.4735 | 10.3598 | 5.8467 | 13.9324 | 15.5496 | 9.9438 | 0.4150
2.0 | 16.0881 | 10.3806 | 5.8485 | 63.5509 | 71.1824 | 12.0698 | 0.4171
3.0 | 17.4021 | 10.3745 | 5.8525 | 66.9665 | 69.7174 | 12.4950 | 0.4171
4.0 | 17.4651 | 8.8665 | 5.8442 | 61.3879 | 71.1452 | 12.6691 | 0.4171
5.0 | 17.9677 | 10.3492 | 5.8419 | 68.8794 | 70.6128 | 12.3076 | 0.4171
6.0 | 16.0508 | 10.3695 | 5.8528 | 69.8288 | 72.0779 | 12.3220 | 0.4155
7.0 | 18.2522 | 10.3748 | 5.8579 | 69.5522 | 71.3750 | 12.4823 | 0.4171
8.0 | 16.8158 | 10.3611 | 5.8554 | 68.6237 | 71.0050 | 12.4840 | 0.4170
9.0 | 17.4409 | 10.3813 | 5.8580 | 68.7129 | 70.2430 | 12.7346 | 0.4051
10.0 | 17.4509 | 10.3785 | 5.8604 | 70.0807 | 72.0470 | 12.5288 | 0.4171
11.0 | 16.0735 | 10.3667 | 5.6664 | 69.8102 | 71.8628 | 12.4567 | 0.4170
12.0 | 17.4099 | 10.3536 | 5.8611 | 68.6358 | 71.9821 | 12.3800 | 0.4170
13.0 | 17.4339 | 10.3730 | 5.8552 | 69.1466 | 70.2406 | 12.4161 | 0.4170
14.0 | 18.2652 | 10.3720 | 5.8499 | 68.9871 | 71.8969 | 12.3675 | 0.4158
15.0 | 17.4301 | 10.3815 | 5.8537 | 69.4430 | 71.7875 | 5.8510 | 0.4171
16.0 | 17.4106 | 10.3597 | 5.8468 | 68.8270 | 71.9325 | 12.4432 | 0.4171
17.0 | 17.5228 | 10.3727 | 5.6733 | 69.0329 | 72.2553 | 12.4842 | 0.4171
18.0 | 17.4037 | 10.3492 | 5.6680 | 69.5787 | 70.6584 | 12.5389 | 0.4055
19.0 | 25.7002 | 10.3642 | 5.8478 | 69.6702 | 72.1023 | 12.6141 | 0.4171
20.0 | 17.4559 | 10.3525 | 5.6612 | 71.8157 | 76.1790 | 12.6965 | 0.4171
21.0 | 18.1936 | 13.3459 | 6.2544 | 69.9451 | 72.6117 | 12.7249 | 0.4124
22.0 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 75.0666 | 0.0000 | 0.0000
================================================================================
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The TFLOPS calculated by this script will typically be significantly lower than the nominal TFLOPS of your GPU, which is normal.
The main reason for this is that after each computation in the script, it reads hundreds of MB of data from VRAM. Therefore, the memory bandwidth also greatly affects the calculated result.
I believe this is a more realistic scenario, since GPUs are rarely used for sustained pure-compute workloads that never touch VRAM.
Finally, feel free to comment here with the performance of your device, which can provide a reference for others.