WAN 2.2 TI2V 5B
import os
import torch
import gc
from diffusers import BitsAndBytesConfig, WanPipeline, WanTransformer3DModel
from diffusers.utils import export_to_video
# Note: BitsAndBytesConfig is imported from diffusers (not transformers) because
# it quantizes a diffusers model class (WanTransformer3DModel). It requires the
# bitsandbytes package to be installed.
# ==========================================
# 1. SETTINGS
# ==========================================
OUTPUT_FILENAME = "C:/Tmp/5b_t2v_output.mp4"
# 1280x704 is heavy. If you run out of memory, try 960x544 first.
HEIGHT = 544
WIDTH = 960
#HEIGHT = 704
#WIDTH = 1280
NUM_FRAMES = 121
STEPS = 50
GUIDANCE = 5.0
PROMPT = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
# The negative prompt below is Chinese; rough English translation: "oversaturated
# colors, overexposed, static, blurry details, subtitles, style, artwork, painting,
# still image, overall grayish, worst quality, low quality, JPEG compression artifacts,
# ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn face, deformed,
# disfigured, malformed limbs, fused fingers, motionless frame, cluttered background,
# three legs, many people in the background, walking backwards."
NEGATIVE_PROMPT = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
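# Added convenience (not in the original gist): create the output folder if it
# does not exist yet, so export_to_video() does not fail on a missing directory.
if os.path.dirname(OUTPUT_FILENAME):
    os.makedirs(os.path.dirname(OUTPUT_FILENAME), exist_ok=True)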
# ==========================================
# 2. OPTIMIZED SETUP
# ==========================================
print("--- Initializing & Clearing Memory ---")
gc.collect()
torch.cuda.empty_cache()
MODEL_ID = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"
# 4-Bit Config: Compresses the 5B model to ~3.5GB VRAM
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
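# Optional sketch (not in the original gist, untested assumption): the T5 text
# encoder is a transformers model (UMT5EncoderModel), so it can be quantized with
# the transformers version of BitsAndBytesConfig and passed to the pipeline below
# via text_encoder=... This would shave several GB off the T5 footprint at a small
# quality cost.
# from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
# from transformers import UMT5EncoderModel
# te_nf4_config = TransformersBitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# text_encoder = UMT5EncoderModel.from_pretrained(
#     MODEL_ID,
#     subfolder="text_encoder",
#     quantization_config=te_nf4_config,
#     torch_dtype=torch.bfloat16,
# )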
# ==========================================
# 3. LOAD MODELS
# ==========================================
print("--- Loading Models (4-bit Quantized) ---")
# Load the Transformer in 4-bit to save VRAM
transformer = WanTransformer3DModel.from_pretrained(
    MODEL_ID,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
)
# Load the Pipeline
# Note: We let the pipeline load the VAE and Text Encoder (T5) automatically.
# T5 is large (~10GB), so we rely on cpu_offload to handle it.
pipe = WanPipeline.from_pretrained(
    MODEL_ID,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
# ==========================================
# 4. MEMORY OPTIMIZATIONS (CRITICAL)
# ==========================================
# This moves models to CPU when not in use.
# Sequence: Text Encoder (GPU) -> Transformer (GPU) -> VAE (GPU)
pipe.enable_model_cpu_offload()
# Enable VAE slicing so the 121 frames are decoded in smaller slices
# rather than all at once, which keeps the decode step from running out of memory.
pipe.vae.enable_slicing()
# Only enable tiling if you still run out of memory (it can introduce grid artifacts).
# pipe.vae.enable_tiling()
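# Optional check (not in the original gist): print how much VRAM is currently
# allocated after the offload hooks are installed, to help tune the settings above.
if torch.cuda.is_available():
    print(f"VRAM allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")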
# ==========================================
# 5. GENERATE
# ==========================================
print(f"--- Generating {NUM_FRAMES} frames at {WIDTH}x{HEIGHT} ---")
torch.cuda.empty_cache()
# Optional: Set a seed for reproducibility
generator = torch.Generator(device="cuda").manual_seed(42)
output = pipe(
    prompt=PROMPT,
    negative_prompt=NEGATIVE_PROMPT,
    height=HEIGHT,
    width=WIDTH,
    num_frames=NUM_FRAMES,
    guidance_scale=GUIDANCE,
    num_inference_steps=STEPS,
    generator=generator,
).frames[0]
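# Optional report (not in the original gist): peak VRAM use for this run, which
# is handy when experimenting with HEIGHT, WIDTH, or NUM_FRAMES.
if torch.cuda.is_available():
    print(f"Peak VRAM: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")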
# ==========================================
# 6. SAVE
# ==========================================
print("--- Saving Video ---")
export_to_video(output, OUTPUT_FILENAME, fps=24)
print(f"Done! Saved to {OUTPUT_FILENAME}")