Created
December 9, 2025 16:19
-
-
Save tin2tin/89f2b2f39b309a525e371faf1bc27897 to your computer and use it in GitHub Desktop.
WAN 2.2 TI2V 5B
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import torch | |
| import gc | |
| from diffusers import WanPipeline, WanTransformer3DModel | |
| from diffusers.utils import export_to_video | |
| from transformers import BitsAndBytesConfig | |
# ==========================================
# 1. SETTINGS
# ==========================================
# Path the finished clip is written to.
OUTPUT_FILENAME = "C:/Tmp/5b_t2v_output.mp4"

# Frame size in pixels. 960x544 is the active choice; 1280x704 is heavier,
# so drop back to 960x544 if the larger size runs out of memory.
HEIGHT = 544
WIDTH = 960
# HEIGHT = 704
# WIDTH = 1280

NUM_FRAMES = 121  # passed to the pipeline as num_frames
STEPS = 50  # passed to the pipeline as num_inference_steps
GUIDANCE = 5.0  # passed to the pipeline as guidance_scale

PROMPT = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
# Chinese-language list of traits to steer away from (garish tones,
# overexposure, static frames, blurry detail, subtitles, low quality, JPEG
# artifacts, malformed hands/faces/limbs, cluttered background, ...).
# This is a runtime string: keep it byte-for-byte.
NEGATIVE_PROMPT = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
# ==========================================
# 2. OPTIMIZED SETUP
# ==========================================
print("--- Initializing & Clearing Memory ---")
# Collect unreachable Python objects first, then ask PyTorch to release its
# cached CUDA memory, so loading starts with as much free memory as possible.
gc.collect()
torch.cuda.empty_cache()

# Hugging Face repository every model component is loaded from.
MODEL_ID = "Wan-AI/Wan2.2-TI2V-5B-Diffusers"

# 4-bit NF4 quantization settings for the transformer, computing in bfloat16.
# The original author estimates ~3.5GB of VRAM for the quantized 5B model.
# NOTE(review): BitsAndBytesConfig is imported from `transformers` here but is
# handed to a diffusers model; the diffusers docs use
# `diffusers.BitsAndBytesConfig` for diffusers models -- confirm which class
# your installed versions expect.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# ==========================================
# 3. LOAD MODELS
# ==========================================
print("--- Loading Models (4-bit Quantized) ---")

# Load only the transformer with the 4-bit config to save VRAM.
transformer = WanTransformer3DModel.from_pretrained(
    MODEL_ID,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
)

# Build the pipeline around the pre-quantized transformer. The remaining
# components (VAE, T5 text encoder) are loaded by the pipeline itself in
# bfloat16. The author notes the T5 encoder is large (~10GB), which is why
# the script relies on CPU offload afterwards.
pipe = WanPipeline.from_pretrained(
    MODEL_ID,
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
# ==========================================
# 4. MEMORY OPTIMIZATIONS (CRITICAL)
# ==========================================
# Keep each model on the CPU until it is needed, moving it to the GPU only
# while it runs. Intended order: Text Encoder -> Transformer -> VAE.
pipe.enable_model_cpu_offload()

# Enable sliced VAE decoding to lower peak memory during the decode step.
# NOTE(review): the original comment said this decodes one frame at a time;
# the diffusers docs describe slicing as splitting the input batch into
# slices, so with a single prompt the saving may be small -- confirm against
# AutoencoderKLWan before relying on it.
pipe.vae.enable_slicing()

# Only enable tiling if you still run out of memory (the author reports it
# sometimes creates grid artifacts).
# pipe.vae.enable_tiling()
# ==========================================
# 5. GENERATE
# ==========================================
print(f"--- Generating {NUM_FRAMES} frames at {WIDTH}x{HEIGHT} ---")
# Release cached CUDA memory once more right before the heavy step.
torch.cuda.empty_cache()

# Fixed seed so repeated runs start from the same random state.
generator = torch.Generator(device="cuda").manual_seed(42)

# Run the pipeline and keep the first (only) video of the returned batch.
output = pipe(
    prompt=PROMPT,
    negative_prompt=NEGATIVE_PROMPT,
    height=HEIGHT,
    width=WIDTH,
    num_frames=NUM_FRAMES,
    guidance_scale=GUIDANCE,
    num_inference_steps=STEPS,
    generator=generator,
).frames[0]
# ==========================================
# 6. SAVE
# ==========================================
print("--- Saving Video ---")
# Make sure the destination folder exists so a missing directory cannot
# throw away the result of a long generation run.
output_dir = os.path.dirname(OUTPUT_FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Encode the generated frames to a video file at 24 frames per second.
export_to_video(output, OUTPUT_FILENAME, fps=24)
print(f"Done! Saved to {OUTPUT_FILENAME}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment