Run Devstral 2 123b on Modal serverless
# ---
# pytest: false
# ---
# # Run OpenAI-compatible LLM inference with Devstral-2 and vLLM
# In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal.
# LLMs do more than just model language: they chat, they produce JSON and XML, they run code, and more.
# This has complicated their interface far beyond "text-in, text-out".
# OpenAI's API has emerged as a standard for that interface,
# and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/).
# This example is intended to demonstrate the basics of deploying LLM inference on Modal.
# For more on how to optimize performance, see
# [this guide](https://modal.com/docs/guide/high-performance-llm-inference)
# and check out our
# [LLM Engineer's Almanac](https://modal.com/llm-almanac).
# Our examples repository also includes scripts for running clients and load tests against OpenAI-compatible APIs
# [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible).
# ## Set up the container image
# Our first order of business is to define the environment our server will run in:
# the [container `Image`](https://modal.com/docs/guide/custom-container).
# vLLM can be installed with `uv pip`, since Modal [provides the CUDA drivers](https://modal.com/docs/guide/cuda).
import json
from typing import Any
import aiohttp
import modal
vllm_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12"
    )
    .entrypoint([])
    .apt_install("libxcb1")  # required by vLLM dependencies
    .uv_pip_install(
        "vllm==0.15.0",
        "huggingface-hub==0.36.0",
        "mistral_common>=1.8.6",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
)
# ## Download the model weights
# MODEL_NAME = "Qwen/Qwen3-4B-Thinking-2507-FP8"
# MODEL_NAME = "mistralai/Devstral-Small-2-24B-Instruct-2512"
# MODEL_REVISION = "1da725e0a5f7c2e6444035331fa04ff945e1b37a" # avoid nasty surprises when repos update!
MODEL_NAME = "mistralai/Devstral-2-123B-Instruct-2512"
MODEL_REVISION = "bbcc5a2717b82678c0b069a41c596463fbe5f665" # avoid nasty surprises when repos update!
# Although vLLM will download weights from Hugging Face on-demand,
# we want to cache them so we don't do it every time our server starts.
# We'll use [Modal Volumes](https://modal.com/docs/guide/volumes) for our cache.
# Modal Volumes are essentially a "shared disk" that all Modal Functions can read and write as if it were local.
# For more on storing model weights on Modal, see
# [this guide](https://modal.com/docs/guide/model-weights).
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
# We'll also cache some of vLLM's JIT compilation artifacts in a Modal Volume.
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
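# Although we let vLLM download the weights lazily here, you could also warm the cache ahead of time
# with a one-off Modal Function that mounts the same Volume and calls `huggingface_hub.snapshot_download`.
# The sketch below is hypothetical (a separate script with its own `download_model` Function), not part of this app:
# ```python
# import modal
#
# download_image = modal.Image.debian_slim().uv_pip_install("huggingface-hub==0.36.0")
# download_app = modal.App("example-download-devstral-weights")
# hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
#
#
# @download_app.function(
#     image=download_image,
#     volumes={"/root/.cache/huggingface": hf_cache_vol},
#     timeout=60 * 60,  # large checkpoints can take a while to transfer
# )
# def download_model():
#     from huggingface_hub import snapshot_download
#
#     # snapshot_download writes into ~/.cache/huggingface by default,
#     # which is backed by the shared Volume mounted above
#     snapshot_download(
#         "mistralai/Devstral-2-123B-Instruct-2512",
#         revision="bbcc5a2717b82678c0b069a41c596463fbe5f665",
#     )
# ```
# If you saved that as `download_weights.py`, you could run it once with `modal run download_weights.py::download_model`.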
# ## Configuring vLLM
# ### Trading off fast boots and token generation performance
# vLLM has embraced dynamic and just-in-time compilation to eke out additional performance without having to write too many custom kernels,
# e.g. via the Torch compiler and CUDA graph capture.
# These compilation features incur latency at startup in exchange for lowered latency and higher throughput during generation.
# We make this trade-off controllable with the `FAST_BOOT` variable below.
FAST_BOOT = True
# If you're running an LLM service that frequently scales from zero (frequent ["cold starts"](https://modal.com/docs/guide/cold-start)),
# then you'll want to set this to `True`.
# If your service usually has multiple replicas running, set this to `False` for improved performance.
# See the code below for details on the parameters that `FAST_BOOT` controls.
# For more on the performance you can expect when serving your own LLMs, see
# [our LLM engine performance benchmarks](https://modal.com/llm-almanac).
# ## Build a vLLM engine and serve it
# The function below spawns a vLLM instance listening at port 8000, serving requests to our model.
# We wrap it in the [`@modal.web_server` decorator](https://modal.com/docs/guide/webhooks#non-asgi-web-servers)
# to connect it to the Internet.
# The server runs in an independent process, via `subprocess.Popen`, and Modal only begins routing requests to it
# once the model is loaded and the server is accepting connections on its port.
app = modal.App("example-vllm-inference")
N_GPU = 2
MINUTES = 60 # seconds
VLLM_PORT = 8000
@app.function(
    image=vllm_image,
    # Enough for smaller models
    # gpu=f"H100:{N_GPU}",
    # Bigger models
    gpu=f"H200:{N_GPU}",
    scaledown_window=10 * MINUTES,  # how long should we stay up with no requests?
    timeout=20 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(  # how many requests can one replica handle? tune carefully!
    max_inputs=32
)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--served-model-name",
        MODEL_NAME,
        "llm",  # also expose the model under the short name "llm"
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
    ]

    # enforce-eager disables both Torch compilation and CUDA graph capture
    # default is no-enforce-eager. see the --compilation-config flag for tighter control
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]

    # assume multiple GPUs are for splitting up large matrix multiplications
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    cmd += [
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "mistral",
        # "hermes",  # use the "hermes" parser for Qwen models
        "--max-model-len",
        "262144",  # 256 * 1024 tokens
    ]

    print(*cmd)

    subprocess.Popen(" ".join(cmd), shell=True)
# ## Deploy the server
# To deploy the API on Modal, just run
# ```bash
# modal deploy vllm_inference.py
# ```
# This will create a new app on Modal, build the container image for it if it hasn't been built yet,
# and deploy the app.
# ## Interact with the server
# Once it is deployed, you'll see a URL appear in the command line,
# something like `https://your-workspace-name--example-vllm-inference-serve.modal.run`.
# You can find [interactive Swagger UI docs](https://swagger.io/tools/swagger-ui/)
# at the `/docs` route of that URL, i.e. `https://your-workspace-name--example-vllm-inference-serve.modal.run/docs`.
# These docs describe each route, indicate the expected inputs and outputs,
# and translate requests into `curl` commands.
# For simple routes like `/health`, which checks whether the server is responding,
# you can even send a request directly from the docs.
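# For example, here is a minimal liveness check from Python (swap in your own deployment URL for the placeholder):
# ```python
# import urllib.request
#
# url = "https://your-workspace-name--example-vllm-inference-serve.modal.run/health"
# with urllib.request.urlopen(url) as response:
#     print(response.status)  # 200 once the vLLM server is up
# ```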
# To interact with the API programmatically in Python, we recommend the `openai` library.
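# Here is a minimal sketch using the `openai` client; the base URL is the placeholder from above,
# and since we didn't pass `--api-key` to vLLM, any non-empty string works as the key:
# ```python
# from openai import OpenAI
#
# client = OpenAI(
#     base_url="https://your-workspace-name--example-vllm-inference-serve.modal.run/v1",
#     api_key="not-needed",  # no key is enforced by this deployment
# )
#
# response = client.chat.completions.create(
#     model="mistralai/Devstral-2-123B-Instruct-2512",  # or "llm", the alias set via --served-model-name
#     messages=[
#         {"role": "user", "content": "Write a Python function that reverses a string."}
#     ],
# )
# print(response.choices[0].message.content)
# ```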