Run Devstral 2 123b on Modal serverless
# ---
# pytest: false
# ---

# # Run OpenAI-compatible LLM inference with Devstral-2 and vLLM

# In this example, we show how to run a vLLM server in OpenAI-compatible mode on Modal.

# LLMs do more than just model language: they chat, they produce JSON and XML, they run code, and more.
# This has complicated their interface far beyond "text-in, text-out".
# OpenAI's API has emerged as a standard for that interface,
# and it is supported by open source LLM serving frameworks like [vLLM](https://docs.vllm.ai/en/latest/).

# This example is intended to demonstrate the basics of deploying LLM inference on Modal.
# For more on how to optimize performance, see
# [this guide](https://modal.com/docs/guide/high-performance-llm-inference)
# and check out our
# [LLM Engineer's Almanac](https://modal.com/llm-almanac).

# Our examples repository also includes scripts for running clients and load-testing for OpenAI-compatible APIs
# [here](https://github.com/modal-labs/modal-examples/tree/main/06_gpu_and_ml/llm-serving/openai_compatible).

# ## Set up the container image

# Our first order of business is to define the environment our server will run in:
# the [container `Image`](https://modal.com/docs/guide/custom-container).
# vLLM can be installed with `uv pip`, since Modal [provides the CUDA drivers](https://modal.com/docs/guide/cuda).

import json
from typing import Any

import aiohttp
import modal

vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .apt_install("libxcb1")  # required by vLLM dependencies
    .uv_pip_install(
        "vllm==0.15.0",
        "huggingface-hub==0.36.0",
        "mistral_common>=1.8.6",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
)

# ## Download the model weights

# MODEL_NAME = "Qwen/Qwen3-4B-Thinking-2507-FP8"
# MODEL_NAME = "mistralai/Devstral-Small-2-24B-Instruct-2512"
# MODEL_REVISION = "1da725e0a5f7c2e6444035331fa04ff945e1b37a"  # avoid nasty surprises when repos update!
MODEL_NAME = "mistralai/Devstral-2-123B-Instruct-2512"
MODEL_REVISION = "bbcc5a2717b82678c0b069a41c596463fbe5f665"  # avoid nasty surprises when repos update!

# Although vLLM will download weights from Hugging Face on-demand,
# we want to cache them so we don't do it every time our server starts.
# We'll use [Modal Volumes](https://modal.com/docs/guide/volumes) for our cache.
# Modal Volumes are essentially a shared "disk" that all Modal Functions can read and write
# as if it were a regular local disk. For more on storing model weights on Modal, see
# [this guide](https://modal.com/docs/guide/model-weights).
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)

# We'll also cache some of vLLM's JIT compilation artifacts in a Modal Volume.

vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
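# If you want to warm the weights cache before the first cold start, one option is a small,
# separate Modal Function that runs `huggingface_hub.snapshot_download` against the same
# Volume. The sketch below is not part of this deployment: the `download_model` name, the
# slim image, and the one-hour timeout are arbitrary choices, and it relies on the `app`
# object defined further down in this file.

# ```python
# download_image = modal.Image.debian_slim(python_version="3.12").uv_pip_install(
#     "huggingface-hub==0.36.0"
# )
#
#
# @app.function(
#     image=download_image,
#     volumes={"/root/.cache/huggingface": hf_cache_vol},
#     timeout=60 * 60,  # give the large download up to an hour
# )
# def download_model():
#     from huggingface_hub import snapshot_download
#
#     # writes into the mounted Volume at the default Hugging Face cache location
#     snapshot_download(MODEL_NAME, revision=MODEL_REVISION)
# ```

# With a function like that in place, `modal run vllm_inference.py::download_model`
# would populate the Volume once, so the server's first boot reads weights from the Volume
# rather than from Hugging Face.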
# ## Configuring vLLM

# ### Trading off fast boots and token generation performance

# vLLM has embraced dynamic and just-in-time compilation to eke out additional performance without having to write too many custom kernels,
# e.g. via the Torch compiler and CUDA graph capture.
# These compilation features incur latency at startup in exchange for lowered latency and higher throughput during generation.
# We make this trade-off controllable with the `FAST_BOOT` variable below.

FAST_BOOT = True
# If your LLM service frequently scales from zero (frequent ["cold starts"](https://modal.com/docs/guide/cold-start)),
# you'll want to set this to `True`.
# If your service usually has multiple replicas running, set it to `False` for improved generation performance.
# See the code below for details on the parameters that `FAST_BOOT` controls.

# For more on the performance you can expect when serving your own LLMs, see
# [our LLM engine performance benchmarks](https://modal.com/llm-almanac).
# ## Build a vLLM engine and serve it

# The function below spawns a vLLM instance listening at port 8000, serving requests to our model.
# We wrap it in the [`@modal.web_server` decorator](https://modal.com/docs/guide/webhooks#non-asgi-web-servers)
# to connect it to the Internet.
# The vLLM server runs in an independent process, via `subprocess.Popen`; Modal only starts
# routing requests to it once the model is spun up and the server is listening on the port
# (it waits up to `startup_timeout` for that to happen).
app = modal.App("example-vllm-inference")

N_GPU = 2
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    # Enough for smaller models
    # gpu=f"H100:{N_GPU}",
    # Bigger models
    gpu=f"H200:{N_GPU}",
    scaledown_window=10 * MINUTES,  # how long should we stay up with no requests?
    timeout=20 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(  # how many requests can one replica handle? tune carefully!
    max_inputs=32
)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
def serve():
    import subprocess
    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--served-model-name",
        MODEL_NAME,
        "llm",  # also serve the model under this short alias
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
    ]
    # enforce-eager disables both Torch compilation and CUDA graph capture.
    # The default is no-enforce-eager; see the --compilation-config flag for tighter control.
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]

    # assume multiple GPUs are for splitting up large matrix multiplications
    cmd += ["--tensor-parallel-size", str(N_GPU)]

    cmd += [
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "mistral",  # use "hermes" instead for Qwen models
        "--max-model-len",
        "262144",
    ]
    print(*cmd)

    subprocess.Popen(" ".join(cmd), shell=True)
# ## Deploy the server

# To deploy the API on Modal, just run

# ```bash
# modal deploy vllm_inference.py
# ```

# This will create a new app on Modal, build the container image for it if it hasn't been built yet,
# and deploy the app.

# ## Interact with the server

# Once it is deployed, you'll see a URL appear in the command line,
# something like `https://your-workspace-name--example-vllm-inference-serve.modal.run`.

# You can find [interactive Swagger UI docs](https://swagger.io/tools/swagger-ui/)
# at the `/docs` route of that URL, i.e. `https://your-workspace-name--example-vllm-inference-serve.modal.run/docs`.
# These docs describe each route, indicate the expected inputs and outputs,
# and translate requests into `curl` commands.
# For simple routes like `/health`, which checks whether the server is responding,
# you can even send a request directly from the docs.

# To interact with the API programmatically in Python, we recommend the `openai` library.
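# A minimal sketch of such a client is below. The URL is a placeholder for your own
# deployment's URL, and the dummy API key assumes you haven't put authentication in
# front of the endpoint (vLLM only checks keys if you pass it `--api-key`).

# ```python
# from openai import OpenAI
#
# client = OpenAI(
#     base_url="https://your-workspace-name--example-vllm-inference-serve.modal.run/v1",
#     api_key="not-needed",
# )
#
# response = client.chat.completions.create(
#     model="mistralai/Devstral-2-123B-Instruct-2512",  # or the "llm" alias served above
#     messages=[
#         {"role": "user", "content": "Write a Python function that reverses a string."}
#     ],
# )
# print(response.choices[0].message.content)
# ```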