ai_server.md

# step 1: download model

hf download Qwen/Qwen3-14B-FP8 --local-dir /srv/models/qwen3-14b-fp8
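
The other model directories referenced below can be filled the same way. The repo names here are assumptions inferred from the local directory names; adjust them to the repos you actually use:

hf download Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 --local-dir /srv/models/qwen3-235b-fp8
hf download allenai/olmOCR-2-7B-1025-FP8 --local-dir /srv/models/olmOCR-2-7B-1025-FP8
hf download Qwen/Qwen3-Embedding-8B --local-dir /srv/models/qwen3-embedding-8B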

big model, multi-GPU:

docker run -d --name vllm-qwen-235b-thinking \
  --gpus '"device=0,1,2,3"' \
  --ipc=host \
  -p 8000:8000 \
  -v /srv/models:/models \
  vllm/vllm-openai:latest \
  /models/qwen3-235b-fp8 \
  --port 8000 \
  --tensor-parallel-size 4 \
  --max-model-len 65536 \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --enable-chunked-prefill \
  --reasoning-parser deepseek_r1
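
Once the container is up, a quick smoke test against the OpenAI-compatible endpoint. vLLM serves the model under the name passed on the command line (here the path), which GET /v1/models will confirm:

curl http://localhost:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "/models/qwen3-235b-fp8",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64
      }'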

medium model, large context (YaRN-scaled: 32768 × 3 = 98304):

docker run -d --name vllm-qwen-14b \
  -p 8001:8001 \
  -v /srv/models:/models \
  --gpus '"device=4"' \
  -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
  vllm/vllm-openai:latest \
  /models/qwen3-14b-fp8 \
  --host 0.0.0.0 \
  --port 8001 \
  --max-model-len 98304 \
  --hf-overrides '{"max_position_embeddings": 98304, "rope_scaling": {"type": "yarn", "factor": 3.0, "original_max_position_embeddings": 32768}}' \
  --enable-chunked-prefill \
  --enable-auto-tool-choice \
  --tool-call-parser hermes
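
To confirm the extended context took effect, list the served models; recent vLLM versions report max_model_len in this response:

curl http://localhost:8001/v1/models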

multimodal (olmOCR):

docker run -d --name olmOCR-2-7B \
  -p 8002:8002 \
  -v /srv/models:/models \
  --gpus '"device=5"' \
  vllm/vllm-openai:latest \
  /models/olmOCR-2-7B-1025-FP8 \
  --host 0.0.0.0 \
  --port 8002 \
  --max-model-len 16384 \
  --gpu-memory-utilization 0.45
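
olmOCR is served through the same chat endpoint, with images passed as OpenAI-style image_url content parts. The prompt below is a generic placeholder, not the prompt template the upstream olmOCR toolkit builds for each page:

curl http://localhost:8002/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "/models/olmOCR-2-7B-1025-FP8",
        "messages": [{
          "role": "user",
          "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/page.png"}},
            {"type": "text", "text": "Transcribe the text on this page."}
          ]
        }]
      }'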

embedding:

docker run -d --name vllm-qwen-embed \
  -p 8003:8003 \
  -v /srv/models:/models \
  --gpus '"device=5"' \
  vllm/vllm-openai:latest \
  --model /models/qwen3-embedding-8B \
  --port 8003 \
  --max-model-len 32768 \
  --gpu-memory-utilization 0.45
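
Embeddings come back on the /v1/embeddings route. Depending on the vLLM version, the server may need an explicit --task embed flag to run this checkpoint in embedding (pooling) mode instead of generation, so check the startup log. A minimal request:

curl http://localhost:8003/v1/embeddings \
  -H 'Content-Type: application/json' \
  -d '{"model": "/models/qwen3-embedding-8B", "input": "hello world"}'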

xeon-cpu (build the sglang-cpu:xeon image first):

docker run -d --name qwen3-4b \
  --privileged --ipc=host --network=host \
  --memory=32g --memory-swap=32g --shm-size=2g \
  --cpuset-cpus "0-85,172-257" \
  -v /srv/models:/models \
  -e SGLANG_USE_CPU_ENGINE=1 \
  -e OMP_NUM_THREADS=64 \
  -e MKL_NUM_THREADS=64 \
  sglang-cpu:xeon \
  /opt/.venv/bin/python3 -m sglang.launch_server \
    --model-path /models/qwen3-4b-w8a8 \
    --device cpu \
    --quantization w8a8_int8 \
    --context-length 8192 \
    --mem-fraction-static 0.02 \
    --max-total-tokens 8192 \
    --host 0.0.0.0 \
    --port 8004
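
SGLang's launch_server also exposes an OpenAI-compatible API; since the container uses --network=host, port 8004 is reachable directly on the host. The model field below is assumed to accept the served model path (SGLang is generally lenient about it):

curl http://localhost:8004/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "/models/qwen3-4b-w8a8",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 64
      }'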

trellis 2:

entrypoint.sh

#!/bin/bash
set -euo pipefail

LOG_PREFIX="[trellis2]"

# Avoid conda/pip prompts
export CONDA_ALWAYS_YES=true
export PIP_DISABLE_PIP_VERSION_CHECK=1
export PYTHONUNBUFFERED=1

# Expose Gradio externally
export GRADIO_SERVER_NAME="${GRADIO_SERVER_NAME:-0.0.0.0}"
export GRADIO_SERVER_PORT="${GRADIO_SERVER_PORT:-7860}"

# Persistent caches
export HF_HOME="${HF_HOME:-/data/hf}"
export TORCH_HOME="${TORCH_HOME:-/data/torch}"
export XDG_CACHE_HOME="${XDG_CACHE_HOME:-/data/cache}"

# CUDA / arch (H100 = SM90)
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0+PTX}"

FLAG_FILE="/data/.trellis2_installed"

mkdir -p /data "${HF_HOME}" "${TORCH_HOME}" "${XDG_CACHE_HOME}"

# If sudo is missing, create a pass-through wrapper (the container runs as root)
if ! command -v sudo >/dev/null 2>&1; then
  echo "${LOG_PREFIX} creating sudo wrapper (container runs as root)"
  printf '#!/bin/sh\nexec "$@"\n' > /usr/local/bin/sudo
  chmod +x /usr/local/bin/sudo
fi

# Conda
source /opt/conda/etc/profile.d/conda.sh
cd /opt/trellis2

echo "${LOG_PREFIX} GPU visivel no container:"
nvidia-smi -L || true
echo "${LOG_PREFIX} CUDA_HOME=${CUDA_HOME}"
echo "${LOG_PREFIX} TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"

need_install=0

# If the flag file does not exist, install
if [ ! -f "${FLAG_FILE}" ]; then
  need_install=1
fi

# If the conda env does not exist, install
if ! conda env list | awk '{print $1}' | grep -qx trellis2; then
  need_install=1
fi

if [ "${need_install}" -eq 1 ]; then
  echo "${LOG_PREFIX} Instalando dependencias (primeira execucao ou env ausente)..."
  . ./setup.sh --new-env --basic --flash-attn --nvdiffrast --nvdiffrec --cumesh --o-voxel --flexgemm
  conda clean -afy || true
  touch "${FLAG_FILE}"
fi

conda activate trellis2

echo "${LOG_PREFIX} Validando torch/cuda..."
python -c "import torch; print('torch', torch.__version__, 'cuda', torch.version.cuda, 'is_available', torch.cuda.is_available())"

# Validate cumesh and o_voxel; rebuild if either import fails
echo "${LOG_PREFIX} Validating cumesh/o_voxel..."
set +e
python -c "import cumesh; print('cumesh OK', cumesh.__file__)"
rc_cumesh=$?
python -c "import o_voxel; print('o_voxel OK', o_voxel.__file__)"
rc_ovoxel=$?
set -e

if [ "${rc_cumesh}" -ne 0 ] || [ "${rc_ovoxel}" -ne 0 ]; then
  echo "${LOG_PREFIX} Falha em cumesh/o_voxel. Recompilando modulos..."
  . ./setup.sh --cumesh --o-voxel --flexgemm
fi

echo "${LOG_PREFIX} Subindo app em http://0.0.0.0:${GRADIO_SERVER_PORT}"
exec python app.py

Dockerfile

FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Etc/UTC \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

# System dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    git git-lfs ca-certificates curl wget \
    build-essential cmake ninja-build pkg-config \
    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# "sudo" não existe em container por padrão e o setup.sh chama sudo.
# Como rodamos como root, criamos um wrapper que só repassa o comando.
RUN printf '#!/bin/sh\nexec "$@"\n' > /usr/local/bin/sudo && chmod +x /usr/local/bin/sudo

# Miniconda
ENV CONDA_DIR=/opt/conda
RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-py310_24.7.1-0-Linux-x86_64.sh -O /tmp/miniconda.sh && \
    bash /tmp/miniconda.sh -b -p ${CONDA_DIR} && \
    rm /tmp/miniconda.sh && \
    ${CONDA_DIR}/bin/conda clean -afy

ENV PATH=${CONDA_DIR}/bin:$PATH
SHELL ["/bin/bash", "-lc"]

WORKDIR /opt/trellis2

# Repo + submodules
RUN git lfs install && \
    git clone -b main --recursive https://github.com/microsoft/TRELLIS.2.git . && \
    git submodule update --init --recursive

# Useful env vars
ENV CUDA_HOME=/usr/local/cuda \
    OPENCV_IO_ENABLE_OPENEXR=1 \
    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
    HF_HOME=/data/hf \
    TORCH_HOME=/data/torch \
    XDG_CACHE_HOME=/data/cache \
    TORCH_CUDA_ARCH_LIST=9.0+PTX

# Cache/volume directories
RUN mkdir -p /data/hf /data/torch /data/cache

COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh

EXPOSE 7860
CMD ["/usr/local/bin/entrypoint.sh"]

