Skip to content

Instantly share code, notes, and snippets.

@goodrahstar
Created April 24, 2026 16:29
Show Gist options
  • Select an option

  • Save goodrahstar/5e080e6181ccb46c4f94ddcb8f13e4ea to your computer and use it in GitHub Desktop.

Select an option

Save goodrahstar/5e080e6181ccb46c4f94ddcb8f13e4ea to your computer and use it in GitHub Desktop.
Scripts to Run Qwen 3.6 27B on Your Mac
-------
File: download-qwen36-q3km.sh
#!/usr/bin/env bash
# Download the Q3_K_M quant of Qwen3.6 27B (~12GB class file; verify after download).
# Smaller quant suited to 16GB-RAM systems.
# Env: OUT_DIR  - destination directory (default: <repo>/models)
#      HF_TOKEN - optional Hugging Face token for authenticated downloads
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
OUT_DIR="${OUT_DIR:-$REPO_ROOT/models}"
FILE="Qwen3.6-27B-Q3_K_M.gguf"
URL="https://huggingface.co/unsloth/Qwen3.6-27B-GGUF/resolve/main/${FILE}"
mkdir -p "$OUT_DIR"
DEST="${OUT_DIR}/${FILE}"
# Partial downloads land in $PART so an interrupted run never leaves a
# truncated $DEST that the existence check below would mistake for complete.
PART="${DEST}.part"
if [[ -f "$DEST" ]]; then
  echo "Already present: $DEST"
  exit 0
fi
echo "Downloading $FILE (smaller than Q4_K_M)..."
# --fail: error out on HTTP 4xx/5xx instead of saving an error page as the model.
# --continue-at -: resume a previous interrupted .part on re-run.
CURL_ARGS=( -L --fail --retry 3 --continue-at - -o "$PART" )
if [[ -n "${HF_TOKEN:-}" ]]; then
  CURL_ARGS+=( -H "Authorization: Bearer ${HF_TOKEN}" )
fi
curl "${CURL_ARGS[@]}" "$URL"
# Rename only after curl succeeded, so $DEST is always a complete file.
mv -- "$PART" "$DEST"
echo "Done: $DEST"
-------
File: download-qwen36-q4.sh
#!/usr/bin/env bash
# Download the Q4_K_M quant of Qwen3.6 27B — best quality; prefer 32GB+ RAM.
# Resume: just run again (an interrupted .part is continued with curl -C -).
# Env: OUT_DIR  - destination directory (default: <repo>/models)
#      HF_TOKEN - optional Hugging Face token for authenticated downloads
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
OUT_DIR="${OUT_DIR:-$REPO_ROOT/models}"
FILE="Qwen3.6-27B-Q4_K_M.gguf"
URL="https://huggingface.co/unsloth/Qwen3.6-27B-GGUF/resolve/main/${FILE}"
mkdir -p "$OUT_DIR"
DEST="${OUT_DIR}/${FILE}"
# Partial downloads land in $PART so an interrupted run never leaves a
# truncated $DEST that the existence check below would mistake for complete
# (downloading straight to $DEST made the promised resume unreachable).
PART="${DEST}.part"
if [[ -f "$DEST" ]]; then
  echo "Already present: $DEST"
  exit 0
fi
echo "Downloading $FILE (heavier; needs plenty of RAM)..."
# --fail: error out on HTTP 4xx/5xx instead of saving an error page as the model.
# --continue-at -: resume a previous interrupted .part on re-run.
CURL_ARGS=( -L --fail --retry 3 --continue-at - -o "$PART" )
if [[ -n "${HF_TOKEN:-}" ]]; then
  CURL_ARGS+=( -H "Authorization: Bearer ${HF_TOKEN}" )
fi
curl "${CURL_ARGS[@]}" "$URL"
# Rename only after curl succeeded, so $DEST is always a complete file.
mv -- "$PART" "$DEST"
echo "Done: $DEST"
-------
File: start-llama-qwen36-macos.sh
#!/usr/bin/env bash
# Run llama-server with a local Qwen3.6 27B GGUF (OpenAI-compatible API on :8080).
# Apple Silicon: Homebrew on arm64 — use Metal offload via --n-gpu-layers all.
# Intel Mac: this build may only expose CPU/BLAS; 27B Q4 is heavy — prefer 32GB+ RAM.
# Env overrides: MODEL, PORT, CTX, MODEL_ALIAS, N_GPU_LAYERS, USE_MLOCK,
#                LLAMA_FIT_MODE, CHAT_TEMPLATE_KWARGS.
set -euo pipefail  # -u is safe: every external var below is expanded with a guard
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Prefer Q3 on low-RAM; override with MODEL=...
_DEF_Q3="$REPO_ROOT/models/Qwen3.6-27B-Q3_K_M.gguf"
_DEF_Q4="$REPO_ROOT/models/Qwen3.6-27B-Q4_K_M.gguf"
if [[ -n "${MODEL:-}" ]]; then
  :  # caller-supplied model wins
elif [[ -f "$_DEF_Q3" ]]; then
  MODEL="$_DEF_Q3"
else
  MODEL="$_DEF_Q4"
fi
PORT="${PORT:-8080}"
CTX="${CTX:-4096}"
# API model id (must match agent_data/agent/models.json and requests)
ALIAS="${MODEL_ALIAS:-unsloth/Qwen3.6-27B-GGUF}"
if [[ ! -f "$MODEL" ]]; then
  echo "Missing GGUF: $MODEL" >&2
  echo "Download one of:" >&2
  echo " $REPO_ROOT/scripts/download-qwen36-q3km.sh (smaller file — 16GB Macs)" >&2
  echo " $REPO_ROOT/scripts/download-qwen36-q4.sh (higher quality — 32GB+ RAM)" >&2
  exit 1
fi
NGL="all"
if [[ "$(uname -m)" != "arm64" ]]; then
  # Intel Homebrew build often has no GPU backend; use CPU (still accepts --n-gpu-layers, may be ignored)
  NGL="${N_GPU_LAYERS:-0}"
fi
# mlock: USE_MLOCK=1 to add --mlock (can hurt on low-RAM)
EXTRA=( )
[[ "${USE_MLOCK:-0}" == "1" ]] && EXTRA+=( --mlock )
[[ "${LLAMA_FIT_MODE:-on}" == "off" ]] && EXTRA+=( --fit off )
# Default the kwargs via a plain variable: embedding the JSON directly in
# ${VAR:-...} is broken because bash ends the expansion at the first unquoted
# '}', appending a stray '}' whenever CHAT_TEMPLATE_KWARGS is user-supplied.
CT_KWARGS="${CHAT_TEMPLATE_KWARGS:-}"
if [[ -z "$CT_KWARGS" ]]; then
  CT_KWARGS='{"enable_thinking": true}'
fi
# ${EXTRA[@]+...} keeps an empty array from tripping set -u on macOS bash 3.2.
exec llama-server \
  -m "$MODEL" \
  --port "$PORT" \
  -c "$CTX" \
  --n-gpu-layers "$NGL" \
  -fa on \
  ${EXTRA[@]+"${EXTRA[@]}"} \
  -a "$ALIAS" \
  --chat-template-kwargs "$CT_KWARGS"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment