Pi AI Hat+ 2 Benchmarking
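Benchmark snippets for the Pi AI Hat+ 2: three bash scripts that measure time to first response character against an /api/chat endpoint (image prompt, long text prompt, short text prompt; the default ports and model tags suggest an Ollama-style server), plus a Python example that reports time to first token and tokens per second for a Hailo VLM running through hailo_platform.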
#!/usr/bin/env bash
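# Benchmark: time to first response character for a vision (image) prompt.
# Sends a text-only warm-up request to an Ollama-style /api/chat endpoint, then
# streams an image prompt and reports how long the first byte of the response
# took to arrive. Configurable via the HOST/PORT/URL/MODEL/WARMUP_PROMPT/PROMPT/
# IMAGE_PATH environment variables; the first positional argument overrides the
# port, e.g. ./vision_ttfc.sh 11434 (script name here is illustrative).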
set -euo pipefail
HOST="${HOST:-localhost}"
PORT="${1:-${PORT:-11434}}"
URL="${URL:-http://${HOST}:${PORT}/api/chat}"
MODEL="${MODEL:-aleSuglia/qwen2-vl-2b-instruct-q4_k_m}"
WARMUP_PROMPT="${WARMUP_PROMPT:-Reply with OK only.}"
PROMPT="${PROMPT:-What is in this image?}"
IMAGE_PATH="${IMAGE_PATH:-test.jpg}"
if ! command -v curl >/dev/null 2>&1; then
  echo "Error: curl is required but was not found." >&2
  exit 1
fi

if ! command -v base64 >/dev/null 2>&1; then
  echo "Error: base64 is required but was not found." >&2
  exit 1
fi

# Optional: download sample image if missing
if [ ! -f "$IMAGE_PATH" ]; then
  echo "Image not found at $IMAGE_PATH, downloading sample..."
  curl --silent --show-error --fail -L \
    -o "$IMAGE_PATH" \
    "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg"
fi

# Encode image (GNU and BSD/macOS compatible)
if base64 --help >/dev/null 2>&1; then
  IMG_B64="$(base64 -w 0 < "$IMAGE_PATH" 2>/dev/null || base64 < "$IMAGE_PATH" | tr -d '\n')"
else
  IMG_B64="$(base64 < "$IMAGE_PATH" | tr -d '\n')"
fi
# Warm-up payload: text only (no image)
WARMUP_PAYLOAD="$(printf '{"model":"%s","messages":[{"role":"user","content":"%s"}],"stream":false}' \
  "$MODEL" "$WARMUP_PROMPT")"

# Measured payload: includes image
PAYLOAD="$(printf '{"model":"%s","messages":[{"role":"user","content":"%s","images":["%s"]}],"stream":true}' \
  "$MODEL" "$PROMPT" "$IMG_B64")"

echo "Running warm-up prompt (text-only), response discarded..."
curl --silent --show-error --fail "$URL" \
  -H 'Content-Type: application/json' \
  -d "$WARMUP_PAYLOAD" \
  >/dev/null
echo "Measuring time to first response character on image prompt..."
start_ns="$(date +%s%N)"
response_file="$(mktemp)"
cleanup() {
rm -f "$response_file"
}
trap cleanup EXIT
set +e
ttfb_ms="$({
curl --silent --show-error --fail --no-buffer "$URL" \
-H 'Content-Type: application/json' \
-d "$PAYLOAD"
} | tee "$response_file" | {
IFS= read -r -n1 _first_char
read_status=$?
if [ "$read_status" -ne 0 ]; then
exit 1
fi
now_ns="$(date +%s%N)"
elapsed_ns=$((now_ns - start_ns))
elapsed_ms_int=$((elapsed_ns / 1000000))
elapsed_ms_frac=$(((elapsed_ns / 10000) % 100))
printf '%s.%02d\n' "$elapsed_ms_int" "$elapsed_ms_frac"
cat >/dev/null
})"
status=$?
set -e
if [ "$status" -ne 0 ]; then
echo "Error: could not read first response character." >&2
exit 1
fi
echo "Time to first character (second prompt): ${ttfb_ms} ms"
echo "Response body (second prompt):"
cat "$response_file"
#!/usr/bin/env bash
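# Benchmark: time to first response character for a long text prompt.
# The same long prompt (a repeated paragraph, joined onto one line) is sent
# twice: once as a warm-up that also loads the model, then again while timing
# how long the first byte of the streamed response takes to arrive.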
set -euo pipefail
HOST="${HOST:-localhost}"
PORT="${1:-${PORT:-8000}}"
URL="${URL:-http://${HOST}:${PORT}/api/chat}"
PROMPT="$(cat <<'EOF'
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
EOF
)"
PROMPT="${PROMPT//$'\n'/ }"
PAYLOAD="{\"model\":\"qwen2:1.5b\",\"messages\":[{\"role\":\"user\",\"content\":\"${PROMPT}\"}],\"stream\":true}"
if ! command -v curl >/dev/null 2>&1; then
  echo "Error: curl is required but was not found." >&2
  exit 1
fi

echo "Running warm-up prompt (model load), response discarded..."
curl --silent --show-error --fail "$URL" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD" \
  >/dev/null
echo "Measuring time to first response character on second prompt..."
start_ns="$(date +%s%N)"
response_file="$(mktemp)"
cleanup() {
rm -f "$response_file"
}
trap cleanup EXIT
set +e
ttfb_ms="$({
curl --silent --show-error --fail --no-buffer "$URL" \
-H 'Content-Type: application/json' \
-d "$PAYLOAD"
} | tee "$response_file" | {
IFS= read -r -n1 _first_char
read_status=$?
if [ "$read_status" -ne 0 ]; then
exit 1
fi
now_ns="$(date +%s%N)"
elapsed_ns=$((now_ns - start_ns))
elapsed_ms_int=$((elapsed_ns / 1000000))
elapsed_ms_frac=$(((elapsed_ns / 10000) % 100))
printf '%s.%02d\n' "$elapsed_ms_int" "$elapsed_ms_frac"
cat >/dev/null
})"
status=$?
set -e
if [ "$status" -ne 0 ]; then
echo "Error: could not read first response character." >&2
exit 1
fi
echo "Time to first character (second prompt): ${ttfb_ms} ms"
echo "Response body (second prompt):"
cat "$response_file"
#!/usr/bin/env bash
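# Benchmark: time to first response character for a short text prompt
# ("Tell me a joke about cats."), using the same warm-up-then-measure flow
# as the long-prompt script above.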
set -euo pipefail
HOST="${HOST:-localhost}"
PORT="${1:-${PORT:-8000}}"
URL="${URL:-http://${HOST}:${PORT}/api/chat}"
PAYLOAD='{"model":"qwen2:1.5b","messages":[{"role":"user","content":"Tell me a joke about cats."}],"stream":true}'
if ! command -v curl >/dev/null 2>&1; then
  echo "Error: curl is required but was not found." >&2
  exit 1
fi

echo "Running warm-up prompt (model load), response discarded..."
curl --silent --show-error --fail "$URL" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD" \
  >/dev/null
echo "Measuring time to first response character on second prompt..."
start_ns="$(date +%s%N)"
response_file="$(mktemp)"
cleanup() {
rm -f "$response_file"
}
trap cleanup EXIT
set +e
ttfb_ms="$({
curl --silent --show-error --fail --no-buffer "$URL" \
-H 'Content-Type: application/json' \
-d "$PAYLOAD"
} | tee "$response_file" | {
IFS= read -r -n1 _first_char
read_status=$?
if [ "$read_status" -ne 0 ]; then
exit 1
fi
now_ns="$(date +%s%N)"
elapsed_ns=$((now_ns - start_ns))
elapsed_ms_int=$((elapsed_ns / 1000000))
elapsed_ms_frac=$(((elapsed_ns / 10000) % 100))
printf '%s.%02d\n' "$elapsed_ms_int" "$elapsed_ms_frac"
cat >/dev/null
})"
status=$?
set -e
if [ "$status" -ne 0 ]; then
echo "Error: could not read first response character." >&2
exit 1
fi
echo "Time to first character (second prompt): ${ttfb_ms} ms"
echo "Response body (second prompt):"
cat "$response_file"
import argparse
import sys
import cv2
import numpy as np
import time
from hailo_platform import VDevice
from hailo_platform.genai import VLM
from hailo_apps.python.core.common.core import handle_list_models_flag, resolve_hef_path
from hailo_apps.python.core.common.defines import VLM_CHAT_APP, SHARED_VDEVICE_GROUP_ID, HAILO10H_ARCH, REPO_ROOT
from hailo_apps.python.core.common.hailo_logger import get_logger
# Initialize logger
logger = get_logger(__name__)
def main():
    """Main function for VLM Chat Example."""
    # Parse arguments
    parser = argparse.ArgumentParser(description="VLM Chat Example")
    parser.add_argument("--hef-path", type=str, default=None, help="Path to HEF model file")
    parser.add_argument("--list-models", action="store_true", help="List available models")
    # Handle --list-models flag before full initialization
    handle_list_models_flag(parser, VLM_CHAT_APP)
    args = parser.parse_args()

    # Resolve HEF path with auto-download (VLM is Hailo-10H only)
    hef_path = resolve_hef_path(args.hef_path, app_name=VLM_CHAT_APP, arch=HAILO10H_ARCH)
    if hef_path is None:
        logger.error("Failed to resolve HEF path for VLM model.")
        sys.exit(1)
    logger.info(f"Using HEF: {hef_path}")
    print(f"✓ Model file found: {hef_path}")

    vdevice = None
    vlm = None
    try:
        print("\n[1/5] Initializing Hailo device...")
        params = VDevice.create_params()
        params.group_id = SHARED_VDEVICE_GROUP_ID
        vdevice = VDevice(params)
        print("✓ Hailo device initialized")

        print("[2/5] Loading VLM model...")
        vlm = VLM(vdevice, str(hef_path))
        print("✓ Model loaded successfully")

        prompt = [
            {
                "role": "system",
                "content": [{"type": "text", "text": 'You are a helpful assistant that analyzes images and answers questions about them.'}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": 'How many people are in the image?'}
                ]
            }
        ]

        # Load and convert image
        # Use standard REPO_ROOT from defines
        image_path = REPO_ROOT / 'doc' / 'images' / 'barcode-example.png'
        print(f"[3/5] Loading image from: {image_path}")
        image = cv2.imread(str(image_path))
        if image is None:
            raise FileNotFoundError(f"Could not load image file: {image_path}")
        print(f"✓ Image loaded (size: {image.shape[1]}x{image.shape[0]})")

        print("[4/5] Preprocessing image...")
        if len(image.shape) == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (336, 336), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
        print("✓ Image preprocessed (resized to 336x336, converted to RGB)")

        print("[5/5] Sending prompt with image to VLM...")
        print(f" User question: '{prompt[1]['content'][1]['text']}'")

        # Un-timed full generation; its result is only used by the commented-out
        # printing block below, so it effectively serves as a warm-up run.
        response = vlm.generate_all(prompt=prompt, frames=[image], temperature=0.1, seed=42, max_generated_tokens=200)

        # Timed streaming generation: record time to first token and count tokens.
        start_time = time.time()
        time_to_first_token = None
        first_token = False
        num_tokens = 0
        with vlm:
            with vlm.generate(prompt, frames=[image], temperature=0.1, seed=42, max_generated_tokens=200) as gen:
                for token in gen:
                    if not first_token:
                        first_token = True
                        time_to_first_token = time.time() - start_time
                    print(token, end='', flush=True)
                    num_tokens += 1

        # print("[4/4] Response received:")
        # print("-" * 60)
        # print(response.split(". [{'type'")[0])
        # print("-" * 60)
        print("\n✓ Example completed successfully")
        print(f"TTFT {time_to_first_token}")
        tokens_per_s = num_tokens / (time.time() - start_time)
        print(f"TPS {tokens_per_s}")
    except Exception as e:
        logger.error(f"Error occurred: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Clean up resources
        if vlm:
            try:
                vlm.clear_context()
                vlm.release()
            except Exception as e:
                logger.warning(f"Error releasing VLM: {e}")
        if vdevice:
            try:
                vdevice.release()
            except Exception as e:
                logger.warning(f"Error releasing VDevice: {e}")


if __name__ == "__main__":
    main()