Pi AI Hat+ 2 Benchmarking
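Benchmark snippets for the Pi AI Hat+ 2: three bash scripts that measure time to first response character against an /api/chat endpoint (image prompt, long text prompt, short text prompt; the default ports and model tags suggest an Ollama-style server), plus a Python example that reports time to first token and tokens per second for a Hailo VLM running through hailo_platform.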
#!/usr/bin/env bash
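# Benchmark: time to first response character for a vision (image) prompt.
# Sends a text-only warm-up request to an Ollama-style /api/chat endpoint, then
# streams an image prompt and reports how long the first byte of the response
# took to arrive. Configurable via the HOST/PORT/URL/MODEL/WARMUP_PROMPT/PROMPT/
# IMAGE_PATH environment variables; the first positional argument overrides the
# port, e.g. ./vision_ttfc.sh 11434 (script name here is illustrative).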
set -euo pipefail
HOST="${HOST:-localhost}"
PORT="${1:-${PORT:-11434}}"
URL="${URL:-http://${HOST}:${PORT}/api/chat}"
MODEL="${MODEL:-aleSuglia/qwen2-vl-2b-instruct-q4_k_m}"
WARMUP_PROMPT="${WARMUP_PROMPT:-Reply with OK only.}"
PROMPT="${PROMPT:-What is in this image?}"
IMAGE_PATH="${IMAGE_PATH:-test.jpg}"
if ! command -v curl >/dev/null 2>&1; then
  echo "Error: curl is required but was not found." >&2
  exit 1
fi

if ! command -v base64 >/dev/null 2>&1; then
  echo "Error: base64 is required but was not found." >&2
  exit 1
fi

# Optional: download sample image if missing
if [ ! -f "$IMAGE_PATH" ]; then
  echo "Image not found at $IMAGE_PATH, downloading sample..."
  curl --silent --show-error --fail -L \
    -o "$IMAGE_PATH" \
    "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg"
fi

# Encode image (GNU and BSD/macOS compatible)
if base64 --help >/dev/null 2>&1; then
  IMG_B64="$(base64 -w 0 < "$IMAGE_PATH" 2>/dev/null || base64 < "$IMAGE_PATH" | tr -d '\n')"
else
  IMG_B64="$(base64 < "$IMAGE_PATH" | tr -d '\n')"
fi
# Warm-up payload: text only (no image)
WARMUP_PAYLOAD="$(printf '{"model":"%s","messages":[{"role":"user","content":"%s"}],"stream":false}' \
  "$MODEL" "$WARMUP_PROMPT")"

# Measured payload: includes image
PAYLOAD="$(printf '{"model":"%s","messages":[{"role":"user","content":"%s","images":["%s"]}],"stream":true}' \
  "$MODEL" "$PROMPT" "$IMG_B64")"

echo "Running warm-up prompt (text-only), response discarded..."
curl --silent --show-error --fail "$URL" \
  -H 'Content-Type: application/json' \
  -d "$WARMUP_PAYLOAD" \
  >/dev/null
echo "Measuring time to first response character on image prompt..."
start_ns="$(date +%s%N)"
response_file="$(mktemp)"
cleanup() {
rm -f "$response_file"
}
trap cleanup EXIT
set +e
ttfb_ms="$({
curl --silent --show-error --fail --no-buffer "$URL" \
-H 'Content-Type: application/json' \
-d "$PAYLOAD"
} | tee "$response_file" | {
IFS= read -r -n1 _first_char
read_status=$?
if [ "$read_status" -ne 0 ]; then
exit 1
fi
now_ns="$(date +%s%N)"
elapsed_ns=$((now_ns - start_ns))
elapsed_ms_int=$((elapsed_ns / 1000000))
elapsed_ms_frac=$(((elapsed_ns / 10000) % 100))
printf '%s.%02d\n' "$elapsed_ms_int" "$elapsed_ms_frac"
cat >/dev/null
})"
status=$?
set -e
if [ "$status" -ne 0 ]; then
echo "Error: could not read first response character." >&2
exit 1
fi
echo "Time to first character (second prompt): ${ttfb_ms} ms"
echo "Response body (second prompt):"
cat "$response_file"
#!/usr/bin/env bash
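# Benchmark: time to first response character for a long text prompt.
# The same long prompt (a repeated paragraph, joined onto one line) is sent
# twice: once as a warm-up that also loads the model, then again while timing
# how long the first byte of the streamed response takes to arrive.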
set -euo pipefail
HOST="${HOST:-localhost}"
PORT="${1:-${PORT:-8000}}"
URL="${URL:-http://${HOST}:${PORT}/api/chat}"
PROMPT="$(cat <<'EOF'
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
Please read this long benchmark prompt and continue thinking carefully about context details constraints assumptions and tradeoffs.
EOF
)"
PROMPT="${PROMPT//$'\n'/ }"
PAYLOAD="{\"model\":\"qwen2:1.5b\",\"messages\":[{\"role\":\"user\",\"content\":\"${PROMPT}\"}],\"stream\":true}"
if ! command -v curl >/dev/null 2>&1; then
  echo "Error: curl is required but was not found." >&2
  exit 1
fi

echo "Running warm-up prompt (model load), response discarded..."
curl --silent --show-error --fail "$URL" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD" \
  >/dev/null
echo "Measuring time to first response character on second prompt..."
start_ns="$(date +%s%N)"
response_file="$(mktemp)"
cleanup() {
rm -f "$response_file"
}
trap cleanup EXIT
set +e
ttfb_ms="$({
curl --silent --show-error --fail --no-buffer "$URL" \
-H 'Content-Type: application/json' \
-d "$PAYLOAD"
} | tee "$response_file" | {
IFS= read -r -n1 _first_char
read_status=$?
if [ "$read_status" -ne 0 ]; then
exit 1
fi
now_ns="$(date +%s%N)"
elapsed_ns=$((now_ns - start_ns))
elapsed_ms_int=$((elapsed_ns / 1000000))
elapsed_ms_frac=$(((elapsed_ns / 10000) % 100))
printf '%s.%02d\n' "$elapsed_ms_int" "$elapsed_ms_frac"
cat >/dev/null
})"
status=$?
set -e
if [ "$status" -ne 0 ]; then
echo "Error: could not read first response character." >&2
exit 1
fi
echo "Time to first character (second prompt): ${ttfb_ms} ms"
echo "Response body (second prompt):"
cat "$response_file"
#!/usr/bin/env bash
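# Benchmark: time to first response character for a short text prompt
# ("Tell me a joke about cats."), using the same warm-up-then-measure flow
# as the long-prompt script above.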
set -euo pipefail
HOST="${HOST:-localhost}"
PORT="${1:-${PORT:-8000}}"
URL="${URL:-http://${HOST}:${PORT}/api/chat}"
PAYLOAD='{"model":"qwen2:1.5b","messages":[{"role":"user","content":"Tell me a joke about cats."}],"stream":true}'
if ! command -v curl >/dev/null 2>&1; then
  echo "Error: curl is required but was not found." >&2
  exit 1
fi

echo "Running warm-up prompt (model load), response discarded..."
curl --silent --show-error --fail "$URL" \
  -H 'Content-Type: application/json' \
  -d "$PAYLOAD" \
  >/dev/null
echo "Measuring time to first response character on second prompt..."
start_ns="$(date +%s%N)"
response_file="$(mktemp)"
cleanup() {
rm -f "$response_file"
}
trap cleanup EXIT
set +e
ttfb_ms="$({
curl --silent --show-error --fail --no-buffer "$URL" \
-H 'Content-Type: application/json' \
-d "$PAYLOAD"
} | tee "$response_file" | {
IFS= read -r -n1 _first_char
read_status=$?
if [ "$read_status" -ne 0 ]; then
exit 1
fi
now_ns="$(date +%s%N)"
elapsed_ns=$((now_ns - start_ns))
elapsed_ms_int=$((elapsed_ns / 1000000))
elapsed_ms_frac=$(((elapsed_ns / 10000) % 100))
printf '%s.%02d\n' "$elapsed_ms_int" "$elapsed_ms_frac"
cat >/dev/null
})"
status=$?
set -e
if [ "$status" -ne 0 ]; then
echo "Error: could not read first response character." >&2
exit 1
fi
echo "Time to first character (second prompt): ${ttfb_ms} ms"
echo "Response body (second prompt):"
cat "$response_file"
import argparse
import sys
import cv2
import numpy as np
import time
from hailo_platform import VDevice
from hailo_platform.genai import VLM
from hailo_apps.python.core.common.core import handle_list_models_flag, resolve_hef_path
from hailo_apps.python.core.common.defines import VLM_CHAT_APP, SHARED_VDEVICE_GROUP_ID, HAILO10H_ARCH, REPO_ROOT
from hailo_apps.python.core.common.hailo_logger import get_logger
# Initialize logger
logger = get_logger(__name__)
def main():
    """Main function for VLM Chat Example."""
    # Parse arguments
    parser = argparse.ArgumentParser(description="VLM Chat Example")
    parser.add_argument("--hef-path", type=str, default=None, help="Path to HEF model file")
    parser.add_argument("--list-models", action="store_true", help="List available models")
    # Handle --list-models flag before full initialization
    handle_list_models_flag(parser, VLM_CHAT_APP)
    args = parser.parse_args()

    # Resolve HEF path with auto-download (VLM is Hailo-10H only)
    hef_path = resolve_hef_path(args.hef_path, app_name=VLM_CHAT_APP, arch=HAILO10H_ARCH)
    if hef_path is None:
        logger.error("Failed to resolve HEF path for VLM model.")
        sys.exit(1)
    logger.info(f"Using HEF: {hef_path}")
    print(f"✓ Model file found: {hef_path}")

    vdevice = None
    vlm = None
    try:
        print("\n[1/5] Initializing Hailo device...")
        params = VDevice.create_params()
        params.group_id = SHARED_VDEVICE_GROUP_ID
        vdevice = VDevice(params)
        print("✓ Hailo device initialized")

        print("[2/5] Loading VLM model...")
        vlm = VLM(vdevice, str(hef_path))
        print("✓ Model loaded successfully")

        prompt = [
            {
                "role": "system",
                "content": [{"type": "text", "text": 'You are a helpful assistant that analyzes images and answers questions about them.'}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": 'How many people are in the image?'}
                ]
            }
        ]

        # Load and convert image
        # Use standard REPO_ROOT from defines
        image_path = REPO_ROOT / 'doc' / 'images' / 'barcode-example.png'
        print(f"[3/5] Loading image from: {image_path}")
        image = cv2.imread(str(image_path))
        if image is None:
            raise FileNotFoundError(f"Could not load image file: {image_path}")
        print(f"✓ Image loaded (size: {image.shape[1]}x{image.shape[0]})")

        print("[4/5] Preprocessing image...")
        if len(image.shape) == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (336, 336), interpolation=cv2.INTER_LINEAR).astype(np.uint8)
        print("✓ Image preprocessed (resized to 336x336, converted to RGB)")

        print("[5/5] Sending prompt with image to VLM...")
        print(f" User question: '{prompt[1]['content'][1]['text']}'")

        # Un-timed full generation; its result is only used by the commented-out
        # printing block below, so it effectively serves as a warm-up run.
        response = vlm.generate_all(prompt=prompt, frames=[image], temperature=0.1, seed=42, max_generated_tokens=200)

        # Timed streaming generation: record time to first token and count tokens.
        start_time = time.time()
        time_to_first_token = None
        first_token = False
        num_tokens = 0
        with vlm:
            with vlm.generate(prompt, frames=[image], temperature=0.1, seed=42, max_generated_tokens=200) as gen:
                for token in gen:
                    if not first_token:
                        first_token = True
                        time_to_first_token = time.time() - start_time
                    print(token, end='', flush=True)
                    num_tokens += 1

        # print("[4/4] Response received:")
        # print("-" * 60)
        # print(response.split(". [{'type'")[0])
        # print("-" * 60)
        print("\n✓ Example completed successfully")
        print(f"TTFT {time_to_first_token}")
        tokens_per_s = num_tokens / (time.time() - start_time)
        print(f"TPS {tokens_per_s}")
    except Exception as e:
        logger.error(f"Error occurred: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # Clean up resources
        if vlm:
            try:
                vlm.clear_context()
                vlm.release()
            except Exception as e:
                logger.warning(f"Error releasing VLM: {e}")
        if vdevice:
            try:
                vdevice.release()
            except Exception as e:
                logger.warning(f"Error releasing VDevice: {e}")


if __name__ == "__main__":
    main()