Skip to content

Instantly share code, notes, and snippets.

@inardini
Last active March 1, 2026 04:34
Show Gist options
  • Select an option

  • Save inardini/b9267831de1d557596186d49e2a78451 to your computer and use it in GitHub Desktop.

Select an option

Save inardini/b9267831de1d557596186d49e2a78451 to your computer and use it in GitHub Desktop.
Sends a multimodal query to an ADK agent hosted on Vertex AI Agent Engine.
import os
import vertexai
from vertexai import agent_engines
from typing import Optional, Dict, Any
def query_multimodal_agent(
    project_id: str,
    location: str,
    agent_engine_id: str,
    file_gcs_uri: str,
    mime_type: str,
    prompt: str,
    user_id: str = "user_example_123",
) -> Dict[str, Any]:
    """
    Sends a multimodal query to an ADK agent hosted on Vertex AI Agent Engine.

    It assumes Gemini as LLM.

    Args:
        project_id: Google Cloud project ID.
        location: Vertex AI region (e.g. "us-central1").
        agent_engine_id: ID of the deployed reasoning/agent engine.
        file_gcs_uri: GCS URI of the media file to analyze (gs://...).
        mime_type: MIME type of the file (e.g. "image/jpeg", "video/mp4").
        prompt: Text prompt sent alongside the file.
        user_id: Identifier used to create the agent session.

    Returns:
        A dict with the session id and the agent's final text response.
    """
    # Initialize the Vertex AI SDK
    vertexai.init(project=project_id, location=location)

    # Construct the full resource name for the agent engine
    agent_engine_resource_name = (
        f"projects/{project_id}/locations/{location}"
        f"/reasoningEngines/{agent_engine_id}"
    )

    # Get the remote application representing the agent engine
    remote_app = agent_engines.get(agent_engine_resource_name)

    # Create a new session for the user
    remote_session = remote_app.create_session(user_id=user_id)

    # Prepare the multimodal message payload.
    # Check out the doc: https://cloud.google.com/vertex-ai/generative-ai/docs/agent-engine/use/langchain#multimodal-content
    multimodal_message = {
        "role": "user",
        "parts": [
            {
                "text": prompt,
            },
            {
                "file_data": {
                    "file_uri": file_gcs_uri,
                    "mime_type": mime_type,
                },
            },
        ],
    }

    print(f"Querying agent with prompt: '{prompt}' and file: {file_gcs_uri} ({mime_type})...")

    # Stream the query to the agent engine
    response_stream = remote_app.stream_query(
        user_id=user_id,
        session_id=remote_session["id"],
        message=multimodal_message,
    )

    # BUG FIX: the original referenced `event` without ever iterating the
    # stream, raising NameError. Consume the stream and keep the last
    # non-empty text part as the final response.
    final_response_text = ""
    for event in response_stream:
        # Events are dict-shaped; guard missing keys so partial/tool events
        # don't crash the extraction. (Assumes the documented event schema —
        # content.parts[].text — TODO confirm against the stream_query docs.)
        parts = event.get("content", {}).get("parts", [])
        if parts:
            text = parts[0].get("text", "")
            if text:
                final_response_text = text

    print(final_response_text)

    # Return a dict to honor the declared Dict[str, Any] return type
    # (the original fell through and returned None).
    return {
        "session_id": remote_session["id"],
        "response": final_response_text,
    }
if __name__ == "__main__":
    # Fill in with your own project/deployment identifiers before running.
    PROJECT_ID = "your-google-cloud-project-id"
    LOCATION = "us-central1"
    AGENT_ENGINE_ID = "your-reasoning-engine-id"

    # Example 1: Image Analysis
    print("Running Image Example")
    image_response = query_multimodal_agent(
        project_id=PROJECT_ID,
        location=LOCATION,
        agent_engine_id=AGENT_ENGINE_ID,
        file_gcs_uri="gs://cloud-samples-data/generative-ai/image/scones.jpg",
        mime_type="image/jpeg",
        prompt="Describe this image in five words.",
    )
    print("Agent Response (Image)")
    print(image_response)
    print("-" * 30)

    # Example 2: Video Analysis
    print("Running Video Example")
    video_response = query_multimodal_agent(
        project_id=PROJECT_ID,
        location=LOCATION,
        agent_engine_id=AGENT_ENGINE_ID,
        file_gcs_uri="gs://cloud-samples-data/generative-ai/video/ad_copy_from_video.mp4",
        mime_type="video/mp4",
        prompt="What is the main subject of this video? Provide a short summary.",
    )
    print("Agent Response (Video)")
    print(video_response)
    print("-" * 30)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment