@inardini · Last active April 2, 2025
Evaluate a judge model
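The snippet below assumes the Vertex AI SDK has already been initialized. A typical setup looks like the following (the project ID and location are placeholders, not values from the gist):

import vertexai

vertexai.init(project="your-project-id", location="us-central1")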
import pandas as pd

from vertexai.preview.evaluation import (
    EvalTask,
    MetricPromptTemplateExamples,
    PairwiseMetric,
)
from vertexai.preview.evaluation.autorater_utils import evaluate_autorater

# Prepare the evaluation dataset, including the human rating column
# ("<metric>/human_pairwise_choice") that the calibration step compares against.
human_rated_dataset = pd.DataFrame({
    "prompt": ["Researchers at ... ", "Introducing the 'SilentStep'..."],
    "response": ["A new solar panel ...", "The 'SilentStep' treadmill..."],
    "baseline_model_response": ["Researchers developed...", "This is the 'SilentStep'..."],
    "pairwise_fluency/human_pairwise_choice": ["CANDIDATE", "BASELINE"],
})

# Set up the evaluation metric based on your criteria.
pairwise_fluency = PairwiseMetric(
    metric="pairwise_fluency",
    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
        "pairwise_fluency"
    ),
)

# Run the evaluation: the judge model picks CANDIDATE or BASELINE for each row.
eval_result = EvalTask(
    dataset=human_rated_dataset,
    metrics=[pairwise_fluency],
).evaluate()

# Calibrate the model-based metric results against the human preferences.
evaluate_autorater_result = evaluate_autorater(
    evaluate_autorater_input=eval_result.metrics_table,
    eval_metrics=[pairwise_fluency],
)
print(evaluate_autorater_result.eval_result)
# [{'metric': 'pairwise_fluency',
#   'accuracy_balanced': 0.5,
#   'f1_score_balanced': 0.85,
#   'confusion_matrix': array([[0, 1], [0, 9]]),
#   'confusion_matrix_labels': ['BASELINE', 'CANDIDATE']}]
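The confusion matrix in the printed result is a bare NumPy array. A minimal sketch for labeling it, assuming the list-of-dicts result structure shown in the printed output above and the usual rows-are-human-choice, columns-are-judge-choice orientation (an assumption, not documented in the gist):

# Label the confusion matrix for easier reading.
# Assumes eval_result is a list of dicts as in the printed output above;
# the row/column orientation (human vs. judge) is an assumption.
result = evaluate_autorater_result.eval_result[0]
labels = result["confusion_matrix_labels"]
cm = pd.DataFrame(
    result["confusion_matrix"],
    index=[f"human: {label}" for label in labels],
    columns=[f"judge: {label}" for label in labels],
)
print(cm)

Off-diagonal cells count disagreements between the judge model and the human raters, which is what the balanced accuracy and F1 scores summarize.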