Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -180,64 +180,40 @@
"metadata": {},
"outputs": [],
"source": [
"# Create the model evaluation job\n",
"model_eval_job_name = f\"model-evaluation-custom-metrics{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}\"\n",
"\n",
"model_eval_job = bedrock_client.create_evaluation_job(\n",
" jobName=model_eval_job_name,\n",
" jobDescription=\"Evaluate model performance with custom comprehensiveness metric\",\n",
" roleArn=role_arn,\n",
" applicationType=\"ModelEvaluation\",\n",
" inferenceConfig={\n",
" \"models\": [{\n",
" \"bedrockModel\": {\n",
" \"modelIdentifier\": generator_model\n",
" }\n",
" }]\n",
" },\n",
" outputDataConfig={\n",
" \"s3Uri\": output_path\n",
" },\n",
" evaluationConfig={\n",
" \"automated\": {\n",
" \"datasetMetricConfigs\": [{\n",
" \"taskType\": \"General\",\n",
" \"dataset\": {\n",
" \"name\": \"ModelEvalDataset\",\n",
" \"datasetLocation\": {\n",
" \"s3Uri\": input_data\n",
" }\n",
" },\n",
" \"metricNames\": [\n",
" \"Builtin.Correctness\",\n",
" \"Builtin.Completeness\",\n",
" \"Builtin.Coherence\",\n",
" \"Builtin.Relevance\",\n",
" \"Builtin.FollowingInstructions\",\n",
" \"comprehensiveness\"\n",
" ]\n",
" }],\n",
" \"customMetricConfig\": {\n",
" \"customMetrics\": [\n",
" comprehensiveness_metric\n",
" ],\n",
" \"evaluatorModelConfig\": {\n",
" \"bedrockEvaluatorModels\": [{\n",
" \"modelIdentifier\": custom_metrics_evaluator_model\n",
" }]\n",
" }\n",
" },\n",
" \"evaluatorModelConfig\": {\n",
" \"bedrockEvaluatorModels\": [{\n",
" \"modelIdentifier\": evaluator_model\n",
" }]\n",
" }\n",
" }\n",
" }\n",
")\n",
"import boto3\n",
"import json\n",
"from collections import defaultdict\n",
"\n",
"print(f\"Created model evaluation job: {model_eval_job_name}\")\n",
"print(f\"Job ID: {model_eval_job['jobArn']}\")"
"# Retrieve evaluation scores from the job's S3 output once the job has completed.\n",
"# The output S3 URI is available in the get_evaluation_job response (`response`).\n",
"output_s3_uri = response['outputDataConfig']['s3Uri']\n",
"bucket = output_s3_uri.split('/')[2]\n",
"prefix = '/'.join(output_s3_uri.split('/')[3:])\n",
"\n",
"s3_client = boto3.client('s3')\n",
"\n",
"# Aggregate scores across all output jsonl files\n",
"metric_scores = defaultdict(list)  # (dataset, metric) -> list of results\n",
"dataset_prompt_counts = defaultdict(int)  # dataset -> number of records read\n",
"\n",
"# list_objects_v2 returns at most 1,000 keys per call; paginate so no output file is missed.\n",
"paginator = s3_client.get_paginator('list_objects_v2')\n",
"for page in paginator.paginate(Bucket=bucket, Prefix=prefix):\n",
"    for obj in page.get('Contents', []):\n",
"        key = obj['Key']\n",
"        # Only per-dataset result files are relevant; the dataset name is the path\n",
"        # segment right after '/datasets/', so skip keys that lack that segment.\n",
"        if not key.endswith('_output.jsonl') or '/datasets/' not in key:\n",
"            continue\n",
"        dataset_name = key.split('/datasets/')[1].split('/')[0]\n",
"        body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read().decode()\n",
"        for line in body.split('\\n'):\n",
"            # Blank lines (empty file, trailing newline) are not JSON records.\n",
"            if not line.strip():\n",
"                continue\n",
"            record = json.loads(line)\n",
"            dataset_prompt_counts[dataset_name] += 1\n",
"            for score in record.get('automatedEvaluationResult', {}).get('scores', []):\n",
"                result = score.get('result')\n",
"                # A missing/null result cannot be averaged; skip it instead of failing in sum().\n",
"                if result is None:\n",
"                    continue\n",
"                metric_scores[(dataset_name, score['metricName'])].append(result)\n",
"\n",
"if not metric_scores:\n",
"    print(f\"No evaluation scores found under {output_s3_uri}\")\n",
"\n",
"# Print average scores and prompt counts per dataset/metric\n",
"for (dataset, metric), scores in metric_scores.items():\n",
"    avg = sum(scores) / len(scores)\n",
"    print(f\"Dataset: {dataset} | Metric: {metric} | Avg Score: {avg:.4f} | Prompts: {dataset_prompt_counts[dataset]}\")"
]
},
{
Expand Down Expand Up @@ -266,6 +242,42 @@
"print(f\"Job Status: {response['status']}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"import json\n",
"\n",
"# Retrieve evaluation summary scores from S3 after job completes\n",
"# The output S3 URI is available in the get_evaluation_job response\n",
"output_s3_uri = response['outputDataConfig']['s3Uri']  # e.g. s3://bucket/prefix/\n",
"bucket = output_s3_uri.split('/')[2]\n",
"prefix = '/'.join(output_s3_uri.split('/')[3:])\n",
"\n",
"s3_client = boto3.client('s3')\n",
"\n",
"# Find the evaluation summary JSON. list_objects_v2 returns at most 1,000 keys\n",
"# per call, so walk every page rather than only the first one.\n",
"paginator = s3_client.get_paginator('list_objects_v2')\n",
"summary_key = next(\n",
"    (\n",
"        obj['Key']\n",
"        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)\n",
"        for obj in page.get('Contents', [])\n",
"        if obj['Key'].endswith('evaluationSummary.json')\n",
"    ),\n",
"    None,\n",
")\n",
"# Fail with an actionable message instead of a bare StopIteration.\n",
"if summary_key is None:\n",
"    raise FileNotFoundError(\n",
"        f\"No evaluationSummary.json found under {output_s3_uri}; has the evaluation job completed?\"\n",
"    )\n",
"\n",
"# Download and parse the evaluation summary\n",
"summary_obj = s3_client.get_object(Bucket=bucket, Key=summary_key)\n",
"summary = json.loads(summary_obj['Body'].read())\n",
"\n",
"# Print custom metric scores and prompt counts\n",
"for dataset_result in summary.get('datasetMetricResults', []):\n",
"    print(f\"Dataset: {dataset_result['datasetName']}\")\n",
"    print(f\" Number of prompts: {dataset_result.get('numberOfPrompts')}\")\n",
"    for metric in dataset_result.get('metricResults', []):\n",
"        print(f\" Metric: {metric['metricName']} | Score: {metric.get('score')}\")"
]
},
{
"cell_type": "markdown",
"id": "46961e92-4bbb-436a-8929-926e99c5073a",
Expand Down Expand Up @@ -314,4 +326,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}