diff --git a/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py index 4fb0777dd0..76310e087f 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py @@ -35,10 +35,8 @@ class _Benchmark(str, Enum): MATH = "math" STRONG_REJECT = "strong_reject" IFEVAL = "ifeval" - GEN_QA = "gen_qa" MMMU = "mmmu" LLM_JUDGE = "llm_judge" - INFERENCE_ONLY = "inference_only" # Internal benchmark configuration mapping - using plain dictionaries @@ -138,14 +136,6 @@ class _Benchmark(str, Enum): "subtask_available": False, "subtasks": None }, - _Benchmark.GEN_QA: { - "modality": "Multi-Modal (image)", - "description": "Custom Dataset Evaluation – Lets you supply your own dataset for benchmarking, comparing model outputs to reference answers with metrics such as ROUGE and BLEU. gen_qa supports image inference for models which have multimodal support.", - "metrics": ["all"], - "strategy": "gen_qa", - "subtask_available": False, - "subtasks": None - }, _Benchmark.MMMU: { "modality": "Multi-Modal", "description": "Massive Multidiscipline Multimodal Understanding (MMMU) – College-level benchmark comprising multiple-choice and open-ended questions from 30 disciplines.", @@ -171,14 +161,6 @@ class _Benchmark(str, Enum): "subtask_available": False, "subtasks": None }, - _Benchmark.INFERENCE_ONLY: { - "modality": "Text", - "description": "Lets you supply your own dataset to generate inference responses which can be used with the llm_judge task. No metrics are computed for this task.", - "metrics": ["N/A"], - "strategy": "--", - "subtask_available": False, - "subtasks": None - }, } @@ -278,10 +260,6 @@ class BenchMarkEvaluator(BaseEvaluator): Optional. If not provided, the system will attempt to resolve it using the default MLflow app experience (checks domain match, account default, or creates a new app). Format: arn:aws:sagemaker:region:account:mlflow-tracking-server/name - dataset (Union[str, Any]): Evaluation dataset. Required. Accepts: - - S3 URI (str): e.g., 's3://bucket/path/dataset.jsonl' - - Dataset ARN (str): e.g., 'arn:aws:sagemaker:...:hub-content/AIRegistry/DataSet/...' - - DataSet object: sagemaker.ai_registry.dataset.DataSet instance (ARN inferred automatically) evaluate_base_model (bool): Whether to evaluate the base model in addition to the custom model. Set to False to skip base model evaluation and only evaluate the custom model. Defaults to True (evaluates both models). @@ -309,7 +287,6 @@ class BenchMarkEvaluator(BaseEvaluator): benchmark=Benchmark.MMLU, subtasks=["abstract_algebra", "anatomy", "astronomy"], model="llama3-2-1b-instruct", - dataset="s3://bucket/eval-data.jsonl", s3_output_path="s3://bucket/outputs/", mlflow_resource_arn="arn:aws:sagemaker:us-west-2:123456789012:mlflow-tracking-server/my-server" ) @@ -327,16 +304,8 @@ class BenchMarkEvaluator(BaseEvaluator): _hyperparameters: Optional[Any] = None # Template-required fields - dataset: Union[str, Any] - evaluate_base_model: bool = True - - @validator('dataset', pre=True) - def _resolve_dataset(cls, v): - """Resolve dataset to string (S3 URI or ARN) and validate format. - - Uses BaseEvaluator's common validation logic to avoid code duplication. 
-        """
-        return BaseEvaluator._validate_and_resolve_dataset(v)
+    evaluate_base_model: bool = False
+
 
     @validator('benchmark')
     def _validate_benchmark_model_compatibility(cls, v, values):
@@ -385,7 +354,13 @@ def _validate_subtasks(cls, v, values):
                     f"Subtask list cannot be empty for benchmark '{benchmark.value}'. "
                     f"Provide at least one subtask or use 'ALL'."
                 )
-
+            if len(v) > 1:
+                raise ValueError(
+                    f"Currently only one subtask is supported for benchmark '{benchmark.value}'. "
+                    f"Provide only one subtask or use 'ALL'."
+                )
+
+            # TODO: Support a list of subtasks.
             # Validate each subtask in the list
             for subtask in v:
                 if not isinstance(subtask, str):
@@ -393,7 +368,7 @@ def _validate_subtasks(cls, v, values):
                         f"All subtasks in the list must be strings. "
                         f"Found {type(subtask).__name__}: {subtask}"
                     )
-
+
                 # Validate against available subtasks if defined
                 if config.get("subtasks") and subtask not in config["subtasks"]:
                     raise ValueError(
@@ -527,23 +502,32 @@ def _resolve_subtask_for_evaluation(self, subtask: Optional[Union[str, List[str]
         """
         # Use provided subtask or fall back to constructor subtasks
         eval_subtask = subtask if subtask is not None else self.subtasks
-
+
+        if eval_subtask is None or (isinstance(eval_subtask, str) and eval_subtask.upper() == "ALL"):
+            # TODO: Check "ALL" vs None subtask handling for evaluation
+            return None
+
         # Validate the subtask
         config = _BENCHMARK_CONFIG.get(self.benchmark)
         if config and config.get("subtask_available"):
-            if isinstance(eval_subtask, list):
-                for st in eval_subtask:
-                    if config.get("subtasks") and st not in config["subtasks"] and st.upper() != "ALL":
-                        raise ValueError(
-                            f"Invalid subtask '{st}' for benchmark '{self.benchmark.value}'. "
-                            f"Available subtasks: {', '.join(config['subtasks'])}"
-                        )
-            elif isinstance(eval_subtask, str):
+            if isinstance(eval_subtask, str):
                 if eval_subtask.upper() != "ALL" and config.get("subtasks") and eval_subtask not in config["subtasks"]:
                     raise ValueError(
                         f"Invalid subtask '{eval_subtask}' for benchmark '{self.benchmark.value}'. "
                         f"Available subtasks: {', '.join(config['subtasks'])}"
                     )
+            elif isinstance(eval_subtask, list):
+                if len(eval_subtask) == 0:
+                    raise ValueError(
+                        f"Subtask list cannot be empty for benchmark '{self.benchmark.value}'. "
+                        f"Provide at least one subtask or use 'ALL'."
+                    )
+                if len(eval_subtask) > 1:
+                    raise ValueError(
+                        f"Currently only one subtask is supported for benchmark '{self.benchmark.value}'. "
+                        f"Provide only one subtask or use 'ALL'."
+ ) + return eval_subtask @@ -573,10 +557,12 @@ def _get_benchmark_template_additions(self, eval_subtask: Optional[Union[str, Li 'task': self.benchmark.value, 'strategy': config["strategy"], metric_key: config["metrics"][0] if config.get("metrics") else 'accuracy', - 'subtask': eval_subtask if isinstance(eval_subtask, str) else ','.join(eval_subtask) if eval_subtask else '', 'evaluate_base_model': self.evaluate_base_model, } + if isinstance(eval_subtask, str): + benchmark_context['subtask'] = eval_subtask + # Add all configured hyperparameters for key in configured_params.keys(): benchmark_context[key] = configured_params[key] @@ -604,7 +590,6 @@ def evaluate(self, subtask: Optional[Union[str, List[str]]] = None) -> Evaluatio benchmark=Benchmark.MMLU, subtasks="ALL", model="llama3-2-1b-instruct", - dataset="s3://bucket/data.jsonl", s3_output_path="s3://bucket/outputs/" ) @@ -645,9 +630,7 @@ def evaluate(self, subtask: Optional[Union[str, List[str]]] = None) -> Evaluatio model_package_group_arn=model_package_group_arn, resolved_model_artifact_arn=artifacts['resolved_model_artifact_arn'] ) - - # Add dataset URI - template_context['dataset_uri'] = self.dataset + # Add benchmark-specific template additions benchmark_additions = self._get_benchmark_template_additions(eval_subtask, config) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py index ea5b10b5ed..a50c21ffe3 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py @@ -129,7 +129,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -144,7 +144,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -191,7 +191,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -206,7 +206,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -358,7 +358,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -373,7 +373,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -500,7 +500,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -515,7 +515,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -650,7 +650,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -665,7 +665,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -713,7 +713,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" 
{% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -728,7 +728,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -892,7 +892,7 @@ {% if kms_key_id %}, "KmsKeyId": "{{ kms_key_id }}" {% endif %} - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -907,7 +907,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -1032,7 +1032,7 @@ "ModelPackageConfig": { "ModelPackageGroupArn": "{{ model_package_group_arn }}", "SourceModelPackageArn": "{{ source_model_package_arn }}" - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -1047,7 +1047,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} @@ -1086,7 +1086,7 @@ "ModelPackageConfig": { "ModelPackageGroupArn": "{{ model_package_group_arn }}", "SourceModelPackageArn": "{{ source_model_package_arn }}" - }, + }{% if dataset_uri %}, "InputDataConfig": [ { "ChannelName": "train", @@ -1101,7 +1101,7 @@ } }{% endif %} } - ]{% if vpc_config %}, + ]{% endif %}{% if vpc_config %}, "VpcConfig": { "SecurityGroupIds": {{ vpc_security_group_ids | tojson }}, "Subnets": {{ vpc_subnets | tojson }} diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index ee6fa631ec..b39de3e5e6 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -84,11 +84,11 @@ def test_get_benchmarks_and_properties(self): # Verify it's an enum assert hasattr(Benchmark, "__members__") - # Verify GEN_QA is available - assert hasattr(Benchmark, "GEN_QA") + # Verify MMLU is available + assert hasattr(Benchmark, "MMLU") - # Get properties for GEN_QA benchmark - properties = get_benchmark_properties(benchmark=Benchmark.GEN_QA) + # Get properties for MMLU benchmark + properties = get_benchmark_properties(benchmark=Benchmark.MMLU) # Verify properties structure assert isinstance(properties, dict) @@ -97,14 +97,14 @@ def test_get_benchmarks_and_properties(self): assert "metrics" in properties assert "strategy" in properties - logger.info(f"GEN_QA properties: {properties}") + logger.info(f"MMLU properties: {properties}") def test_benchmark_evaluation_full_flow(self): """ Test complete benchmark evaluation flow with fine-tuned model package. This test mirrors the flow from benchmark_demo.ipynb and covers: - 1. Creating BenchMarkEvaluator with GEN_QA benchmark + 1. Creating BenchMarkEvaluator with MMLU benchmark 2. Accessing hyperparameters 3. Starting evaluation 4. 
Monitoring execution @@ -119,25 +119,23 @@ def test_benchmark_evaluation_full_flow(self): Benchmark = get_benchmarks() # Step 1: Create BenchmarkEvaluator - logger.info("Creating BenchmarkEvaluator with GEN_QA benchmark") + logger.info("Creating BenchmarkEvaluator with MMLU benchmark") # Create evaluator (matching notebook configuration) evaluator = BenchMarkEvaluator( - benchmark=Benchmark.GEN_QA, + benchmark=Benchmark.MMLU, model=TEST_CONFIG["model_package_arn"], s3_output_path=TEST_CONFIG["s3_output_path"], # mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], - dataset=TEST_CONFIG["dataset_s3_uri"], model_package_group=TEST_CONFIG["model_package_group_arn"], base_eval_name="integ-test-gen-qa-eval", ) # Verify evaluator was created assert evaluator is not None - assert evaluator.benchmark == Benchmark.GEN_QA + assert evaluator.benchmark == Benchmark.MMLU assert evaluator.model == TEST_CONFIG["model_package_arn"] - assert evaluator.dataset == TEST_CONFIG["dataset_s3_uri"] - + logger.info(f"Created evaluator: {evaluator.base_eval_name}") # Step 2: Access hyperparameters @@ -247,17 +245,15 @@ def test_benchmark_evaluator_validation(self): model=TEST_CONFIG["model_package_arn"], s3_output_path=TEST_CONFIG["s3_output_path"], # mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], - dataset="s3://bucket/dataset.jsonl", ) # Test invalid MLflow ARN format with pytest.raises(ValueError, match="Invalid MLFlow resource ARN"): BenchMarkEvaluator( - benchmark=Benchmark.GEN_QA, + benchmark=Benchmark.MMLU, model=TEST_CONFIG["model_package_arn"], s3_output_path=TEST_CONFIG["s3_output_path"], mlflow_resource_arn="invalid-arn", - dataset="s3://bucket/dataset.jsonl", ) logger.info("Validation tests passed") @@ -272,20 +268,18 @@ def test_benchmark_subtasks_validation(self): model=TEST_CONFIG["model_package_arn"], s3_output_path=TEST_CONFIG["s3_output_path"], # mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], - dataset="s3://bucket/dataset.jsonl", - subtasks=["abstract_algebra", "anatomy"], + subtasks="abstract_algebra", model_package_group="arn:aws:sagemaker:us-west-2:123456789012:model-package-group/test", ) - assert evaluator.subtasks == ["abstract_algebra", "anatomy"] + assert evaluator.subtasks == "abstract_algebra" # Test invalid subtask for benchmark without subtask support - with pytest.raises(ValueError, match="Subtask is not supported"): + with pytest.raises(ValueError, match="Invalid subtask 'invalid' for benchmark 'mmlu'"): BenchMarkEvaluator( - benchmark=Benchmark.GEN_QA, + benchmark=Benchmark.MMLU, model=TEST_CONFIG["model_package_arn"], s3_output_path=TEST_CONFIG["s3_output_path"], # mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], - dataset="s3://bucket/dataset.jsonl", subtasks=["invalid"], model_package_group="arn:aws:sagemaker:us-west-2:123456789012:model-package-group/test", ) @@ -310,18 +304,17 @@ def test_benchmark_evaluation_base_model_only(self): # Create evaluator with JumpStart model ID (no model package) evaluator = BenchMarkEvaluator( - benchmark=Benchmark.GEN_QA, + benchmark=Benchmark.MMLU, model=BASE_MODEL_ONLY_CONFIG["base_model_id"], s3_output_path=BASE_MODEL_ONLY_CONFIG["s3_output_path"], # mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"], - dataset=BASE_MODEL_ONLY_CONFIG["dataset_s3_uri"], base_eval_name="integ-test-base-model-only", # Note: model_package_group not needed for JumpStart models ) # Verify evaluator was created assert evaluator is not None - assert evaluator.benchmark == Benchmark.GEN_QA + 
assert evaluator.benchmark == Benchmark.MMLU assert evaluator.model == BASE_MODEL_ONLY_CONFIG["base_model_id"] logger.info(f"Created evaluator: {evaluator.base_eval_name}") @@ -364,11 +357,10 @@ def test_benchmark_evaluation_nova_model(self): # Create evaluator with Nova model package evaluator = BenchMarkEvaluator( - benchmark=Benchmark.GEN_QA, + benchmark=Benchmark.MMLU, model=NOVA_CONFIG["model_package_arn"], s3_output_path=NOVA_CONFIG["s3_output_path"], mlflow_resource_arn=NOVA_CONFIG["mlflow_tracking_server_arn"], - dataset=NOVA_CONFIG["dataset_s3_uri"], model_package_group=NOVA_CONFIG["model_package_group_arn"], base_eval_name="integ-test-nova-eval", region=NOVA_CONFIG["region"], @@ -376,7 +368,7 @@ def test_benchmark_evaluation_nova_model(self): # Verify evaluator was created assert evaluator is not None - assert evaluator.benchmark == Benchmark.GEN_QA + assert evaluator.benchmark == Benchmark.MMLU assert evaluator.model == NOVA_CONFIG["model_package_arn"] assert evaluator.region == NOVA_CONFIG["region"] diff --git a/v3-examples/model-customization-examples/benchmark_demo.ipynb b/v3-examples/model-customization-examples/benchmark_demo.ipynb index 81544a6a20..442d93b690 100644 --- a/v3-examples/model-customization-examples/benchmark_demo.ipynb +++ b/v3-examples/model-customization-examples/benchmark_demo.ipynb @@ -49,7 +49,7 @@ "pprint(list(Benchmark))\n", "\n", "# Print properties for a specific benchmark\n", - "pprint(get_benchmark_properties(benchmark=Benchmark.GEN_QA))" + "pprint(get_benchmark_properties(benchmark=Benchmark.MMLU))" ], "outputs": [], "execution_count": null @@ -82,16 +82,15 @@ "source": [ "from sagemaker.train.evaluate import BenchMarkEvaluator\n", "\n", - "# Create evaluator with GEN_QA benchmark\n", + "# Create evaluator with MMLU benchmark\n", "# These values match our successfully tested configuration\n", "evaluator = BenchMarkEvaluator(\n", - " benchmark=Benchmark.GEN_QA,\n", - " model=\"arn:aws:sagemaker:us-west-2:<>:model-package/test-finetuned-models-gamma/28\",\n", - " s3_output_path=\"s3://mufi-test-serverless-smtj/eval/\",\n", - " mlflow_resource_arn=\"arn:aws:sagemaker:us-west-2:<>:mlflow-tracking-server/mmlu-eval-experiment\",\n", - " dataset=\"s3://sagemaker-us-west-2-<>/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl\",\n", - " model_package_group=\"arn:aws:sagemaker:us-west-2:<>:model-package-group/example-name-aovqo\", # Optional inferred from model if model package\n", - " base_eval_name=\"gen-qa-eval-demo\",\n", + " benchmark=Benchmark.MMLU,\n", + " #subtask = \"abstract_algebra\" # or \"all\"\n", + " model=\"arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/2\",\n", + " s3_output_path=\"s3://sagemaker-us-east-1-729646638167/model-customization/eval/\",\n", + " model_package_group=\"arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models\", # Optional inferred from model if model package\n", + " base_eval_name=\"mmlu-eval-demo1\",\n", " # Note: sagemaker_session is optional and will be auto-created if not provided\n", " # Note: region is optional and will be auto deduced using environment variables - SAGEMAKER_REGION, AWS_REGION\n", ")\n", @@ -109,14 +108,13 @@ "\n", "# from sagemaker.train.evaluate import BenchMarkEvaluator\n", "\n", - "# # Create evaluator with GEN_QA benchmark\n", + "# # Create evaluator with MMLU benchmark\n", "# # These values match our successfully tested configuration\n", "# evaluator = BenchMarkEvaluator(\n", - "# 
benchmark=Benchmark.GEN_QA,\n", + "# benchmark=Benchmark.MMLU,\n", "# model=\"meta-textgeneration-llama-3-2-1b-instruct\",\n", "# s3_output_path=\"s3://mufi-test-serverless-smtj/eval/\",\n", "# mlflow_resource_arn=\"arn:aws:sagemaker:us-west-2:<>:mlflow-tracking-server/mmlu-eval-experiment\",\n", - "# dataset=\"s3://sagemaker-us-west-2-<>/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl\",\n", "# # model_package_group=\"arn:aws:sagemaker:us-west-2:<>:model-package-group/example-name-aovqo\", # Optional inferred from model if model package\n", "# base_eval_name=\"gen-qa-eval-demo\",\n", "# # Note: sagemaker_session is optional and will be auto-created if not provided\n", @@ -136,15 +134,14 @@ "\n", "# from sagemaker.train.evaluate import BenchMarkEvaluator\n", "\n", - "# # Create evaluator with GEN_QA benchmark\n", + "# # Create evaluator with MMLU benchmark\n", "# # These values match our successfully tested configuration\n", "# evaluator = BenchMarkEvaluator(\n", - "# benchmark=Benchmark.GEN_QA,\n", + "# benchmark=Benchmark.MMLU,\n", "# # model=\"arn:aws:sagemaker:us-east-1:<>:model-package/bgrv-nova-micro-sft-lora/1\",\n", "# model=\"arn:aws:sagemaker:us-east-1:<>:model-package/test-nova-finetuned-models/3\",\n", "# s3_output_path=\"s3://mufi-test-serverless-iad/eval/\",\n", "# mlflow_resource_arn=\"arn:aws:sagemaker:us-east-1:<>:mlflow-tracking-server/mlflow-prod-server\",\n", - "# dataset=\"s3://sagemaker-us-east-1-<>/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl\",\n", "# model_package_group=\"arn:aws:sagemaker:us-east-1:<>:model-package-group/test-nova-finetuned-models\", # Optional inferred from model if model package\n", "# base_eval_name=\"gen-qa-eval-demo\",\n", "# region=\"us-east-1\",\n", @@ -306,7 +303,7 @@ "run_name/\n", "├── eval_results/\n", "│ ├── results_[timestamp].json\n", - "│ ├── inference_output.jsonl (for gen_qa)\n", + "│ ├── inference_output.jsonl\n", "│ └── details/\n", "│ └── model/\n", "│ └── /\n", @@ -445,7 +442,7 @@ "\n", "**Typical Execution Time:**\n", "- Total: ~10-12 minutes\n", - "- Downloading phase: ~5-7 minutes (model and dataset)\n", + "- Downloading phase: ~5-7 minutes (model)\n", "- Training phase: ~3-5 minutes (running evaluation)\n", "- Lineage steps: ~2-4 seconds each" ]
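For quick reference, a minimal usage sketch of the evaluator API as it looks after this change (benchmark-only construction, no dataset argument, a single subtask or "ALL"). This is not part of the patch: it mirrors the notebook and integration-test cells above, the ARNs and S3 paths are placeholders, and the import of get_benchmarks alongside BenchMarkEvaluator is assumed to match the integration tests.

# Minimal sketch, assuming placeholder ARNs/S3 paths and the import path used by the
# integration tests above.
from sagemaker.train.evaluate import BenchMarkEvaluator, get_benchmarks

Benchmark = get_benchmarks()

evaluator = BenchMarkEvaluator(
    benchmark=Benchmark.MMLU,
    subtasks="abstract_algebra",  # a single subtask, or "ALL" to run every MMLU subtask
    model="arn:aws:sagemaker:<region>:<account>:model-package/<finetuned-models>/<version>",  # placeholder
    s3_output_path="s3://<bucket>/eval/",  # placeholder
    model_package_group="arn:aws:sagemaker:<region>:<account>:model-package-group/<finetuned-models>",  # placeholder
    base_eval_name="mmlu-eval-demo",
)

# evaluate() accepts an optional subtask override; without one it uses the constructor value.
# The returned object is used for monitoring the execution, as shown in the notebook above.
execution = evaluator.evaluate()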