Skip to content

Commit c0e559a

Browse files
authored
Test: Unit test for update training job (#180)
* feat: warm pool * added custom code * test: unit tests * fix: hack for HPO and warmpool * fix: futureproofing * format: reformat one file * fix: made pr changes * added unit test * fix: use conditions instead of requeue error * unit test fixes * Feature: Update TrainingJob * test: added integration test * fix: corrected hook * fix: small typo * fix: corrected spelling/grammar errors * fix: handle invalid update * refactor: changed comments/terminal condition * Feature: Update TrainingJob * preliminary changes * final test suite * fix: corrected grammar mistake * test: added another test and enhanced descriptions. * updated unit test * [test] corrected rebase errors * . * fix/test: added nil check and new unit test * Feature: Update TrainingJob * test: added integration test * fix: corrected hook * fix: small typo * fix: corrected spelling/grammar errors * fix: handle invalid update * refactor: changed comments/terminal condition * updated test * test: added more cases to improve coverage * updates comments and removed unnecessary nil check * fix: do not remove profiler * regenerated controller * regenerating controller * test: unit test change * pr changes * unit test and general code changes * updated unit tests * updated test * changed delta * removed redundant nil check * refactor: changed variable names * updated update input logic * minor refactor * updated unit test * changed function to method * Using requeue on success instead of resource synced * modified unit tests * fix: setting the status of desired * updating unit tests * fixed merge issues * corrected spelling * updated descriptions * changed error strings * changed error strings * control flow modifications * logic improvements and clarifying comments * new test structure * testfix * another fix * revert test modification * modified terminal conditions * updated unit test * changed error messages and removed redundant check * taking out hpo custom code * eliminate api call
1 parent bafd861 commit c0e559a

36 files changed

+3049
-120
lines changed

pkg/resource/training_job/manager_test_suite_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ func (d *testRunnerDelegate) EmptyServiceAPIOutput(apiName string) (interface{},
9494
case "StopTrainingJobWithContext":
9595
var output svcsdk.StopTrainingJobOutput
9696
return &output, nil
97+
case "UpdateTrainingJobWithContext":
98+
var output svcsdk.UpdateTrainingJobOutput
99+
return &output, nil
97100
}
98101
return nil, errors.New(fmt.Sprintf("no matching API name found for: %s", apiName))
99102
}
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
{
2+
"AlgorithmSpecification": {
3+
"AlgorithmName": null,
4+
"EnableSageMakerMetricsTimeSeries": false,
5+
"MetricDefinitions": [
6+
{
7+
"Name": "train:mae",
8+
"Regex": ".*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
9+
},
10+
{
11+
"Name": "validation:auc",
12+
"Regex": ".*\\[[0-9]+\\].*#011validation-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
13+
},
14+
{
15+
"Name": "train:merror",
16+
"Regex": ".*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
17+
},
18+
{
19+
"Name": "train:auc",
20+
"Regex": ".*\\[[0-9]+\\].*#011train-auc:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
21+
},
22+
{
23+
"Name": "validation:mae",
24+
"Regex": ".*\\[[0-9]+\\].*#011validation-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
25+
},
26+
{
27+
"Name": "validation:error",
28+
"Regex": ".*\\[[0-9]+\\].*#011validation-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
29+
},
30+
{
31+
"Name": "validation:merror",
32+
"Regex": ".*\\[[0-9]+\\].*#011validation-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
33+
},
34+
{
35+
"Name": "validation:logloss",
36+
"Regex": ".*\\[[0-9]+\\].*#011validation-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
37+
},
38+
{
39+
"Name": "train:rmse",
40+
"Regex": ".*\\[[0-9]+\\].*#011train-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
41+
},
42+
{
43+
"Name": "train:logloss",
44+
"Regex": ".*\\[[0-9]+\\].*#011train-logloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
45+
},
46+
{
47+
"Name": "train:mlogloss",
48+
"Regex": ".*\\[[0-9]+\\].*#011train-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
49+
},
50+
{
51+
"Name": "validation:rmse",
52+
"Regex": ".*\\[[0-9]+\\].*#011validation-rmse:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
53+
},
54+
{
55+
"Name": "validation:ndcg",
56+
"Regex": ".*\\[[0-9]+\\].*#011validation-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
57+
},
58+
{
59+
"Name": "train:error",
60+
"Regex": ".*\\[[0-9]+\\].*#011train-error:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
61+
},
62+
{
63+
"Name": "validation:mlogloss",
64+
"Regex": ".*\\[[0-9]+\\].*#011validation-mlogloss:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
65+
},
66+
{
67+
"Name": "train:ndcg",
68+
"Regex": ".*\\[[0-9]+\\].*#011train-ndcg:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
69+
},
70+
{
71+
"Name": "train:map",
72+
"Regex": ".*\\[[0-9]+\\].*#011train-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
73+
},
74+
{
75+
"Name": "validation:map",
76+
"Regex": ".*\\[[0-9]+\\].*#011validation-map:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*"
77+
}
78+
],
79+
"TrainingImage": "433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:1",
80+
"TrainingInputMode": "File"
81+
},
82+
"AutoMLJobArn": null,
83+
"BillableTimeInSeconds": null,
84+
"CheckpointConfig": null,
85+
"CreationTime": "2021-10-13T03:49:20.337Z",
86+
"DebugHookConfig": null,
87+
"DebugRuleConfigurations": null,
88+
"DebugRuleEvaluationStatuses": null,
89+
"EnableInterContainerTrafficEncryption": false,
90+
"EnableManagedSpotTraining": false,
91+
"EnableNetworkIsolation": false,
92+
"Environment": null,
93+
"ExperimentConfig": null,
94+
"FailureReason": null,
95+
"FinalMetricDataList": null,
96+
"HyperParameters": {
97+
"eta": "0.2",
98+
"gamma": "4",
99+
"max_depth": "5",
100+
"min_child_weight": "6",
101+
"num_round": "51",
102+
"objective": "reg:squarederror",
103+
"silent": "0",
104+
"subsample": "0.7"
105+
},
106+
"InputDataConfig": [
107+
{
108+
"ChannelName": "train",
109+
"CompressionType": "None",
110+
"ContentType": "text/csv",
111+
"DataSource": {
112+
"FileSystemDataSource": null,
113+
"S3DataSource": {
114+
"AttributeNames": null,
115+
"S3DataDistributionType": "FullyReplicated",
116+
"S3DataType": "S3Prefix",
117+
"S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/train"
118+
}
119+
},
120+
"InputMode": null,
121+
"RecordWrapperType": "None",
122+
"ShuffleConfig": null
123+
},
124+
{
125+
"ChannelName": "validation",
126+
"CompressionType": "None",
127+
"ContentType": "text/csv",
128+
"DataSource": {
129+
"FileSystemDataSource": null,
130+
"S3DataSource": {
131+
"AttributeNames": null,
132+
"S3DataDistributionType": "FullyReplicated",
133+
"S3DataType": "S3Prefix",
134+
"S3Uri": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/validation"
135+
}
136+
},
137+
"InputMode": null,
138+
"RecordWrapperType": "None",
139+
"ShuffleConfig": null
140+
}
141+
],
142+
"LabelingJobArn": null,
143+
"LastModifiedTime": "2021-10-13T03:49:20.576Z",
144+
"ModelArtifacts": null,
145+
"OutputDataConfig": {
146+
"KmsKeyId": "",
147+
"S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/debugger/output"
148+
},
149+
"ProfilerConfig": {
150+
"ProfilingIntervalInMilliseconds": 200,
151+
"ProfilingParameters": null,
152+
"S3OutputPath": "s3://source-data-bucket-592697580195-us-west-2/sagemaker/training/profiler/"
153+
},
154+
"ProfilerRuleConfigurations": [
155+
{
156+
"InstanceType": null,
157+
"LocalPath": null,
158+
"RuleConfigurationName": "ProfilerReport",
159+
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1",
160+
"RuleParameters": {
161+
"rule_to_invoke": "ProfilerReport"
162+
},
163+
"S3OutputPath": null,
164+
"VolumeSizeInGB": 0
165+
},
166+
{
167+
"InstanceType": null,
168+
"LocalPath": null,
169+
"RuleConfigurationName": "CPUBottleneck",
170+
"RuleEvaluatorImage": "895741380848.dkr.ecr.us-west-2.amazonaws.com/xgboost:1",
171+
"RuleParameters": {
172+
"rule_to_invoke": "CPUBottleneck"
173+
},
174+
"S3OutputPath": null,
175+
"VolumeSizeInGB": 0
176+
}
177+
],
178+
"ProfilerRuleEvaluationStatuses": [
179+
{
180+
"LastModifiedTime": "2021-10-13T03:49:20.337Z",
181+
"RuleConfigurationName": "ProfilerReport",
182+
"RuleEvaluationJobArn": null,
183+
"RuleEvaluationStatus": "InProgress",
184+
"StatusDetails": null
185+
},
186+
{
187+
"LastModifiedTime": "2021-10-13T03:49:20.337Z",
188+
"RuleConfigurationName": "CPUBottleneck",
189+
"RuleEvaluationJobArn": null,
190+
"RuleEvaluationStatus": "InProgress",
191+
"StatusDetails": null
192+
}
193+
],
194+
"ProfilingStatus": "Enabled",
195+
"ResourceConfig": {
196+
"InstanceCount": 1,
197+
"InstanceType": "ml.m4.xlarge",
198+
"VolumeKmsKeyId": null,
199+
"VolumeSizeInGB": 5
200+
},
201+
"RoleArn": "arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-20210920T111639",
202+
"SecondaryStatus": "Downloading",
203+
"SecondaryStatusTransitions": [
204+
{
205+
"EndTime": null,
206+
"StartTime": "2021-10-13T03:49:20.337Z",
207+
"Status": "Starting",
208+
"StatusMessage": "Starting the training job"
209+
}
210+
],
211+
"StoppingCondition": {
212+
"MaxRuntimeInSeconds": 86400,
213+
"MaxWaitTimeInSeconds": null
214+
},
215+
"TensorBoardOutputConfig": null,
216+
"TrainingEndTime": null,
217+
"TrainingJobArn": "arn:aws:sagemaker:us-west-2:123456789012:training-job/training-test",
218+
"TrainingJobName": "training-test",
219+
"TrainingJobStatus": "InProgress",
220+
"TrainingStartTime": null,
221+
"TrainingTimeInSeconds": null,
222+
"TuningJobArn": null,
223+
"VpcConfig": null
224+
}

0 commit comments

Comments
 (0)