Skip to content

Commit 598626b

Browse files
AWS Glue Data Quality now provides aggregated metrics in evaluation results when publishAggregatedMetrics is enabled together with row-level results. These metrics include summary statistics showing total counts of processed, passed, and failed rows and rules in a single view.
1 parent f15278e commit 598626b

13 files changed

+468
-16
lines changed

generator/ServiceModels/glue/glue-2017-03-31.api.json

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7291,6 +7291,17 @@
72917291
"type":"list",
72927292
"member":{"shape":"DataOperation"}
72937293
},
7294+
"DataQualityAggregatedMetrics":{
7295+
"type":"structure",
7296+
"members":{
7297+
"TotalRowsProcessed":{"shape":"NullableDouble"},
7298+
"TotalRowsPassed":{"shape":"NullableDouble"},
7299+
"TotalRowsFailed":{"shape":"NullableDouble"},
7300+
"TotalRulesProcessed":{"shape":"NullableDouble"},
7301+
"TotalRulesPassed":{"shape":"NullableDouble"},
7302+
"TotalRulesFailed":{"shape":"NullableDouble"}
7303+
}
7304+
},
72947305
"DataQualityAnalyzerResult":{
72957306
"type":"structure",
72967307
"members":{
@@ -7381,7 +7392,8 @@
73817392
"RulesetEvaluationRunId":{"shape":"HashString"},
73827393
"RuleResults":{"shape":"DataQualityRuleResults"},
73837394
"AnalyzerResults":{"shape":"DataQualityAnalyzerResults"},
7384-
"Observations":{"shape":"DataQualityObservations"}
7395+
"Observations":{"shape":"DataQualityObservations"},
7396+
"AggregatedMetrics":{"shape":"DataQualityAggregatedMetrics"}
73857397
}
73867398
},
73877399
"DataQualityResultDescription":{
@@ -7454,7 +7466,8 @@
74547466
"EvaluationMessage":{"shape":"DataQualityRuleResultDescription"},
74557467
"Result":{"shape":"DataQualityRuleResultStatus"},
74567468
"EvaluatedMetrics":{"shape":"EvaluatedMetricsMap"},
7457-
"EvaluatedRule":{"shape":"DataQualityRuleResultDescription"}
7469+
"EvaluatedRule":{"shape":"DataQualityRuleResultDescription"},
7470+
"RuleMetrics":{"shape":"RuleMetricsMap"}
74587471
}
74597472
},
74607473
"DataQualityRuleResultDescription":{
@@ -9374,7 +9387,8 @@
93749387
"RulesetEvaluationRunId":{"shape":"HashString"},
93759388
"RuleResults":{"shape":"DataQualityRuleResults"},
93769389
"AnalyzerResults":{"shape":"DataQualityAnalyzerResults"},
9377-
"Observations":{"shape":"DataQualityObservations"}
9390+
"Observations":{"shape":"DataQualityObservations"},
9391+
"AggregatedMetrics":{"shape":"DataQualityAggregatedMetrics"}
93789392
}
93799393
},
93809394
"GetDataQualityRuleRecommendationRunRequest":{
@@ -13519,6 +13533,12 @@
1351913533
},
1352013534
"RoleString":{"type":"string"},
1352113535
"RowTag":{"type":"string"},
13536+
"RuleMetricsMap":{
13537+
"type":"map",
13538+
"key":{"shape":"NameString"},
13539+
"value":{"shape":"NullableDouble"},
13540+
"sensitive":true
13541+
},
1352213542
"RulesetNames":{
1352313543
"type":"list",
1352413544
"member":{"shape":"NameString"},

generator/ServiceModels/glue/glue-2017-03-31.docs.json

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2211,7 +2211,7 @@
22112211
"refs": {}
22122212
},
22132213
"CreateDataQualityRulesetRequest": {
2214-
"base": null,
2214+
"base": "<p>A request to create a data quality ruleset.</p>",
22152215
"refs": {}
22162216
},
22172217
"CreateDataQualityRulesetResponse": {
@@ -2654,6 +2654,13 @@
26542654
"Property$DataOperationScopes": "<p>Indicates which data operations are applicable to the property.</p>"
26552655
}
26562656
},
2657+
"DataQualityAggregatedMetrics": {
2658+
"base": "<p>A summary of metrics showing the total counts of processed rows and rules, including their pass/fail statistics based on row-level results.</p>",
2659+
"refs": {
2660+
"DataQualityResult$AggregatedMetrics": "<p>A <code>DataQualityAggregatedMetrics</code> object summarizing the total counts of processed rows and rules, including their pass/fail statistics based on row-level results.</p>",
2661+
"GetDataQualityResultResponse$AggregatedMetrics": "<p>A <code>DataQualityAggregatedMetrics</code> object summarizing the total counts of processed rows and rules, including their pass/fail statistics based on row-level results.</p>"
2662+
}
2663+
},
26572664
"DataQualityAnalyzerResult": {
26582665
"base": "<p>Describes the result of the evaluation of a data quality analyzer.</p>",
26592666
"refs": {
@@ -4498,15 +4505,15 @@
44984505
"refs": {}
44994506
},
45004507
"GetDataQualityResultResponse": {
4501-
"base": null,
4508+
"base": "<p>The response for the data quality result.</p>",
45024509
"refs": {}
45034510
},
45044511
"GetDataQualityRuleRecommendationRunRequest": {
45054512
"base": null,
45064513
"refs": {}
45074514
},
45084515
"GetDataQualityRuleRecommendationRunResponse": {
4509-
"base": null,
4516+
"base": "<p>The response for the Data Quality rule recommendation run.</p>",
45104517
"refs": {}
45114518
},
45124519
"GetDataQualityRulesetEvaluationRunRequest": {
@@ -4522,7 +4529,7 @@
45224529
"refs": {}
45234530
},
45244531
"GetDataQualityRulesetResponse": {
4525-
"base": null,
4532+
"base": "<p>The response for the data quality ruleset.</p>",
45264533
"refs": {}
45274534
},
45284535
"GetDatabaseRequest": {
@@ -6856,6 +6863,7 @@
68566863
"PutWorkflowRunPropertiesRequest$Name": "<p>Name of the workflow which was run.</p>",
68576864
"ReferenceDatasetsList$member": null,
68586865
"ResumeWorkflowRunRequest$Name": "<p>The name of the workflow to resume.</p>",
6866+
"RuleMetricsMap$key": null,
68596867
"RulesetNames$member": null,
68606868
"RunStatementRequest$SessionId": "<p>The Session Id of the statement to be run.</p>",
68616869
"SecurityConfiguration$Name": "<p>The name of the security configuration.</p>",
@@ -7261,6 +7269,12 @@
72617269
"CreateJobRequest$MaxCapacity": "<p>For Glue version 1.0 or earlier jobs, using the standard worker type, the number of Glue data processing units (DPUs) that can be allocated when this job runs. A DPU is a relative measure of processing power that consists of 4 vCPUs of compute capacity and 16 GB of memory. For more information, see the <a href=\"https://aws.amazon.com/glue/pricing/\"> Glue pricing page</a>.</p> <p>For Glue version 2.0+ jobs, you cannot specify a <code>Maximum capacity</code>. Instead, you should specify a <code>Worker type</code> and the <code>Number of workers</code>.</p> <p>Do not set <code>MaxCapacity</code> if using <code>WorkerType</code> and <code>NumberOfWorkers</code>.</p> <p>The value that can be allocated for <code>MaxCapacity</code> depends on whether you are running a Python shell job, an Apache Spark ETL job, or an Apache Spark streaming ETL job:</p> <ul> <li> <p>When you specify a Python shell job (<code>JobCommand.Name</code>=\"pythonshell\"), you can allocate either 0.0625 or 1 DPU. The default is 0.0625 DPU.</p> </li> <li> <p>When you specify an Apache Spark ETL job (<code>JobCommand.Name</code>=\"glueetl\") or Apache Spark streaming ETL job (<code>JobCommand.Name</code>=\"gluestreaming\"), you can allocate from 2 to 100 DPUs. The default is 10 DPUs. This job type cannot have a fractional DPU allocation.</p> </li> </ul>",
72627270
"CreateMLTransformRequest$MaxCapacity": "<p>The number of Glue data processing units (DPUs) that are allocated to task runs for this transform. You can allocate from 2 to 100 DPUs; the default is 10. A DPU is a relative measure of processing power that consists of 4 vCPUs of compute capacity and 16 GB of memory. For more information, see the <a href=\"https://aws.amazon.com/glue/pricing/\">Glue pricing page</a>. </p> <p> <code>MaxCapacity</code> is a mutually exclusive option with <code>NumberOfWorkers</code> and <code>WorkerType</code>.</p> <ul> <li> <p>If either <code>NumberOfWorkers</code> or <code>WorkerType</code> is set, then <code>MaxCapacity</code> cannot be set.</p> </li> <li> <p>If <code>MaxCapacity</code> is set then neither <code>NumberOfWorkers</code> nor <code>WorkerType</code> can be set.</p> </li> <li> <p>If <code>WorkerType</code> is set, then <code>NumberOfWorkers</code> is required (and vice versa).</p> </li> <li> <p> <code>MaxCapacity</code> and <code>NumberOfWorkers</code> must both be at least 1.</p> </li> </ul> <p>When the <code>WorkerType</code> field is set to a value other than <code>Standard</code>, the <code>MaxCapacity</code> field is set automatically and becomes read-only.</p>",
72637271
"CreateSessionRequest$MaxCapacity": "<p>The number of Glue data processing units (DPUs) that can be allocated when the job runs. A DPU is a relative measure of processing power that consists of 4 vCPUs of compute capacity and 16 GB memory. </p>",
7272+
"DataQualityAggregatedMetrics$TotalRowsProcessed": "<p>The total number of rows that were processed during the data quality evaluation.</p>",
7273+
"DataQualityAggregatedMetrics$TotalRowsPassed": "<p>The total number of rows that passed all applicable data quality rules.</p>",
7274+
"DataQualityAggregatedMetrics$TotalRowsFailed": "<p>The total number of rows that failed one or more data quality rules.</p>",
7275+
"DataQualityAggregatedMetrics$TotalRulesProcessed": "<p>The total number of data quality rules that were evaluated.</p>",
7276+
"DataQualityAggregatedMetrics$TotalRulesPassed": "<p>The total number of data quality rules that passed their evaluation criteria.</p>",
7277+
"DataQualityAggregatedMetrics$TotalRulesFailed": "<p>The total number of data quality rules that failed their evaluation criteria.</p>",
72647278
"DataQualityMetricValues$ActualValue": "<p>The actual value of the data quality metric.</p>",
72657279
"DataQualityMetricValues$ExpectedValue": "<p>The expected value of the data quality metric according to the analysis of historical data.</p>",
72667280
"DataQualityMetricValues$LowerLimit": "<p>The lower limit of the data quality metric value according to the analysis of historical data.</p>",
@@ -7273,6 +7287,7 @@
72737287
"JobRun$DPUSeconds": "<p>This field can be set for either job runs with execution class <code>FLEX</code> or when Auto Scaling is enabled, and represents the total time each executor ran during the lifecycle of a job run in seconds, multiplied by a DPU factor (1 for <code>G.1X</code>, 2 for <code>G.2X</code>, or 0.25 for <code>G.025X</code> workers). This value may be different than the <code>executionEngineRuntime</code> * <code>MaxCapacity</code> as in the case of Auto Scaling jobs, as the number of executors running at a given time may be less than the <code>MaxCapacity</code>. Therefore, it is possible that the value of <code>DPUSeconds</code> is less than <code>executionEngineRuntime</code> * <code>MaxCapacity</code>.</p>",
72747288
"JobUpdate$MaxCapacity": "<p>For Glue version 1.0 or earlier jobs, using the standard worker type, the number of Glue data processing units (DPUs) that can be allocated when this job runs. A DPU is a relative measure of processing power that consists of 4 vCPUs of compute capacity and 16 GB of memory. For more information, see the <a href=\"https://aws.amazon.com/glue/pricing/\"> Glue pricing page</a>.</p> <p>For Glue version 2.0+ jobs, you cannot specify a <code>Maximum capacity</code>. Instead, you should specify a <code>Worker type</code> and the <code>Number of workers</code>.</p> <p>Do not set <code>MaxCapacity</code> if using <code>WorkerType</code> and <code>NumberOfWorkers</code>.</p> <p>The value that can be allocated for <code>MaxCapacity</code> depends on whether you are running a Python shell job, an Apache Spark ETL job, or an Apache Spark streaming ETL job:</p> <ul> <li> <p>When you specify a Python shell job (<code>JobCommand.Name</code>=\"pythonshell\"), you can allocate either 0.0625 or 1 DPU. The default is 0.0625 DPU.</p> </li> <li> <p>When you specify an Apache Spark ETL job (<code>JobCommand.Name</code>=\"glueetl\") or Apache Spark streaming ETL job (<code>JobCommand.Name</code>=\"gluestreaming\"), you can allocate from 2 to 100 DPUs. The default is 10 DPUs. This job type cannot have a fractional DPU allocation.</p> </li> </ul>",
72757289
"MLTransform$MaxCapacity": "<p>The number of Glue data processing units (DPUs) that are allocated to task runs for this transform. You can allocate from 2 to 100 DPUs; the default is 10. A DPU is a relative measure of processing power that consists of 4 vCPUs of compute capacity and 16 GB of memory. For more information, see the <a href=\"https://aws.amazon.com/glue/pricing/\">Glue pricing page</a>. </p> <p> <code>MaxCapacity</code> is a mutually exclusive option with <code>NumberOfWorkers</code> and <code>WorkerType</code>.</p> <ul> <li> <p>If either <code>NumberOfWorkers</code> or <code>WorkerType</code> is set, then <code>MaxCapacity</code> cannot be set.</p> </li> <li> <p>If <code>MaxCapacity</code> is set then neither <code>NumberOfWorkers</code> nor <code>WorkerType</code> can be set.</p> </li> <li> <p>If <code>WorkerType</code> is set, then <code>NumberOfWorkers</code> is required (and vice versa).</p> </li> <li> <p> <code>MaxCapacity</code> and <code>NumberOfWorkers</code> must both be at least 1.</p> </li> </ul> <p>When the <code>WorkerType</code> field is set to a value other than <code>Standard</code>, the <code>MaxCapacity</code> field is set automatically and becomes read-only.</p>",
7290+
"RuleMetricsMap$value": null,
72767291
"Session$MaxCapacity": "<p>The number of Glue data processing units (DPUs) that can be allocated when the job runs. A DPU is a relative measure of processing power that consists of 4 vCPUs of compute capacity and 16 GB memory. </p>",
72777292
"Session$ExecutionTime": "<p>The total time the session ran for.</p>",
72787293
"Session$DPUSeconds": "<p>The DPUs consumed by the session (formula: ExecutionTime * MaxCapacity).</p>",
@@ -8484,6 +8499,12 @@
84848499
"XMLClassifier$RowTag": "<p>The XML tag designating the element that contains each record in an XML document being parsed. This can't identify a self-closing element (closed by <code>/&gt;</code>). An empty row element that contains only attributes can be parsed as long as it ends with a closing tag (for example, <code>&lt;row item_a=\"A\" item_b=\"B\"&gt;&lt;/row&gt;</code> is okay, but <code>&lt;row item_a=\"A\" item_b=\"B\" /&gt;</code> is not).</p>"
84858500
}
84868501
},
8502+
"RuleMetricsMap": {
8503+
"base": null,
8504+
"refs": {
8505+
"DataQualityRuleResult$RuleMetrics": "<p>A map containing metrics associated with the evaluation of the rule based on row-level results. </p>"
8506+
}
8507+
},
84878508
"RulesetNames": {
84888509
"base": null,
84898510
"refs": {
@@ -9246,7 +9267,7 @@
92469267
"refs": {}
92479268
},
92489269
"StartDataQualityRuleRecommendationRunRequest": {
9249-
"base": null,
9270+
"base": "<p>The request for a Data Quality rule recommendation run.</p>",
92509271
"refs": {}
92519272
},
92529273
"StartDataQualityRuleRecommendationRunResponse": {

generator/ServiceModels/glue/glue-2017-03-31.normal.json

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8788,7 +8788,8 @@
87888788
"shape":"HashString",
87898789
"documentation":"<p>Used for idempotency and is recommended to be set to a random ID (such as a UUID) to avoid creating or starting multiple instances of the same resource.</p>"
87908790
}
8791-
}
8791+
},
8792+
"documentation":"<p>A request to create a data quality ruleset.</p>"
87928793
},
87938794
"CreateDataQualityRulesetResponse":{
87948795
"type":"structure",
@@ -10343,6 +10344,36 @@
1034310344
"type":"list",
1034410345
"member":{"shape":"DataOperation"}
1034510346
},
10347+
"DataQualityAggregatedMetrics":{
10348+
"type":"structure",
10349+
"members":{
10350+
"TotalRowsProcessed":{
10351+
"shape":"NullableDouble",
10352+
"documentation":"<p>The total number of rows that were processed during the data quality evaluation.</p>"
10353+
},
10354+
"TotalRowsPassed":{
10355+
"shape":"NullableDouble",
10356+
"documentation":"<p>The total number of rows that passed all applicable data quality rules.</p>"
10357+
},
10358+
"TotalRowsFailed":{
10359+
"shape":"NullableDouble",
10360+
"documentation":"<p>The total number of rows that failed one or more data quality rules.</p>"
10361+
},
10362+
"TotalRulesProcessed":{
10363+
"shape":"NullableDouble",
10364+
"documentation":"<p>The total number of data quality rules that were evaluated.</p>"
10365+
},
10366+
"TotalRulesPassed":{
10367+
"shape":"NullableDouble",
10368+
"documentation":"<p>The total number of data quality rules that passed their evaluation criteria.</p>"
10369+
},
10370+
"TotalRulesFailed":{
10371+
"shape":"NullableDouble",
10372+
"documentation":"<p>The total number of data quality rules that failed their evaluation criteria.</p>"
10373+
}
10374+
},
10375+
"documentation":"<p>A summary of metrics showing the total counts of processed rows and rules, including their pass/fail statistics based on row-level results.</p>"
10376+
},
1034610377
"DataQualityAnalyzerResult":{
1034710378
"type":"structure",
1034810379
"members":{
@@ -10525,6 +10556,10 @@
1052510556
"Observations":{
1052610557
"shape":"DataQualityObservations",
1052710558
"documentation":"<p>A list of <code>DataQualityObservation</code> objects representing the observations generated after evaluating the rules and analyzers. </p>"
10559+
},
10560+
"AggregatedMetrics":{
10561+
"shape":"DataQualityAggregatedMetrics",
10562+
"documentation":"<p> A summary of <code>DataQualityAggregatedMetrics</code> objects showing the total counts of processed rows and rules, including their pass/fail statistics based on row-level results. </p>"
1052810563
}
1052910564
},
1053010565
"documentation":"<p>Describes a data quality result.</p>"
@@ -10672,6 +10707,10 @@
1067210707
"EvaluatedRule":{
1067310708
"shape":"DataQualityRuleResultDescription",
1067410709
"documentation":"<p>The evaluated rule.</p>"
10710+
},
10711+
"RuleMetrics":{
10712+
"shape":"RuleMetricsMap",
10713+
"documentation":"<p>A map containing metrics associated with the evaluation of the rule based on row-level results. </p>"
1067510714
}
1067610715
},
1067710716
"documentation":"<p>Describes the result of the evaluation of a data quality rule.</p>"
@@ -14043,8 +14082,13 @@
1404314082
"Observations":{
1404414083
"shape":"DataQualityObservations",
1404514084
"documentation":"<p>A list of <code>DataQualityObservation</code> objects representing the observations generated after evaluating the rules and analyzers. </p>"
14085+
},
14086+
"AggregatedMetrics":{
14087+
"shape":"DataQualityAggregatedMetrics",
14088+
"documentation":"<p> A summary of <code>DataQualityAggregatedMetrics</code> objects showing the total counts of processed rows and rules, including their pass/fail statistics based on row-level results. </p>"
1404614089
}
14047-
}
14090+
},
14091+
"documentation":"<p>The response for the data quality result.</p>"
1404814092
},
1404914093
"GetDataQualityRuleRecommendationRunRequest":{
1405014094
"type":"structure",
@@ -14115,7 +14159,8 @@
1411514159
"shape":"NameString",
1411614160
"documentation":"<p>The name of the security configuration created with the data quality encryption option.</p>"
1411714161
}
14118-
}
14162+
},
14163+
"documentation":"<p>The response for the Data Quality rule recommendation run.</p>"
1411914164
},
1412014165
"GetDataQualityRulesetEvaluationRunRequest":{
1412114166
"type":"structure",
@@ -14237,7 +14282,8 @@
1423714282
"shape":"NameString",
1423814283
"documentation":"<p>The name of the security configuration created with the data quality encryption option.</p>"
1423914284
}
14240-
}
14285+
},
14286+
"documentation":"<p>The response for the data quality ruleset.</p>"
1424114287
},
1424214288
"GetDatabaseRequest":{
1424314289
"type":"structure",
@@ -21632,6 +21678,12 @@
2163221678
},
2163321679
"RoleString":{"type":"string"},
2163421680
"RowTag":{"type":"string"},
21681+
"RuleMetricsMap":{
21682+
"type":"map",
21683+
"key":{"shape":"NameString"},
21684+
"value":{"shape":"NullableDouble"},
21685+
"sensitive":true
21686+
},
2163521687
"RulesetNames":{
2163621688
"type":"list",
2163721689
"member":{"shape":"NameString"},
@@ -23838,7 +23890,8 @@
2383823890
"shape":"HashString",
2383923891
"documentation":"<p>Used for idempotency and is recommended to be set to a random ID (such as a UUID) to avoid creating or starting multiple instances of the same resource.</p>"
2384023892
}
23841-
}
23893+
},
23894+
"documentation":"<p>The request for a Data Quality rule recommendation run.</p>"
2384223895
},
2384323896
"StartDataQualityRuleRecommendationRunResponse":{
2384423897
"type":"structure",

0 commit comments

Comments
 (0)