Skip to content

Commit d78d379

Browse files
authored
Merge pull request #11 from Eppo-exp/update-window-definitions
updated schema to use new timeframe parameters
2 parents bd3df5a + 3057e53 commit d78d379

File tree

4 files changed

+74
-63
lines changed

4 files changed

+74
-63
lines changed

eppo_metrics_sync/eppo_metrics_sync.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def read_yaml_files(self):
8282
self.load_eppo_yaml(yaml_path)
8383
else:
8484
self.validation_errors.append(
85-
f"Schema violation in {yaml_path}: \n{valid.error_message}"
85+
f"Schema violation in {yaml_path}: \n{valid['error_message']}"
8686
)
8787

8888
elif self.schema_type == 'dbt-model':

eppo_metrics_sync/schema/eppo_metric_schema.json

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,17 @@
216216
}
217217
}
218218
},
219-
"aggregation_timeframe_value": {
220-
"description": "How many timeframe units since assignment to include (optional)",
221-
"value": "number"
219+
"aggregation_timeframe_start_value": {
220+
"description": "The start of the timeframe window defined in number of timeframe units following assignment (optional)",
221+
"type": "number"
222+
},
223+
"aggregation_timeframe_end_value": {
224+
"description": "The end of the timeframe window defined in number of timeframe units following assignment (optional)",
225+
"type": "number"
222226
},
223227
"aggregation_timeframe_unit": {
224228
"description": "What time unit to use: minutes, hours, days, weeks, or calendar_days (optional)",
225-
"enum": ["minutes", "hours", "days", "weeks"]
229+
"enum": ["minutes", "hours", "days", "weeks", "calendar_days"]
226230
},
227231
"winsorization_lower_percentile": {
228232
"description": "Percentile at which to clip aggregated metrics (optional)",
@@ -274,13 +278,17 @@
274278
}
275279
}
276280
},
277-
"aggregation_timeframe_value": {
278-
"description": "How many timeframe units since assignment to include (optional)",
281+
"aggregation_timeframe_start_value": {
282+
"description": "The start of the timeframe window defined in number of timeframe units following assignment (optional)",
283+
"type": "number"
284+
},
285+
"aggregation_timeframe_end_value": {
286+
"description": "The end of the timeframe window defined in number of timeframe units following assignment (optional)",
279287
"type": "number"
280288
},
281289
"aggregation_timeframe_unit": {
282290
"description": "What time unit to use: minutes, hours, days, weeks, or calendar_days (optional)",
283-
"enum": ["minutes", "hours", "days", "weeks"]
291+
"enum": ["minutes", "hours", "days", "weeks", "calendar_days"]
284292
},
285293
"winsorization_lower_percentile": {
286294
"description": "Percentile at which to clip aggregated metrics (optional)",

eppo_metrics_sync/validation.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
]
1313

1414
timeframe_parameters = [
15-
'aggregation_timeframe_value',
15+
'aggregation_timeframe_start_value',
16+
'aggregation_timeframe_end_value',
1617
'aggregation_timeframe_unit'
1718
]
1819

@@ -27,15 +28,14 @@ def check_for_duplicated_names(payload, names, object_name):
2728

2829

2930
def unique_names(payload):
30-
3131
fact_source_names = []
3232
fact_names = []
3333
fact_property_names = []
3434

3535
for fact_source in payload.fact_sources:
3636
fact_source_names.append(fact_source['name'])
3737
fact_names.extend([f['name'] for f in fact_source['facts']])
38-
if('properties' in fact_source):
38+
if ('properties' in fact_source):
3939
fact_property_names.extend(
4040
[f['name'] for f in fact_source['properties']]
4141
)
@@ -45,14 +45,13 @@ def unique_names(payload):
4545
check_for_duplicated_names(payload, fact_source_names, 'Fact source')
4646
check_for_duplicated_names(payload, fact_names, 'Fact')
4747
# TODO: check for distinct names within a given fact source
48-
#check_for_duplicated_names(payload, fact_property_names, 'Fact property')
48+
# check_for_duplicated_names(payload, fact_property_names, 'Fact property')
4949
check_for_duplicated_names(payload, metric_names, 'Metric')
50-
50+
5151
return True
5252

5353

5454
def valid_fact_references(payload):
55-
5655
fact_references = set()
5756
for metric in payload.metrics:
5857
fact_references.add(metric['numerator']['fact_name'])
@@ -66,7 +65,7 @@ def valid_fact_references(payload):
6665

6766
if fact_references.issubset(set(fact_names)) == False:
6867
payload.validation_errors.append(
69-
"Invalid fact reference(s): " +
68+
"Invalid fact reference(s): " +
7069
str(', '.join(fact_references.difference(fact_names)))
7170
)
7271

@@ -79,7 +78,7 @@ def metric_aggregation_is_valid(payload):
7978
payload.validation_errors.append(
8079
f"{m['name']} has invalid numerator: {numerator_error}"
8180
)
82-
81+
8382
if 'denominator' in m:
8483
denominator_error = aggregation_is_valid(m['denominator'])
8584
if denominator_error:
@@ -89,11 +88,11 @@ def metric_aggregation_is_valid(payload):
8988

9089

9190
def distinct_advanced_aggregation_parameter_set(
92-
aggregation,
93-
operation,
91+
aggregation,
92+
operation,
9493
aggregation_parameter,
95-
error_message
96-
):
94+
error_message
95+
):
9796
if aggregation['operation'] == operation:
9897
matched = [p for p in advanced_aggregation_parameters if p in aggregation]
9998
if len(matched) == 0:
@@ -110,10 +109,10 @@ def distinct_advanced_aggregation_parameter_set(
110109

111110

112111
def aggregation_is_valid(aggregation):
113-
114112
error_message = []
115113

116-
if aggregation['operation'] not in ['sum', 'count', 'count_distinct', 'distinct_entity', 'threshold', 'retention', 'conversion']:
114+
if aggregation['operation'] not in ['sum', 'count', 'count_distinct', 'distinct_entity', 'threshold', 'retention',
115+
'conversion']:
117116
error_message.append(
118117
'Invalid aggregation operation: ' + aggregation['operation']
119118
)
@@ -124,12 +123,19 @@ def aggregation_is_valid(aggregation):
124123
error_message.append(
125124
'Cannot winsorize a metric with operation ' + aggregation['operation']
126125
)
127-
128-
# either 0 or 2 of timeframe_parameters must be set
129-
if len([name for name in timeframe_parameters if name in aggregation]) == 1:
126+
127+
# The aggregation_timeframe_unit must be specified if timeframe parameters are set
128+
included_timeframe_parameters = [name for name in timeframe_parameters if name in aggregation]
129+
130+
if 'aggregation_timeframe_value' in aggregation:
131+
error_message.append(
132+
'The aggregation_timeframe_value parameter has been deprecated. Please use aggregation_timeframe_end_value instead.'
133+
)
134+
135+
timeframe_unit_specified = 'aggregation_timeframe_unit' in included_timeframe_parameters
136+
if len(included_timeframe_parameters) > 0 and not timeframe_unit_specified:
130137
error_message.append(
131-
'Either both or neither aggregation_timeframe_value and ' +
132-
'aggregation_timeframe_unit must be set'
138+
'The aggregation_timeframe_unit must be set to use timeframe parameters.'
133139
)
134140

135141
# only set timeframe_parameters on a some operation types
@@ -154,20 +160,20 @@ def aggregation_is_valid(aggregation):
154160
pass
155161

156162
distinct_advanced_aggregation_parameter_set(
157-
aggregation,
158-
'retention',
163+
aggregation,
164+
'retention',
159165
'retention_threshold_days',
160166
error_message
161167
)
162168
distinct_advanced_aggregation_parameter_set(
163-
aggregation,
164-
'conversion',
169+
aggregation,
170+
'conversion',
165171
'conversion_threshold_days',
166172
error_message
167173
)
168174
distinct_advanced_aggregation_parameter_set(
169-
aggregation,
170-
'threshold',
175+
aggregation,
176+
'threshold',
171177
'threshold_metric_settings',
172178
error_message
173179
)
@@ -176,4 +182,3 @@ def aggregation_is_valid(aggregation):
176182
return '\n'.join(error_message)
177183
else:
178184
return None
179-

tests/test_validation.py

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,36 @@
55
from eppo_metrics_sync.eppo_metrics_sync import EppoMetricsSync
66

77
# If we use context.py we can do something like this instead
8-
#from .context import eppo_metric_sync
9-
#from .context import validation
8+
# from .context import eppo_metric_sync
9+
# from .context import validation
1010

1111

1212
test_yaml_dir = "tests/yaml/invalid"
1313

1414

1515
def test_unique_fact_source_names():
16-
17-
eppo_metrics_sync = EppoMetricsSync(directory = None)
16+
eppo_metrics_sync = EppoMetricsSync(directory=None)
1817
eppo_metrics_sync.load_eppo_yaml(
19-
path = test_yaml_dir + "/duplicated_fact_source_names.yaml")
20-
21-
with pytest.raises(ValueError, match = "Fact source names are not unique: upgrades_table"):
18+
path=test_yaml_dir + "/duplicated_fact_source_names.yaml")
19+
20+
with pytest.raises(ValueError, match="Fact source names are not unique: upgrades_table"):
2221
eppo_metrics_sync.validate()
2322

2423

2524
def test_unique_metric_names():
26-
27-
eppo_metrics_sync = EppoMetricsSync(directory = None)
25+
eppo_metrics_sync = EppoMetricsSync(directory=None)
2826
eppo_metrics_sync.load_eppo_yaml(
29-
path = test_yaml_dir + "/duplicated_metric_names.yaml")
30-
31-
with pytest.raises(ValueError, match = "Metric names are not unique: Total Upgrades to Paid Plan"):
27+
path=test_yaml_dir + "/duplicated_metric_names.yaml")
28+
29+
with pytest.raises(ValueError, match="Metric names are not unique: Total Upgrades to Paid Plan"):
3230
eppo_metrics_sync.validate()
3331

3432

3533
def test_unique_fact_names():
36-
37-
eppo_metrics_sync = EppoMetricsSync(directory = None)
34+
eppo_metrics_sync = EppoMetricsSync(directory=None)
3835
eppo_metrics_sync.load_eppo_yaml(
39-
path = test_yaml_dir + "/duplicated_fact_names.yaml")
40-
36+
path=test_yaml_dir + "/duplicated_fact_names.yaml")
37+
4138
with pytest.raises(ValueError, match="Fact names are not unique: upgrades"):
4239
eppo_metrics_sync.validate()
4340

@@ -53,10 +50,10 @@ def test_unique_fact_names():
5350

5451

5552
def test_invalid_fact_reference():
56-
eppo_metrics_sync = EppoMetricsSync(directory = None)
53+
eppo_metrics_sync = EppoMetricsSync(directory=None)
5754
eppo_metrics_sync.load_eppo_yaml(
58-
path = test_yaml_dir + "/invalid_fact_reference.yaml")
59-
with pytest.raises(ValueError, match = re.escape("Invalid fact reference(s): revenue")):
55+
path=test_yaml_dir + "/invalid_fact_reference.yaml")
56+
with pytest.raises(ValueError, match=re.escape("Invalid fact reference(s): revenue")):
6057
eppo_metrics_sync.validate()
6158

6259

@@ -72,23 +69,22 @@ def test_invalid_winsorization_operation():
7269
def test_invalid_aggregation_for_timeframe():
7370
test_agg = {
7471
'operation': 'conversion',
75-
'aggregation_timeframe_value': 1,
72+
'aggregation_timeframe_end_value': 1,
7673
'aggregation_timeframe_unit': 'days',
7774
'conversion_threshold_days': 1
7875
}
79-
76+
8077
res = aggregation_is_valid(test_agg)
81-
assert res == 'Cannot specify aggregation_timeframe_value for operation conversion'
78+
assert res == 'Cannot specify aggregation_timeframe_end_value for operation conversion'
8279

8380

8481
def test_invalid_timeframe_parameters():
8582
test_agg = {
8683
'operation': 'sum',
87-
'aggregation_timeframe_value': 1
84+
'aggregation_timeframe_end_value': 1
8885
}
8986

90-
expected_error = 'Either both or neither aggregation_timeframe_value and ' + \
91-
'aggregation_timeframe_unit must be set'
87+
expected_error = 'The aggregation_timeframe_unit must be set to use timeframe parameters.'
9288

9389
res = aggregation_is_valid(test_agg)
9490
assert res == expected_error
@@ -102,7 +98,7 @@ def test_invalid_aggregation_parameter():
10298

10399
res = aggregation_is_valid(test_agg)
104100
assert res == 'retention_threshold_days specified, but operation is sum'
105-
101+
106102

107103
def test_missing_conversion_threshold():
108104
test_agg = {
@@ -122,12 +118,14 @@ def test_extra_parameter_on_retention_metric():
122118
res = aggregation_is_valid(test_agg)
123119
assert res == 'Invalid parameter for retention aggregation: conversion_threshold_days'
124120

121+
125122
def test_count_distinct():
126123
test_agg = {
127124
'operation': 'count_distinct',
128-
'aggregation_timeframe_value': 1,
125+
'aggregation_timeframe_start_value': 1,
126+
'aggregation_timeframe_end_value': 7,
129127
'aggregation_timeframe_unit': 'days'
130128
}
131-
129+
132130
res = aggregation_is_valid(test_agg)
133-
assert res == None
131+
assert res == None

0 commit comments

Comments
 (0)