Skip to content

Commit b930399

Browse files
anyalee0221 and anya-li authored
Update featureset spec for featureset source (Azure#34668)
* Update featureset spec for featureset source
* Update source_metadata_schema
* Correct typo
* Move checks to constructor and add more unit tests
* Fix pylint errors

---------

Co-authored-by: Anya Li <[email protected]>
1 parent ad95c41 commit b930399

File tree

11 files changed

+252
-4
lines changed

11 files changed

+252
-4
lines changed

sdk/ml/azure-ai-ml/azure/ai/ml/_schema/_feature_set/featureset_spec_metadata_schema.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class FeaturesetSpecMetadataSchema(YamlFileSchema):
2222
source = fields.Nested(SourceMetadataSchema, required=True)
2323
feature_transformation_code = fields.Nested(FeatureTransformationCodeMetadataSchema, required=False)
2424
features = fields.List(NestedField(FeatureSchema), required=True, allow_none=False)
25-
index_columns = fields.List(NestedField(DataColumnSchema), required=True, allow_none=False)
25+
index_columns = fields.List(NestedField(DataColumnSchema), required=False)
2626
source_lookback = fields.Nested(DelayMetadataSchema, required=False)
2727
temporal_join_lookback = fields.Nested(DelayMetadataSchema, required=False)
2828

sdk/ml/azure-ai-ml/azure/ai/ml/_schema/_feature_set/source_metadata_schema.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
class SourceMetadataSchema(metaclass=PatchedSchemaMeta):
1919
type = fields.Str(required=True)
2020
path = fields.Str(required=False)
21-
timestamp_column = fields.Nested(TimestampColumnMetadataSchema, required=True)
21+
timestamp_column = fields.Nested(TimestampColumnMetadataSchema, required=False)
2222
source_delay = fields.Nested(DelayMetadataSchema, required=False)
2323
source_process_code = fields.Nested(SourceProcessCodeSchema, load_only=True, required=False)
2424
dict = fields.Dict(keys=fields.Str(), values=fields.Str(), data_key="kwargs", load_only=True, required=False)

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_feature_set/featureset_spec_metadata.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY
1616
from azure.ai.ml.entities._feature_store_entity.data_column import DataColumn
1717
from azure.ai.ml.entities._util import load_from_dict
18+
from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationErrorType, ValidationException
1819

1920
from .delay_metadata import DelayMetadata
2021
from .feature import Feature
@@ -31,11 +32,29 @@ def __init__(
3132
source: SourceMetadata,
3233
feature_transformation_code: Optional[FeatureTransformationCodeMetadata] = None,
3334
features: List[Feature],
34-
index_columns: List[DataColumn],
35+
index_columns: Optional[List[DataColumn]] = None,
3536
source_lookback: Optional[DelayMetadata] = None,
3637
temporal_join_lookback: Optional[DelayMetadata] = None,
3738
**_kwargs: Any,
3839
):
40+
if source.type == "featureset" and index_columns:
41+
msg = f"You cannot provide index_columns for {source.type} feature source."
42+
raise ValidationException(
43+
message=msg,
44+
no_personal_data_message=msg,
45+
error_type=ValidationErrorType.INVALID_VALUE,
46+
target=ErrorTarget.FEATURE_SET,
47+
error_category=ErrorCategory.USER_ERROR,
48+
)
49+
if not index_columns and source.type != "featureset":
50+
msg = f"You need to provide index_columns for {source.type} feature source."
51+
raise ValidationException(
52+
message=msg,
53+
no_personal_data_message=msg,
54+
error_type=ValidationErrorType.INVALID_VALUE,
55+
target=ErrorTarget.FEATURE_SET,
56+
error_category=ErrorCategory.USER_ERROR,
57+
)
3958
self.source = source
4059
self.feature_transformation_code = feature_transformation_code
4160
self.features = features
@@ -74,6 +93,7 @@ def _load(
7493
res: FeaturesetSpecMetadata = load_from_dict(
7594
FeaturesetSpecMetadataSchema, yaml_data, context, "", unknown=INCLUDE, **kwargs
7695
)
96+
7797
return res
7898

7999
def _to_dict(self) -> Dict:

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_feature_set/source_metadata.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,44 @@ def __init__(
1818
self,
1919
*,
2020
type: str,
21-
timestamp_column: TimestampColumnMetadata,
21+
timestamp_column: Optional[TimestampColumnMetadata] = None,
2222
path: Optional[str] = None,
2323
source_delay: Optional[DelayMetadata] = None,
2424
source_process_code: Optional[SourceProcessCodeMetadata] = None,
2525
dict: Optional[Dict] = None,
2626
**kwargs: Any,
2727
):
28+
if type != "featureset":
29+
if not timestamp_column:
30+
msg = f"You need to provide timestamp_column for {type} feature source."
31+
raise ValidationException(
32+
message=msg,
33+
no_personal_data_message=msg,
34+
error_type=ValidationErrorType.INVALID_VALUE,
35+
target=ErrorTarget.FEATURE_SET,
36+
error_category=ErrorCategory.USER_ERROR,
37+
)
38+
2839
if type != "custom":
40+
if type == "featureset":
41+
if not path:
42+
msg = f"You need to provide path for {type} feature source."
43+
raise ValidationException(
44+
message=msg,
45+
no_personal_data_message=msg,
46+
error_type=ValidationErrorType.INVALID_VALUE,
47+
target=ErrorTarget.FEATURE_SET,
48+
error_category=ErrorCategory.USER_ERROR,
49+
)
50+
if timestamp_column:
51+
msg = f"Cannot provide timestamp_column for {type} feature source."
52+
raise ValidationException(
53+
message=msg,
54+
no_personal_data_message=msg,
55+
error_type=ValidationErrorType.INVALID_VALUE,
56+
target=ErrorTarget.FEATURE_SET,
57+
error_category=ErrorCategory.USER_ERROR,
58+
)
2959
if not (path and not dict and not source_process_code):
3060
msg = f"Cannot provide source_process_code or kwargs for {type} feature source."
3161
raise ValidationException(

sdk/ml/azure-ai-ml/tests/feature_set/unittests/test_feature_set_spec_schema.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,19 @@ def test_feature_set_spec_load(self) -> None:
3636
assert len(fspec.features) == 3
3737
assert len(fspec.source.kwargs.keys()) == 3
3838

39+
spec_path = "./tests/test_configs/feature_set/featureset_source_spec"
40+
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
41+
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
42+
fspec = FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
43+
44+
assert fspec.feature_transformation_code is not None
45+
assert fspec.source is not None
46+
assert fspec.source.timestamp_column is None
47+
assert len(fspec.features) == 3
48+
assert fspec.index_columns is None
49+
assert fspec.source.source_delay is None
50+
assert fspec.source.timestamp_column is None
51+
3952
def test_feature_set_spec_load_failure(self) -> None:
4053
spec_path = "./tests/test_configs/feature_set/invalid_spec1"
4154
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
@@ -48,3 +61,33 @@ def test_feature_set_spec_load_failure(self) -> None:
4861
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
4962
with pytest.raises(ValidationException):
5063
FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
64+
65+
spec_path = "./tests/test_configs/feature_set/invalid_spec3"
66+
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
67+
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
68+
with pytest.raises(ValidationException):
69+
FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
70+
71+
spec_path = "./tests/test_configs/feature_set/invalid_spec4"
72+
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
73+
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
74+
with pytest.raises(ValidationException):
75+
FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
76+
77+
spec_path = "./tests/test_configs/feature_set/invalid_spec5"
78+
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
79+
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
80+
with pytest.raises(ValidationException):
81+
FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
82+
83+
spec_path = "./tests/test_configs/feature_set/invalid_spec6"
84+
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
85+
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
86+
with pytest.raises(ValidationException):
87+
FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
88+
89+
spec_path = "./tests/test_configs/feature_set/invalid_spec7"
90+
featureset_spec_contents = read_feature_set_metadata(path=spec_path)
91+
featureset_spec_yaml_path = Path(spec_path, "FeatureSetSpec.yaml")
92+
with pytest.raises(ValidationException):
93+
FeaturesetSpecMetadata._load(featureset_spec_contents, featureset_spec_yaml_path)
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
source:
2+
type: featureset
3+
path: azureml://subscriptions/my_sub/resourcegroups/my_rg/workspaces/my_fs/feturesets/source_feature_set_name/versions/version1
4+
feature_transformation_code:
5+
path: ./code
6+
transformer_class: driver_hourly_transform.DriverHourlyTransformer
7+
features:
8+
- name: conv_rate
9+
type: double
10+
- name: acc_rate
11+
type: double
12+
- name: avg_daily_trips
13+
type: double
14+
source_lookback:
15+
days: 30
16+
hours: 0
17+
minutes: 0
18+
temporal_join_lookback:
19+
days: 2
20+
hours: 0
21+
minutes: 0
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# No path for featureset feature source
2+
source:
3+
type: featureset
4+
feature_transformation_code:
5+
path: ./code
6+
transformer_class: driver_hourly_transform.DriverHourlyTransformer
7+
features:
8+
- name: conv_rate
9+
type: double
10+
- name: acc_rate
11+
type: double
12+
- name: avg_daily_trips
13+
type: double
14+
source_lookback:
15+
days: 30
16+
hours: 0
17+
minutes: 0
18+
temporal_join_lookback:
19+
days: 2
20+
hours: 0
21+
minutes: 0
22+
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# No timestamp_column for custom feature source
2+
source:
3+
type: custom
4+
kwargs:
5+
k1: v1
6+
k2: v2
7+
k3: v3
8+
source_process_code:
9+
path: ./source_process_code
10+
process_class: source_process.MyDataSourceLoader
11+
feature_transformation_code:
12+
path: ./code
13+
transformer_class: driver_hourly_transform.DriverHourlyTransformer
14+
features:
15+
- name: conv_rate
16+
type: double
17+
- name: acc_rate
18+
type: double
19+
- name: avg_daily_trips
20+
type: double
21+
index_columns:
22+
- name: driver_id
23+
type: long
24+
source_lookback:
25+
days: 30
26+
hours: 0
27+
minutes: 0
28+
temporal_join_lookback:
29+
days: 2
30+
hours: 0
31+
minutes: 0
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# No index_columns for custom feature source
2+
source:
3+
type: custom
4+
kwargs:
5+
k1: v1
6+
k2: v2
7+
k3: v3
8+
timestamp_column:
9+
name: timestamp
10+
format: "%Y-%m-%d %H:%M:%S"
11+
source_process_code:
12+
path: ./source_process_code
13+
process_class: source_process.MyDataSourceLoader
14+
feature_transformation_code:
15+
path: ./code
16+
transformer_class: driver_hourly_transform.DriverHourlyTransformer
17+
features:
18+
- name: conv_rate
19+
type: double
20+
- name: acc_rate
21+
type: double
22+
- name: avg_daily_trips
23+
type: double
24+
source_lookback:
25+
days: 30
26+
hours: 0
27+
minutes: 0
28+
temporal_join_lookback:
29+
days: 2
30+
hours: 0
31+
minutes: 0
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# timestamp_column is not supported for featureset feature source
2+
source:
3+
type: featureset
4+
path: azureml://subscriptions/my_sub/resourcegroups/my_rg/workspaces/my_fs/feturesets/source_feature_set_name/versions/version1
5+
timestamp_column:
6+
name: timestamp
7+
format: "%Y-%m-%d %H:%M:%S"
8+
feature_transformation_code:
9+
path: ./code
10+
transformer_class: driver_hourly_transform.DriverHourlyTransformer
11+
features:
12+
- name: conv_rate
13+
type: double
14+
- name: acc_rate
15+
type: double
16+
- name: avg_daily_trips
17+
type: double
18+
source_lookback:
19+
days: 30
20+
hours: 0
21+
minutes: 0
22+
temporal_join_lookback:
23+
days: 2
24+
hours: 0
25+
minutes: 0

0 commit comments

Comments
 (0)