|
63 | 63 | _SlicedYKey = typing.NamedTuple('_SlicedYKey', [('slice_key', types.SliceKey), |
64 | 64 | ('y', _YType)]) |
65 | 65 |
|
66 | | -_SlicedXKey = typing.NamedTuple('_SlicedXKey', [('slice_key', types.SliceKey), |
67 | | - ('x_path', types.FeaturePath), |
68 | | - ('x', _XType)]) |
69 | 66 |
|
70 | | -_SlicedXYKey = typing.NamedTuple('_SlicedXYKey', [('slice_key', types.SliceKey), |
71 | | - ('x_path', types.FeaturePath), |
72 | | - ('x', _XType), ('y', _YType)]) |
| 67 | +# TODO(embr,zhuo): FeaturePathTuple is used instead of FeaturePath because: |
| 68 | +# - FeaturePath does not have a deterministic coder |
| 69 | +# - Even if it does, beam does not automatically derive a coder for a |
| 70 | +# NamedTuple. |
| 71 | +# Once the latter is supported we can change all FeaturePathTuples back to
| 72 | +# FeaturePaths. |
| 73 | +_SlicedXKey = typing.NamedTuple('_SlicedXKey', |
| 74 | + [('slice_key', types.SliceKey), |
| 75 | + ('x_path', types.FeaturePathTuple), |
| 76 | + ('x', _XType)]) |
| 77 | + |
| 78 | +_SlicedXYKey = typing.NamedTuple('_SlicedXYKey', |
| 79 | + [('slice_key', types.SliceKey), |
| 80 | + ('x_path', types.FeaturePathTuple), |
| 81 | + ('x', _XType), ('y', _YType)]) |
73 | 82 |
|
74 | 83 | _LiftSeriesKey = typing.NamedTuple('_LiftSeriesKey', |
75 | 84 | [('slice_key', types.SliceKey), |
76 | | - ('x_path', types.FeaturePath), |
| 85 | + ('x_path', types.FeaturePathTuple), |
77 | 86 | ('y', _YType), ('y_count', _CountType)]) |
78 | 87 |
|
79 | 88 | _SlicedFeatureKey = typing.NamedTuple('_SlicedFeatureKey', |
80 | 89 | [('slice_key', types.SliceKey), |
81 | | - ('x_path', types.FeaturePath)]) |
| 90 | + ('x_path', types.FeaturePathTuple)]) |
82 | 91 |
|
83 | 92 | _ConditionalYRate = typing.NamedTuple('_ConditionalYRate', |
84 | | - [('x_path', types.FeaturePath), |
| 93 | + [('x_path', types.FeaturePathTuple), |
85 | 94 | ('x', _XType), ('xy_count', _CountType), |
86 | 95 | ('x_count', _CountType)]) |
87 | 96 |
|
@@ -171,15 +180,15 @@ def _get_example_value_presence( |
171 | 180 | if is_binary_like: |
172 | 181 | # return binary like values a pd.Categorical wrapped in a Series. This makes |
173 | 182 | # subsequent operations like pd.Merge cheaper.
174 | | - values = arr_flat_dict[values] |
| 183 | + values = arr_flat_dict[values].tolist() |
175 | 184 | else: |
176 | 185 | values = values.tolist() # converts values to python native types. |
177 | 186 | if weight_column_name: |
178 | 187 | weights = arrow_util.get_weight_feature(record_batch, weight_column_name) |
179 | | - weights = np.asarray(weights)[example_indices] |
| 188 | + weights = np.asarray(weights)[example_indices].tolist() |
180 | 189 | else: |
181 | 190 | weights = np.ones(len(example_indices), dtype=int).tolist() |
182 | | - return _ValuePresence(example_indices, values, weights) |
| 191 | + return _ValuePresence(example_indices.tolist(), values, weights) |
183 | 192 |
|
184 | 193 |
|
185 | 194 | def _to_partial_copresence_counts( |
@@ -246,7 +255,8 @@ def _to_partial_copresence_counts( |
246 | 255 | if num_xy_pairs_batch_copresent: |
247 | 256 | num_xy_pairs_batch_copresent.update(len(copresence_counts)) |
248 | 257 | for (x, y), count in copresence_counts.items(): |
249 | | - yield _SlicedXYKey(slice_key=slice_key, x_path=x_path, x=x, y=y), count |
| 258 | + yield (_SlicedXYKey(slice_key=slice_key, x_path=x_path.steps(), x=x, |
| 259 | + y=y), count) |
250 | 260 |
|
251 | 261 |
|
252 | 262 | def _to_partial_counts( |
@@ -283,7 +293,7 @@ def _to_partial_x_counts( |
283 | 293 | x_path, |
284 | 294 | boundaries=None, |
285 | 295 | weight_column_name=example_weight_map.get(x_path)): |
286 | | - yield _SlicedXKey(slice_key, x_path, x), x_count |
| 296 | + yield _SlicedXKey(slice_key, x_path.steps(), x), x_count |
287 | 297 |
|
288 | 298 |
|
289 | 299 | def _get_unicode_value(value: Union[Text, bytes]) -> Text: |
@@ -324,11 +334,12 @@ def _make_dataset_feature_stats_proto( |
324 | 334 | The populated DatasetFeatureStatistics proto. |
325 | 335 | """ |
326 | 336 | key, lift_series_list = lifts |
| 337 | + x_path = types.FeaturePath(key.x_path) |
327 | 338 | stats = statistics_pb2.DatasetFeatureStatistics() |
328 | 339 | cross_stats = stats.cross_features.add( |
329 | | - path_x=key.x_path.to_proto(), path_y=y_path.to_proto()) |
| 340 | + path_x=x_path.to_proto(), path_y=y_path.to_proto()) |
330 | 341 | if output_custom_stats: |
331 | | - feature_stats = stats.features.add(path=key.x_path.to_proto()) |
| 342 | + feature_stats = stats.features.add(path=x_path.to_proto()) |
332 | 343 | for lift_series in sorted(lift_series_list): |
333 | 344 | lift_series_proto = ( |
334 | 345 | cross_stats.categorical_cross_stats.lift.lift_series.add()) |
@@ -392,7 +403,8 @@ def _make_dataset_feature_stats_proto( |
392 | 403 | def _cross_join_y_keys( |
393 | 404 | join_info: Tuple[types.SliceKey, Dict[Text, Sequence[Any]]] |
394 | 405 | # TODO(b/147153346) update dict value list element type annotation to: |
395 | | - # Union[_YKey, Tuple[_YType, Tuple[types.FeaturePath, _XType, _CountType]]] |
| 406 | + # Union[_YKey, Tuple[_YType, |
| 407 | + # Tuple[types.FeaturePathTuple, _XType, _CountType]]] |
396 | 408 | ) -> Iterator[Tuple[_SlicedXYKey, _CountType]]: |
397 | 409 | slice_key, join_args = join_info |
398 | 410 | for x_path, x, _ in join_args['x_counts']: |
|
0 commit comments