|
| 1 | +import datetime |
1 | 2 | import hashlib |
2 | 3 | import json |
3 | 4 | import re |
| 5 | +import uuid |
4 | 6 | from collections.abc import Callable, Generator, Iterator |
5 | 7 | from dataclasses import dataclass, field |
| 8 | +from decimal import Decimal |
6 | 9 | from typing import Any, Literal, TypeVar, overload |
7 | 10 |
|
8 | 11 | import click |
@@ -889,6 +892,49 @@ def in_out_handler( |
889 | 892 |
|
890 | 893 | return pa.RecordBatch.from_arrays([["last"], ["row"]], schema=output_schema) |
891 | 894 |
|
        # Schema exposing a small fixed "employees" table so clients can
        # exercise reads — and column statistics — against known static data.
        # Covers one column per interesting Arrow type (string, int32,
        # timestamp, json, uuid, date32, bool, nullable string, float64,
        # decimal128).
        static_data_schema = SchemaCollection(
            scalar_functions_by_name=CaseInsensitiveDict(),
            table_functions_by_name=CaseInsensitiveDict(),
            tables_by_name=CaseInsensitiveDict(
                {
                    "employees": TableInfo(
                        table_versions=[
                            pa.Table.from_arrays(
                                [
                                    ["Emily", "Amy"],
                                    [30, 32],
                                    [datetime.datetime(2023, 10, 1), datetime.datetime(2024, 10, 2)],
                                    ["{}", "[1,2,3]"],
                                    # pa.uuid() expects the raw 16-byte representation.
                                    [uuid.uuid4().bytes, uuid.uuid4().bytes],
                                    [datetime.date(2023, 10, 1), datetime.date(2024, 10, 2)],
                                    [True, False],
                                    # Includes a null so statistics code paths see one.
                                    ["Ann", None],
                                    [1234.123, 5678.123],
                                    [Decimal("12345.678790"), Decimal("67890.123456")],
                                ],
                                schema=pa.schema(
                                    [
                                        pa.field("name", pa.string()),
                                        pa.field("age", pa.int32()),
                                        pa.field("start_date", pa.timestamp("ms")),
                                        pa.field("json_data", pa.json_(pa.string())),
                                        pa.field("id", pa.uuid()),
                                        pa.field("birthdate", pa.date32()),
                                        pa.field("is_active", pa.bool_()),
                                        pa.field("nickname", pa.string()),
                                        pa.field("salary", pa.float64()),
                                        pa.field("balance", pa.decimal128(12, 6)),
                                    ],
                                    # Advertises that this table can answer
                                    # column-statistics requests.
                                    metadata={"can_produce_statistics": "1"},
                                ),
                            )
                        ],
                        # Two rows pre-loaded, so the next assigned row id is 2.
                        row_id_counter=2,
                    )
                }
            ),
        )

892 | 938 | util_schema = SchemaCollection( |
893 | 939 | scalar_functions_by_name=CaseInsensitiveDict( |
894 | 940 | { |
@@ -1065,6 +1111,7 @@ def in_out_handler( |
1065 | 1111 | ) |
1066 | 1112 |
|
1067 | 1113 | library.databases_by_name[database_name].schemas_by_name["utils"] = util_schema |
| 1114 | + library.databases_by_name[database_name].schemas_by_name["static_data"] = static_data_schema |
1068 | 1115 |
|
1069 | 1116 | return iter([]) |
1070 | 1117 | elif action.type == "drop_database": |
@@ -1611,6 +1658,58 @@ def action_change_column_type( |
1611 | 1658 | schema_name=parameters.schema_name, |
1612 | 1659 | )[0] |
1613 | 1660 |
|
| 1661 | + def action_column_statistics( |
| 1662 | + self, |
| 1663 | + *, |
| 1664 | + context: base_server.CallContext[auth.Account, auth.AccountToken], |
| 1665 | + parameters: parameter_types.ColumnStatistics, |
| 1666 | + ) -> pa.Table: |
| 1667 | + assert context.caller is not None |
| 1668 | + |
| 1669 | + descriptor_parts = descriptor_unpack_(parameters.flight_descriptor) |
| 1670 | + library = self.contents[context.caller.token.token] |
| 1671 | + database = library.by_name(descriptor_parts.catalog_name) |
| 1672 | + schema = database.by_name(descriptor_parts.schema_name) |
| 1673 | + |
| 1674 | + assert descriptor_parts.type == "table" |
| 1675 | + table = schema.by_name("table", descriptor_parts.name) |
| 1676 | + |
| 1677 | + contents = table.version().column(parameters.column_name) |
| 1678 | + # Since the table is a Pyarrow table we need to produce some values. |
| 1679 | + not_null_count = pc.count(contents, "only_valid").as_py() |
| 1680 | + null_count = pc.count(contents, "only_null").as_py() |
| 1681 | + distinct_count = len(set(contents.to_pylist())) |
| 1682 | + sorted_contents = sorted(filter(lambda x: x is not None, contents.to_pylist())) |
| 1683 | + min_value = sorted_contents[0] |
| 1684 | + max_value = sorted_contents[-1] |
| 1685 | + |
| 1686 | + if contents.type == pa.uuid(): |
| 1687 | + # For UUIDs, we need to convert them to strings for the output. |
| 1688 | + min_value = min_value.bytes |
| 1689 | + max_value = max_value.bytes |
| 1690 | + |
| 1691 | + result_table = pa.Table.from_pylist( |
| 1692 | + [ |
| 1693 | + { |
| 1694 | + "has_not_null": not_null_count > 0, |
| 1695 | + "has_null": null_count > 0, |
| 1696 | + "distinct_count": distinct_count, |
| 1697 | + "min": min_value, |
| 1698 | + "max": max_value, |
| 1699 | + } |
| 1700 | + ], |
| 1701 | + schema=pa.schema( |
| 1702 | + [ |
| 1703 | + pa.field("has_not_null", pa.bool_()), |
| 1704 | + pa.field("has_null", pa.bool_()), |
| 1705 | + pa.field("distinct_count", pa.uint64()), |
| 1706 | + pa.field("min", contents.type), |
| 1707 | + pa.field("max", contents.type), |
| 1708 | + ] |
| 1709 | + ), |
| 1710 | + ) |
| 1711 | + return result_table |
| 1712 | + |
1614 | 1713 | def impl_do_get( |
1615 | 1714 | self, |
1616 | 1715 | *, |
|
0 commit comments