Add an address-parsing benchmark script and rewrite the postal_parse_address UDF.

NOTE(review): this patch was reconstructed from a copy in which every line
break and all indentation had been collapsed to single spaces. Indentation
and blank context lines were re-derived from the Python syntax and from the
hunk line counts; confirm the context lines against the target tree before
applying (or apply with whitespace-tolerant options). The "index" lines are
kept from the original copy and do not reflect edits made to added lines.
The parquet entry carries no binary payload, so that file must be added
separately.

diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
new file mode 100644
index 00000000..5aa82b3d
--- /dev/null
+++ b/benchmarks/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+db/
diff --git a/benchmarks/apoc_addresses_1M.parquet b/benchmarks/apoc_addresses_1M.parquet
new file mode 100644
index 00000000..52a7d688
Binary files /dev/null and b/benchmarks/apoc_addresses_1M.parquet differ
diff --git a/benchmarks/parse_address.py b/benchmarks/parse_address.py
new file mode 100644
index 00000000..249501e0
--- /dev/null
+++ b/benchmarks/parse_address.py
@@ -0,0 +1,81 @@
+"""Benchmark the address-parsing UDFs on the bundled APOC address sample."""
+
+import datetime
+import os
+from pathlib import Path
+from time import perf_counter
+from typing import Callable
+
+import ibis
+import ibis.expr.types as ir
+import pandas as pd
+from ibis import _, Table
+
+from mismo.lib.geo import postal_parse_address
+from mismo.lib.geo.tests.test_postal_benchmark import (
+    noop,
+    postal_only,
+    postal_parse_address__direct_import,
+    postal_parse_address__initial_impl,
+    python_only,
+)
+
+
+_CURRENT_DIR = Path(__file__).parent
+_DB_DIR = Path(_CURRENT_DIR, 'db')
+
+
+def _prepare_db_table(benchmark_id: str, db_name: str) -> Table:
+    """Load the bundled address sample into an in-memory ibis table.
+
+    ``benchmark_id`` and ``db_name`` are only referenced by the disabled
+    on-disk DuckDB variant kept in the comments below.
+    """
+    apoc_file = Path(_CURRENT_DIR, 'apoc_addresses_1M.parquet')
+    apoc_data = pd.read_parquet(apoc_file)
+
+    # db_file = Path(_DB_DIR, benchmark_id, db_name)
+    # os.makedirs(db_file.parent, exist_ok=True)
+    # con = ibis.duckdb.connect(db_file)
+    # t = con.create_table(db_name, apoc_data)
+    t = ibis.memtable(apoc_data)
+
+    return t
+
+
+def run_benchmark(benchmark_id: str, parse_fn: Callable[..., ir.Value]) -> None:
+    """Time ``parse_fn`` on the ``full_address`` column and print the result.
+
+    The input table is cached before the clock starts; the timed section
+    covers building the parsed expression and caching it as a table.
+    """
+    input_table = _prepare_db_table(benchmark_id, f"{parse_fn.__name__}.ddb")
+    input_table = input_table.cache()
+    # perf_counter() is monotonic, unlike the wall clock returned by time().
+    start = perf_counter()
+    res = parse_fn(input_table.full_address)
+    # Bound to a name so the cached result stays referenced while timing.
+    persisted = res.as_table().cache()
+    elapsed = perf_counter() - start
+    print(f"{parse_fn.__name__:<35} took {elapsed:>8.4f} seconds")
+
+
+def main() -> None:
+    """Benchmark every parser variant under one shared benchmark id."""
+    # Windows does not allow ':' in file names
+    now = datetime.datetime.now(datetime.timezone.utc)
+    benchmark_id = now.isoformat().replace(":", "-")
+
+    for parse_fn in (
+        noop,
+        python_only,
+        postal_only,
+        postal_parse_address,
+        postal_parse_address__direct_import,
+        postal_parse_address__initial_impl,
+    ):
+        run_benchmark(benchmark_id, parse_fn)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mismo/lib/geo/_address.py b/mismo/lib/geo/_address.py
index 6d7662a3..1a551be6 100644
--- a/mismo/lib/geo/_address.py
+++ b/mismo/lib/geo/_address.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-from collections import defaultdict
 import re
 
 import ibis
@@ -295,24 +294,64 @@ def postal_parse_address(address_string: ir.StringValue) -> ir.StructValue:
     from postal.parser import parse_address as _parse_address
 
     @ibis.udf.scalar.python(signature=((str,), ADDRESS_SCHEMA))
-    def udf(address_string: str | None) -> dict[str, str] | None:
-        # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    def udf(address_string: str | None) -> dict[str, str | None] | None:
+        # TODO: remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
         if address_string is None:
             return None
+
+        # Initially, the key set of the `result` dict is given by the union of
+        # both the names of the fields in the `ADDRESS_SCHEMA` struct and
+        # the names of the pypostal fields we use.
+        # Later, the latter are popped to match the shape of `ADDRESS_SCHEMA`.
+
+        # NB: due to https://github.com/ibis-project/ibis/issues/9613
+        # the keys of the `result` dict returned at the end of this function
+        # must be sorted in the same order as they are declared in the
+        # `ADDRESS_SCHEMA` struct. Current workaround is to create the dict
+        # with all those keys in the proper order from the beginning.
+        result: dict[str, str | None] = {
+            "street1": None,
+            "street2": None,
+            "city": None,
+            "state": None,
+            "postal_code": None,
+            "country": None,
+
+            # Temporary keys used to store values returned by pypostal and
+            # popped before returning the dictionary
+            "house_number": None,
+            "road": None,
+            "unit": None,
+            "postcode": None,
+        }
+
         parsed_fields = _parse_address(address_string)
-        label_to_values = defaultdict(list)
         for value, label in parsed_fields:
-            label_to_values[label].append(value)
-        renamed = {
-            "street1": label_to_values["house_number"] + label_to_values["road"],
-            "street2": label_to_values["unit"],
-            "city": label_to_values["city"],
-            "state": label_to_values["state"],
-            "postal_code": label_to_values["postcode"],
-            "country": label_to_values["country"],
-        }
-        # replace empty strings with None
-        return {k: " ".join(v) or None for k, v in renamed.items()}
+            # Pypostal returns more fields than the ones we actually need.
+            # Here `False` is used as a placeholder under the assumption that
+            # such a value is never returned by pypostal as a field value.
+            current = result.get(label, False)
+
+            # Keep only the fields declared when `result` is initialized.
+            # Pypostal fields can be repeated; in that case we concat their values.
+            if current is not False:
+                result[label] = value if current is None else f"{current} {value}"
+
+        # Hack to prepend "house_number" to "road"
+        house_number = result.pop("house_number")
+        if house_number is not None:
+            road = result["road"]
+            if road is None:
+                result["road"] = house_number
+            else:
+                result["road"] = f"{house_number} {road}"
+
+        # Modify `result` to match the shape of an `ADDRESS_SCHEMA`.
+        result["street1"] = result.pop("road")
+        result["street2"] = result.pop("unit")
+        result["postal_code"] = result.pop("postcode")
+
+        return result
 
     return udf(address_string)
 
diff --git a/mismo/lib/geo/tests/test_postal_benchmark.py b/mismo/lib/geo/tests/test_postal_benchmark.py
index 62281b1e..b98755ae 100644
--- a/mismo/lib/geo/tests/test_postal_benchmark.py
+++ b/mismo/lib/geo/tests/test_postal_benchmark.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections import defaultdict
 from pathlib import Path
 
 import ibis
@@ -31,23 +32,30 @@
 
 
 @udf
-def noop(address_string: str | None) -> dict:
+def noop(address_string: str | None) -> dict[str, None]:
     return _NOOP_ADDRESS
 
 
 @udf
-def python_only(address_string: str | None) -> dict:
+def python_only(address_string: str | None) -> dict[str, str | None] | None:
+    # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    if address_string is None:
+        return None
+
     result: dict[str, str | None] = {
-        "house_number": None,
-        "road": None,
-        "unit": None,
-        "city": None,
-        "state": None,
-        "postcode": None,
-        "country": None,
+        "street1": None,
+        "street2": None,
+        "city": None,
+        "state": None,
+        "postal_code": None,
+        "country": None,
+        "house_number": None,
+        "road": None,
+        "unit": None,
+        "postcode": None,
     }
 
-    # Fake 'parse_address' function that emits just one field ("street")
+    # Fake '_parse_address' function that emits just one field ("street")
     # containing the whole address.
     parsed_fields = (("street", address_string),)
     for value, label in parsed_fields:
@@ -71,40 +79,40 @@ def python_only(address_string: str | None) -> dict:
 
 
 @udf
-def postal_only(address_string: str | None) -> dict:
-    _parse_address(address_string or "")
+def postal_only(address_string: str | None) -> dict[str, None] | None:
+    # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    if address_string is None:
+        return None
+
+    _parse_address(address_string)
     return _NOOP_ADDRESS
 
 
-@udf
-def complete(address_string: str | None) -> dict | None:
+@ibis.udf.scalar.python
+def postal_parse_address__direct_import(address_string: str) -> ADDRESS_SCHEMA:
+    # TODO: remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
     if address_string is None:
         return None
-    # Initially, the keys match the names of pypostal fields we need.
-    # Later, this dict is modified to match the shape of an `ADDRESS_SCHEMA`.
+
     result: dict[str, str | None] = {
-        "house_number": None,
-        "road": None,
-        "unit": None,
+        "street1": None,
+        "street2": None,
         "city": None,
         "state": None,
-        "postcode": None,
+        "postal_code": None,
         "country": None,
+        "house_number": None,
+        "road": None,
+        "unit": None,
+        "postcode": None,
     }
 
     parsed_fields = _parse_address(address_string)
     for value, label in parsed_fields:
-        # Pypostal returns more fields than the ones we actually need.
-        # Here `False` is used as a placeholder under the assumption that
-        # such value is never returned by pypostal a field value.
         current = result.get(label, False)
-
-        # Keep only the fields declared when `result` is initialized.
-        # Pypostal fields can be repeated, in such case we concat their values.
         if current is not False:
             result[label] = value if current is None else f"{current} {value}"
 
-    # Hack to prepend "house_number" to "road"
     house_number = result.pop("house_number")
     if house_number is not None:
         road = result["road"]
@@ -113,7 +121,6 @@ def complete(address_string: str | None) -> dict | None:
         else:
             result["road"] = f"{house_number} {road}"
 
-    # Modify `result` in-place to match the shape of an `ADDRESS_SCHEMA`.
     result["street1"] = result.pop("road")
     result["street2"] = result.pop("unit")
     result["postal_code"] = result.pop("postcode")
@@ -121,6 +128,27 @@ def complete(address_string: str | None) -> dict | None:
     return result
 
 
+@udf
+def postal_parse_address__initial_impl(address_string: str | None) -> dict[str, str | None] | None:
+    # remove once https://github.com/ibis-project/ibis/pull/9625 is fixed
+    if address_string is None:
+        return None
+    parsed_fields = _parse_address(address_string)
+    label_to_values = defaultdict(list)
+    for value, label in parsed_fields:
+        label_to_values[label].append(value)
+    renamed = {
+        "street1": label_to_values["house_number"] + label_to_values["road"],
+        "street2": label_to_values["unit"],
+        "city": label_to_values["city"],
+        "state": label_to_values["state"],
+        "postal_code": label_to_values["postcode"],
+        "country": label_to_values["country"],
+    }
+    # replace empty strings with None
+    return {k: " ".join(v) or None for k, v in renamed.items()}
+
+
 def download_test_data() -> ir.Table:
     # download test data from https://github.com/NickCrews/apoc-data/releases/tag/20240717-111158
     URL_TEMPLATE = "https://github.com/NickCrews/apoc-data/releases/download/20240717-111158/income_{year}.csv"
@@ -162,8 +190,9 @@ def data(backend: ibis.BaseBackend) -> ir.Table:
         noop,
         python_only,
         postal_only,
-        complete,
         postal_parse_address,
+        postal_parse_address__direct_import,
+        postal_parse_address__initial_impl,
     ],
 )
 @pytest.mark.parametrize(