|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +import geopandas as gpd |
1 | 4 | import pandas as pd |
2 | 5 | import pyarrow as pa |
3 | 6 | import pyarrow.feather as feather |
4 | 7 | import shapely |
| 8 | +from lonboard.colormap import apply_continuous_cmap |
| 9 | +from lonboard.geoarrow.geopandas_interop import geopandas_to_geoarrow |
| 10 | +from palettable.colorbrewer.diverging import BrBG_10 |
5 | 11 |
|
| 12 | +url = "https://ookla-open-data.s3.us-west-2.amazonaws.com/parquet/performance/type=mobile/year=2019/quarter=1/2019-01-01_performance_mobile_tiles.parquet" |
6 | 13 |
|
7 | | -class PointGeometryType(pa.ExtensionType): |
8 | | - def __init__(self): |
9 | | - pa.ExtensionType.__init__(self, self._storage_type, self._extension_name) |
10 | | - |
11 | | - _storage_type = pa.list_(pa.field("xy", pa.float64()), 2) |
12 | | - _extension_name = "geoarrow.point" |
13 | | - |
14 | | - def __arrow_ext_serialize__(self): |
15 | | - # since we don't have a parameterized type, we don't need extra |
16 | | - # metadata to be deserialized |
17 | | - return b"" |
18 | | - |
19 | | - @classmethod |
20 | | - def __arrow_ext_deserialize__(cls, storage_type, serialized): |
21 | | - # return an instance of this subclass given the serialized |
22 | | - # metadata. |
23 | | - return PointGeometryType() |
24 | | - |
25 | | - |
26 | | -# https://ookla-open-data.s3.us-west-2.amazonaws.com/parquet/performance/type=mobile/year=2019/quarter=1/2019-01-01_performance_mobile_tiles.parquet |
| 14 | +path = Path("2019-01-01_performance_mobile_tiles.parquet") |
27 | 15 |
|
28 | 16 |
|
29 | 17 | def main(): |
30 | | - df = pd.read_parquet("2019-01-01_performance_mobile_tiles.parquet") |
| 18 | + if not path.exists(): |
| 19 | + msg = f"Please download file to this directory from {url=}." |
| 20 | + raise ValueError(msg) |
| 21 | + |
| 22 | + df = pd.read_parquet(path) |
31 | 23 | centroids = shapely.centroid(shapely.from_wkt(df["tile"])) |
32 | 24 |
|
33 | 25 | # Save space by using a smaller data type |
34 | 26 | df_cols = ["avg_d_kbps", "avg_u_kbps", "avg_lat_ms"] |
35 | 27 | for col in df_cols: |
36 | 28 | df[col] = pd.to_numeric(df[col], downcast="unsigned") |
37 | 29 |
|
38 | | - table = pa.Table.from_pandas(df[df_cols]) |
39 | | - coords = shapely.get_coordinates(centroids) |
40 | | - parr = pa.FixedSizeListArray.from_arrays(coords.flatten(), 2) |
41 | | - extension_arr = pa.ExtensionArray.from_storage(PointGeometryType(), parr) |
42 | | - table = table.append_column("geometry", extension_arr) |
| 30 | + gdf = gpd.GeoDataFrame(df[df_cols], geometry=centroids) |
| 31 | + table = geopandas_to_geoarrow(gdf, preserve_index=False) |
| 32 | + |
| 33 | + min_bound = 5000 |
| 34 | + max_bound = 50000 |
| 35 | + download_speed = gdf["avg_d_kbps"] |
| 36 | + normalized_download_speed = (download_speed - min_bound) / (max_bound - min_bound) |
| 37 | + |
| 38 | + colors = apply_continuous_cmap(normalized_download_speed, BrBG_10) |
| 39 | + table = table.append_column( |
| 40 | + "colors", pa.FixedSizeListArray.from_arrays(colors.flatten("C"), 3) |
| 41 | + ) |
| 42 | + |
43 | 43 | feather.write_feather( |
44 | 44 | table, "2019-01-01_performance_mobile_tiles.feather", compression="uncompressed" |
45 | 45 | ) |
|
0 commit comments