Skip to content

Commit 192f46b

Browse files
authored
feat: 1311 further optimize the pmtiles creation algorithm (#1378)
1 parent ffd294c commit 192f46b

File tree

14 files changed

+622
-288
lines changed

14 files changed

+622
-288
lines changed

functions-python/helpers/tests/test_transform.py

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
to_boolean,
77
get_nested_value,
88
to_float,
9-
get_safe_value,
10-
get_safe_float,
11-
get_safe_int,
9+
get_safe_value_from_csv,
10+
get_safe_float_from_csv,
11+
get_safe_int_from_csv,
1212
)
1313

1414

@@ -94,82 +94,88 @@ def test_default_value(self):
9494
class TestGetSafeValue(unittest.TestCase):
9595
def test_valid_value(self):
9696
row = {"name": " Alice "}
97-
self.assertEqual(get_safe_value(row, "name"), "Alice")
97+
self.assertEqual(get_safe_value_from_csv(row, "name"), "Alice")
9898

9999
def test_missing_column(self):
100100
row = {"age": 30}
101-
self.assertIsNone(get_safe_value(row, "name"))
101+
self.assertIsNone(get_safe_value_from_csv(row, "name"))
102102

103103
def test_empty_string(self):
104104
row = {"name": " "}
105-
self.assertIsNone(get_safe_value(row, "name"))
105+
self.assertIsNone(get_safe_value_from_csv(row, "name"))
106106

107107
def test_nan_value(self):
108108
row = {"name": pd.NA}
109-
self.assertIsNone(get_safe_value(row, "name"))
109+
self.assertIsNone(get_safe_value_from_csv(row, "name"))
110110
row = {"name": float("nan")}
111-
self.assertIsNone(get_safe_value(row, "name"))
111+
self.assertIsNone(get_safe_value_from_csv(row, "name"))
112112

113113
def test_default_value(self):
114114
row = {"name": ""}
115115
self.assertEqual(
116-
get_safe_value(row, "name", default_value="default"), "default"
116+
get_safe_value_from_csv(row, "name", default_value="default"), "default"
117117
)
118118

119119

120120
class TestGetSafeFloat(unittest.TestCase):
121121
def test_valid_float(self):
122122
row = {"value": "3.14"}
123-
self.assertEqual(get_safe_float(row, "value"), 3.14)
123+
self.assertEqual(get_safe_float_from_csv(row, "value"), 3.14)
124124
row = {"value": 2.5}
125-
self.assertEqual(get_safe_float(row, "value"), 2.5)
125+
self.assertEqual(get_safe_float_from_csv(row, "value"), 2.5)
126126
row = {"value": "0"}
127-
self.assertEqual(get_safe_float(row, "value"), 0.0)
127+
self.assertEqual(get_safe_float_from_csv(row, "value"), 0.0)
128128
row = {"value": 0}
129-
self.assertEqual(get_safe_float(row, "value"), 0.0)
129+
self.assertEqual(get_safe_float_from_csv(row, "value"), 0.0)
130130

131131
def test_missing_column(self):
132132
row = {"other": 1.23}
133-
self.assertIsNone(get_safe_float(row, "value"))
133+
self.assertIsNone(get_safe_float_from_csv(row, "value"))
134134

135135
def test_empty_string(self):
136136
row = {"value": " "}
137-
self.assertIsNone(get_safe_float(row, "value"))
137+
self.assertIsNone(get_safe_float_from_csv(row, "value"))
138138

139139
def test_nan_value(self):
140140
row = {"value": pd.NA}
141-
self.assertIsNone(get_safe_float(row, "value"))
141+
self.assertIsNone(get_safe_float_from_csv(row, "value"))
142142
row = {"value": float("nan")}
143-
self.assertIsNone(get_safe_float(row, "value"))
143+
self.assertIsNone(get_safe_float_from_csv(row, "value"))
144144

145145
def test_invalid_float(self):
146146
row = {"value": "abc"}
147-
self.assertIsNone(get_safe_float(row, "value"))
147+
self.assertIsNone(get_safe_float_from_csv(row, "value"))
148148
row = {"value": None}
149-
self.assertIsNone(get_safe_float(row, "value"))
149+
self.assertIsNone(get_safe_float_from_csv(row, "value"))
150150

151151
def test_default_value(self):
152152
row = {"value": ""}
153-
self.assertEqual(get_safe_float(row, "value", default_value=1.23), 1.23)
153+
self.assertEqual(
154+
get_safe_float_from_csv(row, "value", default_value=1.23), 1.23
155+
)
154156
row = {"value": "abc"}
155-
self.assertEqual(get_safe_float(row, "value", default_value=4.56), 4.56)
157+
self.assertEqual(
158+
get_safe_float_from_csv(row, "value", default_value=4.56), 4.56
159+
)
156160
row = {"value": None}
157-
self.assertEqual(get_safe_float(row, "value", default_value=7.89), 7.89)
161+
self.assertEqual(
162+
get_safe_float_from_csv(row, "value", default_value=7.89), 7.89
163+
)
158164

159165

160166
class TestGetSafeInt(unittest.TestCase):
161167
def test_valid_int(self):
162168
row = {"value": "42"}
163-
self.assertEqual(get_safe_int(row, "value"), 42)
169+
self.assertEqual(get_safe_int_from_csv(row, "value"), 42)
164170

165171
def test_invalid_int(self):
166172
row = {"value": "abc"}
167-
self.assertIsNone(get_safe_int(row, "value"))
173+
self.assertIsNone(get_safe_int_from_csv(row, "value"))
168174

169175
def test_missing_key(self):
170176
row = {}
171-
self.assertIsNone(get_safe_int(row, "value"))
177+
self.assertIsNone(get_safe_int_from_csv(row, "value"))
172178

173179
def test_empty_string(self):
174180
row = {"value": ""}
175-
self.assertIsNone(get_safe_int(row, "value"))
181+
self.assertIsNone(get_safe_int_from_csv(row, "value"))

functions-python/helpers/transform.py

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -89,38 +89,70 @@ def to_float(value, default_value: Optional[float] = None) -> Optional[float]:
8989
return default_value
9090

9191

92-
def get_safe_value(row, column_name, default_value: str = None) -> Optional[str]:
92+
def get_safe_value_from_csv(
93+
row, column_name, default_value: str = None
94+
) -> Optional[str]:
9395
"""
94-
Get a safe value from the row. If the value is missing or empty, return the default value.
96+
Get a safe value from a csv row. If the value is missing or empty, return the default value.
97+
"""
98+
99+
raw_value = row.get(column_name, None)
100+
101+
return get_safe_value(raw_value, default_value)
102+
103+
104+
def get_safe_value(raw_value, default_value: str = None) -> Optional[str]:
105+
"""
106+
Get a safe value. If the value is missing or empty, return the default value.
95107
"""
96108
import pandas
97109

98-
value = row.get(column_name, None)
99110
if (
100-
value is None
101-
or pandas.isna(value)
102-
or (isinstance(value, str) and value.strip() == "")
111+
raw_value is None
112+
or pandas.isna(raw_value)
113+
or (isinstance(raw_value, str) and raw_value.strip() == "")
103114
):
104115
return default_value
105-
return f"{value}".strip()
116+
return f"{raw_value}".strip()
117+
118+
119+
def get_safe_float_from_csv(
120+
row, column_name, default_value: float = None
121+
) -> Optional[float]:
122+
"""
123+
Get a safe float value from a csv row.
124+
Use the default value if the value is missing or cannot be converted to float.
125+
"""
126+
raw_value = row.get(column_name, None)
106127

128+
return get_safe_float(raw_value, default_value)
107129

108-
def get_safe_float(row, column_name, default_value: float = None) -> Optional[float]:
130+
131+
def get_safe_float(raw_value, default_value: float = None) -> Optional[float]:
109132
"""
110-
Get a safe float value from the row. If the value is missing or cannot be converted to float.
133+
Get a safe float value. Use the default value if the value is missing or cannot be converted to float.
111134
"""
112-
safe_value = get_safe_value(row, column_name)
135+
safe_value = get_safe_value(raw_value, default_value)
113136
try:
114137
return float(safe_value)
115138
except (ValueError, TypeError):
116139
return default_value
117140

118141

119-
def get_safe_int(row, column_name, default_value: int = None) -> Optional[int]:
142+
def get_safe_int_from_csv(row, column_name, default_value: int = None) -> Optional[int]:
143+
"""
144+
Get a safe int value from a csv row. Use the default value if the value is missing or cannot be converted to int.
145+
"""
146+
raw_value = row.get(column_name, None)
147+
148+
return get_safe_int(raw_value, default_value)
149+
150+
151+
def get_safe_int(raw_value, default_value: int = None) -> Optional[int]:
120152
"""
121-
Get a safe int value from the row. If the value is missing or cannot be converted to int.
153+
Get a safe int value. Use the default value if the value is missing or cannot be converted to int.
122154
"""
123-
safe_value = get_safe_value(row, column_name)
155+
safe_value = get_safe_value(raw_value, default_value)
124156
try:
125157
return int(safe_value)
126158
except (ValueError, TypeError):

functions-python/helpers/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,9 @@ def detect_encoding(
363363
enc = result.encoding.lower()
364364

365365
# If UTF-8 is detected, always use utf-8-sig to strip BOM if present
366-
if enc in ("utf_8", "utf-8", "utf8", "utf8mb4"):
366+
# Treat ascii as UTF-8, since it's a subset of UTF-8 and it will prevent errors where non-ASCII UTF-8 characters are present
367+
# after the first 100K characters of the file.
368+
if enc in ("ascii", "utf_8", "utf-8", "utf8", "utf8mb4"):
367369
return "utf-8-sig"
368370

369371
return enc

functions-python/pmtiles_builder/.coveragerc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ omit =
77

88
[report]
99
exclude_lines =
10-
if __name__ == .__main__.:
10+
pragma: no cover
11+
if __name__ == .__main__.:

functions-python/pmtiles_builder/function_config.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@
22
"name": "pmtiles-builder",
33
"description": "The PMTiles Builder function creates PMTiles from dataset files",
44
"entry_point": "build_pmtiles_handler",
5-
"timeout": 1000,
5+
"timeout": 1680,
66
"memory": "8Gi",
77
"trigger_http": true,
88
"include_folders": ["helpers"],
99
"include_api_folders": ["database_gen", "database", "common"],
1010
"environment_variables": [
1111
{
1212
"key": "DATASETS_BUCKET_NAME"
13+
},
14+
{
15+
"key": "LOGGING_LEVEL"
1316
}
1417
],
1518
"secret_environment_variables": [

functions-python/pmtiles_builder/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,7 @@ python-dotenv==1.0.0
2727
tippecanoe
2828
psutil
2929
pandas
30+
numpy
31+
pympler
32+
3033
charset_normalizer

0 commit comments

Comments
 (0)