Skip to content

Commit 9ea5e71

Browse files
district10TANG ZHIXIONG
andauthored
benchmark, normalize json, sort keys in encoding geobuf (#18)
* add benchmark * add cpp benchmark * python version * lint * add fix * should diff json * denouse double int * strip geometry z all is 0 * fix tab * normalize json * normalize json * reday? * test normalize rapidjson * round geometry * fix * use array * optional * use scale * fix test * extract code to helpers * ready to release --------- Co-authored-by: TANG ZHIXIONG <dvora4tzx@gmail.com>
1 parent 09f837c commit 9ea5e71

File tree

13 files changed

+515
-16
lines changed

13 files changed

+515
-16
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@ dist
44
*.egg-info
55
tests/rapidjson.png.json
66
__pycache__
7+
benchmarks/*.json
8+
benchmarks/*.pbf*

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ repos:
5858
rev: v1.1.13
5959
hooks:
6060
- id: remove-tabs
61-
exclude: ^(docs|Makefile)
61+
exclude: ^(docs|Makefile|benchmarks/Makefile)
6262

6363
- repo: https://github.com/PyCQA/flake8
6464
rev: 3.9.2

benchmarks/Makefile

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Benchmarks for three geobuf command-line toolchains (js / py / cpp) that
# encode and decode the same GeoJSON input.
#
# Override the input with:  make JSON_PATH=other.json <target>
# Recursive invocations use $(MAKE) so flags such as -n and -j propagate to
# the sub-makes.

# Remove every generated artifact (encoded .pbf, decoded json, linted json).
clean:
	rm -f *.pbf *.pbf.json *linted*
.PHONY: clean

JSON_PATH ?= ak_alaska_zip_codes_geo.min.json
# Download the default sample input.
ak_alaska_zip_codes_geo.min.json:
	wget https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/ak_alaska_zip_codes_geo.min.json
pull: ak_alaska_zip_codes_geo.min.json
.PHONY: pull

# --- js: json2geobuf / geobuf2json ------------------------------------------
roundtrip_js:
	$(MAKE) encoding_js
	$(MAKE) decoding_js
encoding_js:
	json2geobuf < $(JSON_PATH) > $(JSON_PATH)__js.pbf
decoding_js:
	geobuf2json < $(JSON_PATH)__js.pbf > $(JSON_PATH)__js.pbf.json
benchmark_js:
	hyperfine --warmup 3 '$(MAKE) encoding_js'
	hyperfine --warmup 3 '$(MAKE) decoding_js'
	# hyperfine --warmup 3 'make roundtrip_js'
.PHONY: roundtrip_js encoding_js decoding_js benchmark_js

# --- py: `geobuf` command ---------------------------------------------------
roundtrip_py:
	$(MAKE) encoding_py
	$(MAKE) decoding_py
encoding_py:
	geobuf encode < $(JSON_PATH) > $(JSON_PATH)__py.pbf
decoding_py:
	geobuf decode < $(JSON_PATH)__py.pbf > $(JSON_PATH)__py.pbf.json
benchmark_py:
	hyperfine --warmup 3 '$(MAKE) encoding_py'
	hyperfine --warmup 3 '$(MAKE) decoding_py'
	# hyperfine --warmup 3 'make roundtrip_py'
.PHONY: roundtrip_py encoding_py decoding_py benchmark_py

# --- cpp: python3 -m pybind11_geobuf ----------------------------------------
roundtrip_cpp:
	$(MAKE) encoding_cpp
	$(MAKE) decoding_cpp
encoding_cpp:
	python3 -m pybind11_geobuf json2geobuf --precision=6 --only_xy=True $(JSON_PATH) $(JSON_PATH)__cpp.pbf
decoding_cpp:
	python3 -m pybind11_geobuf geobuf2json $(JSON_PATH)__cpp.pbf $(JSON_PATH)__cpp.pbf.json
benchmark_cpp:
	hyperfine --warmup 3 '$(MAKE) encoding_cpp'
	hyperfine --warmup 3 '$(MAKE) decoding_cpp'
	# hyperfine --warmup 3 'make roundtrip_cpp'
.PHONY: roundtrip_cpp encoding_cpp decoding_cpp benchmark_cpp

# Run the js, py and cpp benchmarks one after another.
benchmark_all:
	$(MAKE) benchmark_js
	$(MAKE) benchmark_py
	$(MAKE) benchmark_cpp
.PHONY: benchmark_all

# Normalize the input and the decoded outputs so they can be compared.
# The py output is currently not normalized (line kept commented out).
normalize:
	python3 -m pybind11_geobuf normalize_json $(JSON_PATH) --output_path linted_input.json
	python3 -m pybind11_geobuf normalize_json $(JSON_PATH)__js.pbf.json --output_path linted_js.json
	# python3 -m pybind11_geobuf normalize_json $(JSON_PATH)__py.pbf.json --output_path linted_py.json
	python3 -m pybind11_geobuf normalize_json $(JSON_PATH)__cpp.pbf.json --output_path linted_cpp.json && python3 fix.py linted_cpp.json
.PHONY: normalize

# Open the normalized files side by side in vim tabs.
# NOTE: linted_py.json is only produced if the commented-out line in
# `normalize` is re-enabled.
check:
	vim -p linted_input.json linted_js.json linted_py.json linted_cpp.json
.PHONY: check

benchmarks/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
make benchmark_all --no-print-directory

benchmarks/diffjson.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
#
# Show a diff of two JSON files after normalizing their key order with jq.
# The diff is paged through `less -R`.
#
# Usage: diffjson.sh file1.json file2.json

set -euo pipefail

# "$0" is quoted so a script path containing spaces still yields a usable
# usage message.
file1="${1:?
Usage: $(basename "$0") file1.json file2.json
Show diff of JSON, normalizing key ordering.
}"

file2="${2:?Missing file2}"

# Note: with `pipefail`, a non-zero status from diff propagates as the script's
# exit status.
diff <(jq --sort-keys . < "$file1") <(jq --sort-keys . < "$file2") | less -R

benchmarks/fix.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
import sys
3+
4+
# Parse the JSON document named by the first command-line argument.
with open(sys.argv[1], mode="r", encoding="utf-8") as source:
    data = json.loads(source.read())
6+
7+
8+
def list_chop_z(data):
    """Drop a zero z component from ``[x, y, z]`` positions nested in ``data``.

    ``data`` is modified in place. A list is treated as a position when its
    first element is a number and it has exactly three elements; the third
    element is removed when it compares equal to ``0.0``. All other lists are
    walked recursively. Non-list values and empty lists are left untouched.

    Integer coordinates are accepted as well as floats, because integral
    values may be serialized without a fractional part (bools are excluded).

    Returns:
        None.
    """
    # Guard against empty lists: indexing data[0] would raise IndexError.
    if not isinstance(data, list) or not data:
        return
    head = data[0]
    head_is_number = isinstance(head, (int, float)) and not isinstance(head, bool)
    if head_is_number and len(data) == 3 and data[2] == 0.0:
        data.pop()
    for child in data:
        list_chop_z(child)
15+
16+
17+
# Chop zero z values from the geometry of every feature, then rewrite the
# input file in place.
for feature in data["features"]:
    # Walk the geometry and any nested GeometryCollection members. A feature
    # may carry "geometry": null, and a GeometryCollection has "geometries"
    # instead of "coordinates", so neither key is assumed to exist.
    pending = [feature.get("geometry")]
    while pending:
        geometry = pending.pop()
        if not isinstance(geometry, dict):
            continue
        # list_chop_z ignores non-list values, so a missing key is harmless.
        list_chop_z(geometry.get("coordinates"))
        pending.extend(geometry.get("geometries") or [])

with open(sys.argv[1], "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

docs/about/release-notes.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ To upgrade `pybind11-geobuf` to the latest version, use pip:
1010
pip install -U pybind11-geobuf
1111
```
1212

13+
## Version 0.1.1 (2023-04-09)
14+
15+
* benchmark (speed: cpp > js > python)
16+
* normalize json: round geometry and non-geometry parts
17+
* sort keys in encoding geobuf
18+
1319
## Version 0.1.0 (2023-04-01)
1420

1521
* round z in geobuf encoding

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def build_extension(self, ext):
127127
# logic and declaration, and simpler if you include description/version in a file.
128128
setup(
129129
name="pybind11_geobuf",
130-
version="0.1.0",
130+
version="0.1.1",
131131
author="tzx",
132132
author_email="dvorak4tzx@gmail.com",
133133
url="https://geobuf-cpp.readthedocs.io",

src/geobuf/geobuf.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -477,9 +477,17 @@ void Encoder::writeProps(const mapbox::feature::property_map &props,
477477
{
478478
std::vector<uint32_t> indexes;
479479
int valueIndex = 0;
480+
std::vector<std::pair<std::string, const mapbox::feature::value *>> kv;
481+
kv.reserve(props.size());
480482
for (auto &pair : props) {
483+
kv.emplace_back(pair.first, &pair.second);
484+
}
485+
std::sort(kv.begin(), kv.end(), [](const auto &p1, const auto &p2) {
486+
return p1.first < p2.first;
487+
});
488+
for (auto &pair : kv) {
481489
protozero::pbf_writer pbf_value{pbf, 13};
482-
writeValue(pair.second, pbf_value);
490+
writeValue(*pair.second, pbf_value);
483491
indexes.push_back(keys.at(pair.first));
484492
indexes.push_back(valueIndex++);
485493
}

src/geobuf/rapidjson_helpers.hpp

Lines changed: 211 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,217 @@ inline void round_rapidjson(RapidjsonValue &json, double scale, int depth = 1,
9393
} else if (json.IsDouble()) {
9494
// see round_coords in geojson_helpers
9595
json.SetDouble(std::floor(json.GetDouble() * scale + 0.5) / scale);
96-
} else if (json.IsFloat()) {
97-
json.SetFloat(std::floor(json.GetFloat() * scale + 0.5) / scale);
96+
}
97+
}
98+
99+
/// Round the doubles of a GeoJSON value that live outside its geometry data.
///
/// Dispatches on the string member "type":
///  - Feature: rounds via round_rapidjson, passing "geometry" as the key list.
///  - FeatureCollection: same with "features", then handles every feature.
///  - Point/MultiPoint/LineString/MultiLineString/Polygon/MultiPolygon: same
///    with "coordinates".
///  - GeometryCollection: same with "geometries", then handles every member.
/// Anything else (non-object, missing or non-string "type", unknown type) is
/// left untouched.
///
/// NOTE(review): the last argument of round_rapidjson is presumably the list
/// of keys to skip — confirm against its definition.
///
/// Children of a collection are processed by recursing into this function, so
/// each child applies the key list that matches its own type. Calling
/// round_rapidjson(child, scale) directly passes no key list at all.
/// A missing or non-array "features"/"geometries" member is ignored instead of
/// being dereferenced.
///
/// @param json   value to modify in place
/// @param scale  rounding factor forwarded to round_rapidjson
inline void round_geojson_non_geometry(RapidjsonValue &json, double scale)
{
    if (!json.IsObject()) {
        return;
    }
    const auto type_itr = json.FindMember("type");
    if (type_itr == json.MemberEnd() || !type_itr->value.IsString()) {
        return;
    }
    const std::string type(type_itr->value.GetString(),
                           type_itr->value.GetStringLength());

    // Recurse into each element of the array member `key`, if it exists.
    const auto recurse_into = [&json, scale](const char *key) {
        auto children = json.FindMember(key);
        if (children == json.MemberEnd() || !children->value.IsArray()) {
            return;
        }
        for (auto &child : children->value.GetArray()) {
            round_geojson_non_geometry(child, scale);
        }
    };

    if (type == "Feature") {
        round_rapidjson(json, scale, INT_MAX, {"geometry"});
    } else if (type == "FeatureCollection") {
        round_rapidjson(json, scale, INT_MAX, {"features"});
        recurse_into("features");
    } else if (type == "Point" || type == "MultiPoint" ||
               type == "LineString" || type == "MultiLineString" ||
               type == "Polygon" || type == "MultiPolygon") {
        round_rapidjson(json, scale, INT_MAX, {"coordinates"});
    } else if (type == "GeometryCollection") {
        round_rapidjson(json, scale, INT_MAX, {"geometries"});
        recurse_into("geometries");
    }
}
128+
129+
/// Round coordinate arrays in place, one scale per axis.
///
/// `json` is expected to be a (possibly nested) array. An array whose first
/// element is a number is treated as a position: its first min(3, size)
/// elements are rounded with scale[0..2], but only those stored as doubles.
/// Any other non-empty array is walked element by element. Non-arrays and
/// empty arrays are ignored.
///
/// @param json   coordinate array to modify in place
/// @param scale  per-axis rounding factors
inline void __round_geojson_geometry(RapidjsonValue &json,
                                     const Eigen::Vector3d &scale)
{
    if (!json.IsArray()) {
        return;
    }
    if (json.Empty()) {
        return;
    }
    if (json[0].IsNumber()) {
        // Leaf position: round up to three axes.
        const int dims = std::min(3, static_cast<int>(json.Size()));
        for (int axis = 0; axis != dims; ++axis) {
            auto &coord = json[axis];
            if (!coord.IsDouble()) {
                continue;
            }
            const double rounded =
                std::floor(coord.GetDouble() * scale[axis] + 0.5) /
                scale[axis];
            coord.SetDouble(rounded);
        }
        return;
    }
    // Container of positions (or of further containers).
    for (auto &child : json.GetArray()) {
        __round_geojson_geometry(child, scale);
    }
}
149+
150+
/// Round the coordinates of a GeoJSON value in place, one scale per axis.
///
/// Dispatches on the string member "type":
///  - Feature: handles its "geometry" member.
///  - FeatureCollection: handles the "geometry" member of every feature.
///  - Point/MultiPoint/LineString/MultiLineString/Polygon/MultiPolygon: rounds
///    "coordinates" with __round_geojson_geometry.
///  - GeometryCollection: handles every element of "geometries".
/// Non-objects, values without a string "type" and unknown types are ignored.
///
/// Members are looked up with FindMember and checked before use, so a missing
/// member or a non-array "features"/"geometries" is skipped rather than
/// dereferenced.
///
/// @param json   value to modify in place
/// @param scale  per-axis rounding factors
inline void round_geojson_geometry(RapidjsonValue &json,
                                   const Eigen::Vector3d &scale)
{
    if (!json.IsObject()) {
        return;
    }
    const auto type_itr = json.FindMember("type");
    if (type_itr == json.MemberEnd() || !type_itr->value.IsString()) {
        return;
    }
    const std::string type(type_itr->value.GetString(),
                           type_itr->value.GetStringLength());

    // Member `key` of `obj`, or nullptr when obj is not an object or has no
    // such member.
    const auto member = [](RapidjsonValue &obj,
                           const char *key) -> RapidjsonValue * {
        if (!obj.IsObject()) {
            return nullptr;
        }
        auto found = obj.FindMember(key);
        return found == obj.MemberEnd() ? nullptr : &found->value;
    };

    if (type == "Feature") {
        if (auto *geometry = member(json, "geometry")) {
            round_geojson_geometry(*geometry, scale);
        }
    } else if (type == "FeatureCollection") {
        auto *features = member(json, "features");
        if (features != nullptr && features->IsArray()) {
            for (auto &f : features->GetArray()) {
                if (auto *geometry = member(f, "geometry")) {
                    round_geojson_geometry(*geometry, scale);
                }
            }
        }
    } else if (type == "Point" || type == "MultiPoint" ||
               type == "LineString" || type == "MultiLineString" ||
               type == "Polygon" || type == "MultiPolygon") {
        if (auto *coordinates = member(json, "coordinates")) {
            __round_geojson_geometry(*coordinates, scale);
        }
    } else if (type == "GeometryCollection") {
        auto *geometries = member(json, "geometries");
        if (geometries != nullptr && geometries->IsArray()) {
            for (auto &g : geometries->GetArray()) {
                round_geojson_geometry(g, scale);
            }
        }
    }
}
178+
179+
/// Recursively replace integral-valued doubles with integer JSON numbers.
///
/// Arrays and objects are walked depth-first. A double whose value equals its
/// floor is stored as a Uint64 when non-negative and as an Int64 when
/// negative. All other values are left untouched.
///
/// Doubles outside the representable integer range (including infinities)
/// stay doubles. The range is checked before the cast, because converting an
/// out-of-range floating-point value to an integer type is undefined
/// behaviour.
///
/// @param json  value to modify in place
inline void denoise_double_0_rapidjson(RapidjsonValue &json)
{
    if (json.IsArray()) {
        for (auto &e : json.GetArray()) {
            denoise_double_0_rapidjson(e);
        }
    } else if (json.IsObject()) {
        auto obj = json.GetObject();
        for (auto &kv : obj) {
            denoise_double_0_rapidjson(kv.value);
        }
    } else if (json.IsDouble()) {
        const double d = json.GetDouble();
        if (std::floor(d) != d) {
            return; // fractional value (NaN also lands here)
        }
        // Both bounds are powers of two and therefore exact as doubles.
        constexpr double kUint64End = 18446744073709551616.0; // 2^64
        constexpr double kInt64Min = -9223372036854775808.0;  // -2^63
        if (d >= 0) {
            if (d < kUint64End) {
                json.SetUint64(static_cast<uint64_t>(d));
            }
        } else if (d >= kInt64Min) {
            json.SetInt64(static_cast<int64_t>(d));
        }
    }
}
207+
208+
/// Tell whether every position in a coordinate array has a z equal to 0.
///
/// Accepted shapes: [x, y, 0.0], [[x, y, 0.0], ...], [[[x, y, 0.0], ..], ..]
///
/// @return false for a non-array; true for an empty array; for an array whose
///         first element is a number, true only if it has exactly three
///         elements and the third is a number equal to 0.0; otherwise true
///         only if every element satisfies this predicate.
inline bool __all_is_z0(RapidjsonValue &json)
{
    if (!json.IsArray()) {
        return false;
    }
    if (json.Empty()) {
        return true;
    }
    if (json[0].IsNumber()) {
        // Leaf position: needs exactly three components with a zero z.
        const bool has_numeric_z = json.Size() == 3 && json[2].IsNumber();
        return has_numeric_z && json[2].GetDouble() == 0.0;
    }
    // Container: every child must qualify.
    for (auto &child : json.GetArray()) {
        if (!__all_is_z0(child)) {
            return false;
        }
    }
    return true;
}
230+
231+
/// Remove the third component from every three-element position in a
/// coordinate array, in place.
///
/// An array whose first element is a number is treated as a position and
/// loses its last element when it has exactly three. Other non-empty arrays
/// are walked recursively. This function does not check that the removed
/// value is zero; strip_geometry_z_0 checks with __all_is_z0 before calling.
inline void __strip_geometry_z_0(RapidjsonValue &json)
{
    const bool walkable = json.IsArray() && !json.Empty();
    if (!walkable) {
        return;
    }
    if (json[0].IsNumber()) {
        // Leaf position.
        if (json.Size() == 3) {
            json.PopBack();
        }
        return;
    }
    for (auto &child : json.GetArray()) {
        __strip_geometry_z_0(child);
    }
}
246+
247+
/// Drop the z component of GeoJSON coordinates when every z is zero.
///
/// For an object, dispatches on the string member "type":
///  - Feature: handles its "geometry" member.
///  - FeatureCollection: handles the "geometry" member of every feature.
///  - Point/MultiPoint/LineString/MultiLineString/Polygon/MultiPolygon:
///    handles "coordinates".
///  - GeometryCollection: handles every element of "geometries".
/// For a non-object (a coordinate array), the z components are stripped only
/// if __all_is_z0 reports that all of them are zero. The check is therefore
/// made once per "coordinates" array, not across the whole document.
///
/// Members are looked up with FindMember and checked before use, so a missing
/// member or a non-array "features"/"geometries" is skipped rather than
/// dereferenced.
///
/// @param json  value to modify in place
inline void strip_geometry_z_0(RapidjsonValue &json)
{
    if (json.IsObject()) {
        const auto type_itr = json.FindMember("type");
        if (type_itr == json.MemberEnd() || !type_itr->value.IsString()) {
            return;
        }
        const std::string type(type_itr->value.GetString(),
                               type_itr->value.GetStringLength());

        // Member `key` of `obj`, or nullptr when obj is not an object or has
        // no such member.
        const auto member = [](RapidjsonValue &obj,
                               const char *key) -> RapidjsonValue * {
            if (!obj.IsObject()) {
                return nullptr;
            }
            auto found = obj.FindMember(key);
            return found == obj.MemberEnd() ? nullptr : &found->value;
        };

        if (type == "Feature") {
            if (auto *geometry = member(json, "geometry")) {
                strip_geometry_z_0(*geometry);
            }
        } else if (type == "FeatureCollection") {
            auto *features = member(json, "features");
            if (features != nullptr && features->IsArray()) {
                for (auto &f : features->GetArray()) {
                    if (auto *geometry = member(f, "geometry")) {
                        strip_geometry_z_0(*geometry);
                    }
                }
            }
        } else if (type == "Point" || type == "MultiPoint" ||
                   type == "LineString" || type == "MultiLineString" ||
                   type == "Polygon" || type == "MultiPolygon") {
            if (auto *coordinates = member(json, "coordinates")) {
                strip_geometry_z_0(*coordinates);
            }
        } else if (type == "GeometryCollection") {
            auto *geometries = member(json, "geometries");
            if (geometries != nullptr && geometries->IsArray()) {
                for (auto &g : geometries->GetArray()) {
                    strip_geometry_z_0(g);
                }
            }
        }
        return;
    }

    // Coordinate array: strip only when every position has z == 0.
    if (!__all_is_z0(json)) {
        return;
    }
    __strip_geometry_z_0(json);
}
279+
280+
inline void
281+
normalize_json(RapidjsonValue &json, //
282+
bool sort_keys = true, //
283+
std::optional<int> round_geojson_non_geometry = 3, //
284+
const std::optional<std::array<int, 3>> &round_geojson_geometry =
285+
std::array<int, 3>{8, 8, 3}, //
286+
bool denoise_double_0 = true, //
287+
bool strip_geometry_z_0 = true)
288+
{
289+
if (sort_keys) {
290+
sort_keys_inplace(json);
291+
}
292+
if (round_geojson_non_geometry) {
293+
double scale = std::pow(10.0, *round_geojson_non_geometry);
294+
cubao::round_geojson_non_geometry(json, scale);
295+
}
296+
if (round_geojson_geometry) {
297+
auto &precision = *round_geojson_geometry;
298+
cubao::round_geojson_geometry(json, {std::pow(10.0, precision[0]),
299+
std::pow(10.0, precision[1]),
300+
std::pow(10.0, precision[2])});
301+
}
302+
if (strip_geometry_z_0) {
303+
cubao::strip_geometry_z_0(json);
304+
}
305+
if (denoise_double_0) {
306+
denoise_double_0_rapidjson(json);
98307
}
99308
}
100309

0 commit comments

Comments
 (0)