|
13 | 13 | "## Baby steps\n", |
14 | 14 | "\n", |
15 | 15 | "First, though, we want to explore the performance characteristics of our API over page size.\n", |
16 | | - "Let's start with the default page size (10)." |
| 16 | + "Let's start with the default page size." |
17 | 17 | ] |
18 | 18 | }, |
19 | 19 | { |
20 | 20 | "cell_type": "code", |
21 | | - "execution_count": 3, |
| 21 | + "execution_count": 20, |
22 | 22 | "metadata": {}, |
23 | 23 | "outputs": [], |
24 | 24 | "source": [ |
|
29 | 29 | }, |
30 | 30 | { |
31 | 31 | "cell_type": "code", |
32 | | - "execution_count": 4, |
| 32 | + "execution_count": 21, |
33 | 33 | "metadata": {}, |
34 | 34 | "outputs": [ |
35 | 35 | { |
36 | 36 | "name": "stdout", |
37 | 37 | "output_type": "stream", |
38 | 38 | "text": [ |
39 | | - "Retrieved 100 in 32.19s (3.11 items/s)\n" |
| 39 | + "Retrieved 100 in 17.31s (5.78 items/s)\n" |
40 | 40 | ] |
41 | 41 | } |
42 | 42 | ], |
|
53 | 53 | "metadata": {}, |
54 | 54 | "source": [ |
55 | 55 | "That's not excellent.\n", |
56 | | - "Let's try bumping it all the way up." |
| 56 | + "Let's try bumping it up." |
57 | 57 | ] |
58 | 58 | }, |
59 | 59 | { |
60 | 60 | "cell_type": "code", |
61 | | - "execution_count": 5, |
| 61 | + "execution_count": 22, |
62 | 62 | "metadata": {}, |
63 | 63 | "outputs": [ |
64 | 64 | { |
65 | 65 | "name": "stdout", |
66 | 66 | "output_type": "stream", |
67 | 67 | "text": [ |
68 | | - "Retrieved 100 in 3.25s (30.73 items/s)\n" |
| 68 | + "Retrieved 100 in 1.69s (59.16 items/s)\n" |
69 | 69 | ] |
70 | 70 | } |
71 | 71 | ], |
|
89 | 89 | }, |
90 | 90 | { |
91 | 91 | "cell_type": "code", |
92 | | - "execution_count": 6, |
| 92 | + "execution_count": 23, |
93 | 93 | "metadata": {}, |
94 | 94 | "outputs": [ |
95 | 95 | { |
96 | 96 | "name": "stdout", |
97 | 97 | "output_type": "stream", |
98 | 98 | "text": [ |
99 | | - "Retrieved 2000 in 4.63s (432.17 items/s)\n" |
| 99 | + "Retrieved 2000 in 3.04s (656.88 items/s)\n", |
| 100 | + "Retrieved 5000 in 4.96s (1008.75 items/s)\n" |
100 | 101 | ] |
101 | 102 | } |
102 | 103 | ], |
|
106 | 107 | " items = list(\n", |
107 | 108 | " client.search(collections=[\"naip\"], max_items=2000, limit=2000).items_as_dicts()\n", |
108 | 109 | " )\n", |
| 110 | + " timer.report(items)\n", |
| 111 | + "\n", |
| 112 | + "with Timer() as timer:\n", |
| 113 | + " items = list(\n", |
| 114 | + " client.search(collections=[\"naip\"], max_items=5000, limit=5000).items_as_dicts()\n", |
| 115 | + " )\n", |
109 | 116 | " timer.report(items)" |
110 | 117 | ] |
111 | 118 | }, |
|
120 | 127 | }, |
121 | 128 | { |
122 | 129 | "cell_type": "code", |
123 | | - "execution_count": 7, |
| 130 | + "execution_count": 24, |
124 | 131 | "metadata": {}, |
125 | 132 | "outputs": [ |
126 | 133 | { |
127 | 134 | "name": "stdout", |
128 | 135 | "output_type": "stream", |
129 | 136 | "text": [ |
130 | | - "Retrieved 10000 in 10.64s (939.65 items/s)\n" |
| 137 | + "Retrieved 10000 in 8.23s (1215.79 items/s)\n" |
131 | 138 | ] |
132 | 139 | } |
133 | 140 | ], |
|
147 | 154 | "metadata": {}, |
148 | 155 | "source": [ |
149 | 156 | "One neat feature of **stac-geoparquet** is that we can query it directly using **DuckDB** from our client.\n", |
150 | | - "[stacrs](https://stac-utils.github.io/stacrs/) is a relatively new Python library that can do that.\n", |
| 157 | + "[stacrs](https://stac-utils.github.io/stacrs/) can do that.\n", |
151 | 158 | "What happens when we hit our **stac-geoparquet** in an s3 bucket directly?\n", |
152 | 159 | "\n", |
153 | 160 | "!!! note \"You need to configure your AWS account, either w/ access to the bucket via the eoAPI sub-account, or with requestor pays\"" |
|
162 | 169 | "name": "stdout", |
163 | 170 | "output_type": "stream", |
164 | 171 | "text": [ |
165 | | - "Retrieved 10000 in 1.60s (6239.00 items/s)\n" |
| 172 | + "Retrieved 10000 in 1.40s (7137.57 items/s)\n" |
166 | 173 | ] |
167 | 174 | } |
168 | 175 | ], |
|
172 | 179 | "from labs_375 import NAIP_GEOPARQUET_URI\n", |
173 | 180 | "\n", |
174 | 181 | "client = DuckdbClient()\n", |
| 182 | + "client.execute(\"CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN)\")\n", |
175 | 183 | "with Timer() as timer:\n", |
176 | 184 | " items = client.search(\n", |
177 | 185 | " NAIP_GEOPARQUET_URI,\n", |
178 | | - " )[\"features\"]\n", |
| 186 | + " )\n", |
| 187 | + " timer.report(items)" |
| 188 | + ] |
| 189 | + }, |
| 190 | + { |
| 191 | + "cell_type": "markdown", |
| 192 | + "metadata": {}, |
| 193 | + "source": [ |
| 194 | + "## Comparison with pgstac\n", |
| 195 | + "\n", |
| 196 | + "We've got the same items loaded into a [pgstac](https://github.com/stac-utils/pgstac) database, with a [stac-fastapi-pgstac](https://github.com/stac-utils/stac-fastapi-pgstac) serving them over HTTP.\n", |
| 197 | + "Let's try the same tests against that server, except for the full scan case — that one times out." |
| 198 | + ] |
| 199 | + }, |
| 200 | + { |
| 201 | + "cell_type": "code", |
| 202 | + "execution_count": 28, |
| 203 | + "metadata": {}, |
| 204 | + "outputs": [ |
| 205 | + { |
| 206 | + "name": "stdout", |
| 207 | + "output_type": "stream", |
| 208 | + "text": [ |
| 209 | + "Retrieved 100 in 1.01s (99.10 items/s)\n", |
| 210 | + "Retrieved 100 in 0.21s (484.68 items/s)\n", |
| 211 | + "Retrieved 2000 in 2.72s (734.03 items/s)\n", |
| 212 | + "Retrieved 5000 in 6.96s (718.13 items/s)\n" |
| 213 | + ] |
| 214 | + } |
| 215 | + ], |
| 216 | + "source": [ |
| 217 | + "from labs_375 import STAC_FASTAPI_PGSTAC_URI\n", |
| 218 | + "\n", |
| 219 | + "client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n", |
| 220 | + "\n", |
| 221 | + "with Timer() as timer:\n", |
| 222 | + " items = list(client.search(collections=[\"naip\"], max_items=100).items_as_dicts())\n", |
| 223 | + " timer.report(items)\n", |
| 224 | + "\n", |
| 225 | + "with Timer() as timer:\n", |
| 226 | + " items = list(\n", |
| 227 | + " client.search(collections=[\"naip\"], max_items=100, limit=100).items_as_dicts()\n", |
| 228 | + " )\n", |
| 229 | + " timer.report(items)\n", |
| 230 | + "\n", |
| 231 | + "with Timer() as timer:\n", |
| 232 | + " items = list(\n", |
| 233 | + " client.search(collections=[\"naip\"], max_items=2000, limit=2000).items_as_dicts()\n", |
| 234 | + " )\n", |
| 235 | + " timer.report(items)\n", |
| 236 | + "\n", |
| 237 | + "with Timer() as timer:\n", |
| 238 | + " items = list(\n", |
| 239 | + " client.search(collections=[\"naip\"], max_items=5000, limit=5000).items_as_dicts()\n", |
| 240 | + " )\n", |
| 241 | + " timer.report(items)" |
| 242 | + ] |
| 243 | + }, |
| 244 | + { |
| 245 | + "cell_type": "markdown", |
| 246 | + "metadata": {}, |
| 247 | + "source": [ |
| 248 | + "## Sorting\n", |
| 249 | + "\n", |
| 250 | + "It looks like there's about equal performance in the 2000 item case, so let's use that point to explore how sorting affects performance.\n", |
| 251 | + "Our best guess is that **pgstac** will perform better, since it's a database!\n", |
| 252 | + "Let's see." |
| 253 | + ] |
| 254 | + }, |
| 255 | + { |
| 256 | + "cell_type": "code", |
| 257 | + "execution_count": 34, |
| 258 | + "metadata": {}, |
| 259 | + "outputs": [ |
| 260 | + { |
| 261 | + "name": "stderr", |
| 262 | + "output_type": "stream", |
| 263 | + "text": [ |
| 264 | + "/Users/gadomski/Code/developmentseed/labs-375-stac-geoparquet-backend/.venv/lib/python3.12/site-packages/pystac_client/item_search.py:442: DoesNotConformTo: Server does not conform to SORT\n", |
| 265 | + " warnings.warn(DoesNotConformTo(\"SORT\"))\n" |
| 266 | + ] |
| 267 | + }, |
| 268 | + { |
| 269 | + "name": "stdout", |
| 270 | + "output_type": "stream", |
| 271 | + "text": [ |
| 272 | + "geoparquet datetime\n", |
| 273 | + "Retrieved 2000 in 3.03s (660.35 items/s)\n", |
| 274 | + "pgstac datetime\n", |
| 275 | + "Retrieved 2000 in 2.98s (672.20 items/s)\n", |
| 276 | + "\n", |
| 277 | + "geoparquet -datetime\n", |
| 278 | + "Retrieved 2000 in 2.78s (718.80 items/s)\n", |
| 279 | + "pgstac -datetime\n", |
| 280 | + "Retrieved 2000 in 2.90s (688.56 items/s)\n", |
| 281 | + "\n", |
| 282 | + "geoparquet naip:year\n", |
| 283 | + "Retrieved 2000 in 2.80s (714.57 items/s)\n", |
| 284 | + "pgstac naip:year\n", |
| 285 | + "Retrieved 2000 in 3.08s (650.32 items/s)\n", |
| 286 | + "\n" |
| 287 | + ] |
| 288 | + } |
| 289 | + ], |
| 290 | + "source": [ |
| 291 | + "geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n", |
| 292 | + "pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n", |
| 293 | + "\n", |
| 294 | + "for sortby in [\"datetime\", \"-datetime\", \"naip:year\"]:\n", |
| 295 | + " with Timer() as timer:\n", |
| 296 | + " items = list(\n", |
| 297 | + " geoparquet_client.search(\n", |
| 298 | + " collections=[\"naip\"], sortby=sortby, max_items=2000, limit=2000\n", |
| 299 | + " ).items_as_dicts()\n", |
| 300 | + " )\n", |
| 301 | + " print(\"geoparquet\", sortby)\n", |
| 302 | + " timer.report(items)\n", |
| 303 | + " with Timer() as timer:\n", |
| 304 | + " items = list(\n", |
| 305 | + " pgstac_client.search(\n", |
| 306 | + " collections=[\"naip\"], sortby=sortby, max_items=2000, limit=2000\n", |
| 307 | + " ).items_as_dicts()\n", |
| 308 | + " )\n", |
| 309 | + " print(\"pgstac\", sortby)\n", |
| 310 | + " timer.report(items)\n", |
| 311 | + "\n", |
| 312 | + " print()" |
| 313 | + ] |
| 314 | + }, |
| 315 | + { |
| 316 | + "cell_type": "markdown", |
| 317 | + "metadata": {}, |
| 318 | + "source": [ |
| 319 | + "## Fields\n", |
| 320 | + "\n", |
| 321 | + "One of the \"sells\" of (geo)parquet is that you don't need to fetch the entirety of the data, if you only need a few of the fields.\n", |
| 322 | + "For example, if you're only visualizing the STAC items, you might just return the `id` and the `geometry`.\n", |
| 323 | + "How do the two backends perform in this scenario?\n", |
| 324 | + "Let's also test against the direct access (without the API server)." |
| 325 | + ] |
| 326 | + }, |
| 327 | + { |
| 328 | + "cell_type": "code", |
| 329 | + "execution_count": null, |
| 330 | + "metadata": {}, |
| 331 | + "outputs": [ |
| 332 | + { |
| 333 | + "name": "stderr", |
| 334 | + "output_type": "stream", |
| 335 | + "text": [ |
| 336 | + "/Users/gadomski/Code/developmentseed/labs-375-stac-geoparquet-backend/.venv/lib/python3.12/site-packages/pystac_client/item_search.py:480: DoesNotConformTo: Server does not conform to FIELDS\n", |
| 337 | + " warnings.warn(DoesNotConformTo(\"FIELDS\"))\n" |
| 338 | + ] |
| 339 | + }, |
| 340 | + { |
| 341 | + "name": "stdout", |
| 342 | + "output_type": "stream", |
| 343 | + "text": [ |
| 344 | + "geoparquet\n", |
| 345 | + "Retrieved 2000 in 2.97s (672.97 items/s)\n", |
| 346 | + "pgstac\n", |
| 347 | + "Retrieved 2000 in 1.60s (1251.75 items/s)\n", |
| 348 | + "Retrieved 2000 in 1.12s (1778.71 items/s)\n" |
| 349 | + ] |
| 350 | + } |
| 351 | + ], |
| 352 | + "source": [ |
| 353 | + "geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n", |
| 354 | + "pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n", |
| 355 | + "duckdb_client = DuckdbClient()\n", |
| 356 | + "duckdb_client.execute(\"CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN)\")\n", |
| 357 | + "\n", |
| 358 | + "with Timer() as timer:\n", |
| 359 | + " items = list(\n", |
| 360 | + " geoparquet_client.search(\n", |
| 361 | + " collections=[\"naip\"], fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n", |
| 362 | + " ).items_as_dicts()\n", |
| 363 | + " )\n", |
| 364 | + " print(\"geoparquet\")\n", |
| 365 | + " timer.report(items)\n", |
| 366 | + "\n", |
| 367 | + "with Timer() as timer:\n", |
| 368 | + " items = list(\n", |
| 369 | + " pgstac_client.search(\n", |
| 370 | + " collections=[\"naip\"], fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n", |
| 371 | + " ).items_as_dicts()\n", |
| 372 | + " )\n", |
| 373 | + " print(\"pgstac\")\n", |
| 374 | + " timer.report(items)\n", |
| 375 | + "\n", |
| 376 | + "with Timer() as timer:\n", |
| 377 | + " items = duckdb_client.search(\n", |
| 378 | + " NAIP_GEOPARQUET_URI, fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n", |
| 379 | + " )\n", |
| 380 | + " print(\"duckdb\")\n", |
179 | 381 | " timer.report(items)" |
180 | 382 | ] |
181 | 383 | } |
|
0 commit comments