Skip to content
This repository was archived by the owner on Jul 21, 2025. It is now read-only.

Commit 569065f

Browse files
authored
feat: some updates to the full scan (#79)
1 parent b2e17f8 commit 569065f

File tree

6 files changed

+469
-18
lines changed

6 files changed

+469
-18
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ cdk diff # to show any differences
4242

4343
### pgstac
4444

45-
The **pgstac** database's connection parameters live in the `pgstac-db > db > Secret` resource in the `stac-fastapi-geoparquet-labs-375-pgstac` CloudFormation stack.
45+
The **pgstac** database's connection parameters live in the `pgstac-db > db > Secret` resource in the `stac-fastapi-geoparquet-labs-375-infra` CloudFormation stack.
4646

4747
## Releasing and deploying
4848

docs/katas/0_full_scan.ipynb

Lines changed: 216 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@
1313
"## Baby steps\n",
1414
"\n",
1515
"First, though, we want to explore the performance characteristics of our API over page size.\n",
16-
"Let's start with the default page size (10)."
16+
"Let's start with the default page size."
1717
]
1818
},
1919
{
2020
"cell_type": "code",
21-
"execution_count": 3,
21+
"execution_count": 20,
2222
"metadata": {},
2323
"outputs": [],
2424
"source": [
@@ -29,14 +29,14 @@
2929
},
3030
{
3131
"cell_type": "code",
32-
"execution_count": 4,
32+
"execution_count": 21,
3333
"metadata": {},
3434
"outputs": [
3535
{
3636
"name": "stdout",
3737
"output_type": "stream",
3838
"text": [
39-
"Retrieved 100 in 32.19s (3.11 items/s)\n"
39+
"Retrieved 100 in 17.31s (5.78 items/s)\n"
4040
]
4141
}
4242
],
@@ -53,19 +53,19 @@
5353
"metadata": {},
5454
"source": [
5555
"That's not excellent.\n",
56-
"Let's try bumping it all the way up."
56+
"Let's try bumping it up."
5757
]
5858
},
5959
{
6060
"cell_type": "code",
61-
"execution_count": 5,
61+
"execution_count": 22,
6262
"metadata": {},
6363
"outputs": [
6464
{
6565
"name": "stdout",
6666
"output_type": "stream",
6767
"text": [
68-
"Retrieved 100 in 3.25s (30.73 items/s)\n"
68+
"Retrieved 100 in 1.69s (59.16 items/s)\n"
6969
]
7070
}
7171
],
@@ -89,14 +89,15 @@
8989
},
9090
{
9191
"cell_type": "code",
92-
"execution_count": 6,
92+
"execution_count": 23,
9393
"metadata": {},
9494
"outputs": [
9595
{
9696
"name": "stdout",
9797
"output_type": "stream",
9898
"text": [
99-
"Retrieved 2000 in 4.63s (432.17 items/s)\n"
99+
"Retrieved 2000 in 3.04s (656.88 items/s)\n",
100+
"Retrieved 5000 in 4.96s (1008.75 items/s)\n"
100101
]
101102
}
102103
],
@@ -106,6 +107,12 @@
106107
" items = list(\n",
107108
" client.search(collections=[\"naip\"], max_items=2000, limit=2000).items_as_dicts()\n",
108109
" )\n",
110+
" timer.report(items)\n",
111+
"\n",
112+
"with Timer() as timer:\n",
113+
" items = list(\n",
114+
" client.search(collections=[\"naip\"], max_items=5000, limit=5000).items_as_dicts()\n",
115+
" )\n",
109116
" timer.report(items)"
110117
]
111118
},
@@ -120,14 +127,14 @@
120127
},
121128
{
122129
"cell_type": "code",
123-
"execution_count": 7,
130+
"execution_count": 24,
124131
"metadata": {},
125132
"outputs": [
126133
{
127134
"name": "stdout",
128135
"output_type": "stream",
129136
"text": [
130-
"Retrieved 10000 in 10.64s (939.65 items/s)\n"
137+
"Retrieved 10000 in 8.23s (1215.79 items/s)\n"
131138
]
132139
}
133140
],
@@ -147,7 +154,7 @@
147154
"metadata": {},
148155
"source": [
149156
"One neat feature of **stac-geoparquet** is that we can query it directly using **DuckDB** from our client.\n",
150-
"[stacrs](https://stac-utils.github.io/stacrs/) is a relatively new Python library that can do that.\n",
157+
"[stacrs](https://stac-utils.github.io/stacrs/) can do that.\n",
151158
"What happens when we hit our **stac-geoparquet** in an s3 bucket directly?\n",
152159
"\n",
153160
"!!! note \"You need to configure your AWS account, either with access to the bucket via the eoAPI sub-account, or with requester pays\""
@@ -162,7 +169,7 @@
162169
"name": "stdout",
163170
"output_type": "stream",
164171
"text": [
165-
"Retrieved 10000 in 1.60s (6239.00 items/s)\n"
172+
"Retrieved 10000 in 1.40s (7137.57 items/s)\n"
166173
]
167174
}
168175
],
@@ -172,10 +179,205 @@
172179
"from labs_375 import NAIP_GEOPARQUET_URI\n",
173180
"\n",
174181
"client = DuckdbClient()\n",
182+
"client.execute(\"CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN)\")\n",
175183
"with Timer() as timer:\n",
176184
" items = client.search(\n",
177185
" NAIP_GEOPARQUET_URI,\n",
178-
" )[\"features\"]\n",
186+
" )\n",
187+
" timer.report(items)"
188+
]
189+
},
190+
{
191+
"cell_type": "markdown",
192+
"metadata": {},
193+
"source": [
194+
"## Comparison with pgstac\n",
195+
"\n",
196+
"We've got the same items loaded into a [pgstac](https://github.com/stac-utils/pgstac) database, with a [stac-fastapi-pgstac](https://github.com/stac-utils/stac-fastapi-pgstac) serving them over HTTP.\n",
197+
"Let's try the same tests against that server, except for the full scan case — that one times out."
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": 28,
203+
"metadata": {},
204+
"outputs": [
205+
{
206+
"name": "stdout",
207+
"output_type": "stream",
208+
"text": [
209+
"Retrieved 100 in 1.01s (99.10 items/s)\n",
210+
"Retrieved 100 in 0.21s (484.68 items/s)\n",
211+
"Retrieved 2000 in 2.72s (734.03 items/s)\n",
212+
"Retrieved 5000 in 6.96s (718.13 items/s)\n"
213+
]
214+
}
215+
],
216+
"source": [
217+
"from labs_375 import STAC_FASTAPI_PGSTAC_URI\n",
218+
"\n",
219+
"client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
220+
"\n",
221+
"with Timer() as timer:\n",
222+
" items = list(client.search(collections=[\"naip\"], max_items=100).items_as_dicts())\n",
223+
" timer.report(items)\n",
224+
"\n",
225+
"with Timer() as timer:\n",
226+
" items = list(\n",
227+
" client.search(collections=[\"naip\"], max_items=100, limit=100).items_as_dicts()\n",
228+
" )\n",
229+
" timer.report(items)\n",
230+
"\n",
231+
"with Timer() as timer:\n",
232+
" items = list(\n",
233+
" client.search(collections=[\"naip\"], max_items=2000, limit=2000).items_as_dicts()\n",
234+
" )\n",
235+
" timer.report(items)\n",
236+
"\n",
237+
"with Timer() as timer:\n",
238+
" items = list(\n",
239+
" client.search(collections=[\"naip\"], max_items=5000, limit=5000).items_as_dicts()\n",
240+
" )\n",
241+
" timer.report(items)"
242+
]
243+
},
244+
{
245+
"cell_type": "markdown",
246+
"metadata": {},
247+
"source": [
248+
"## Sorting\n",
249+
"\n",
250+
"It looks like there's about equal performance in the 2000 item case, so let's use that point to explore how sorting affects performance.\n",
251+
"Our best guess is that **pgstac** will perform better, since it's a database!\n",
252+
"Let's see."
253+
]
254+
},
255+
{
256+
"cell_type": "code",
257+
"execution_count": 34,
258+
"metadata": {},
259+
"outputs": [
260+
{
261+
"name": "stderr",
262+
"output_type": "stream",
263+
"text": [
264+
"/Users/gadomski/Code/developmentseed/labs-375-stac-geoparquet-backend/.venv/lib/python3.12/site-packages/pystac_client/item_search.py:442: DoesNotConformTo: Server does not conform to SORT\n",
265+
" warnings.warn(DoesNotConformTo(\"SORT\"))\n"
266+
]
267+
},
268+
{
269+
"name": "stdout",
270+
"output_type": "stream",
271+
"text": [
272+
"geoparquet datetime\n",
273+
"Retrieved 2000 in 3.03s (660.35 items/s)\n",
274+
"pgstac datetime\n",
275+
"Retrieved 2000 in 2.98s (672.20 items/s)\n",
276+
"\n",
277+
"geoparquet -datetime\n",
278+
"Retrieved 2000 in 2.78s (718.80 items/s)\n",
279+
"pgstac -datetime\n",
280+
"Retrieved 2000 in 2.90s (688.56 items/s)\n",
281+
"\n",
282+
"geoparquet naip:year\n",
283+
"Retrieved 2000 in 2.80s (714.57 items/s)\n",
284+
"pgstac naip:year\n",
285+
"Retrieved 2000 in 3.08s (650.32 items/s)\n",
286+
"\n"
287+
]
288+
}
289+
],
290+
"source": [
291+
"geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n",
292+
"pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
293+
"\n",
294+
"for sortby in [\"datetime\", \"-datetime\", \"naip:year\"]:\n",
295+
" with Timer() as timer:\n",
296+
" items = list(\n",
297+
" geoparquet_client.search(\n",
298+
" collections=[\"naip\"], sortby=sortby, max_items=2000, limit=2000\n",
299+
" ).items_as_dicts()\n",
300+
" )\n",
301+
" print(\"geoparquet\", sortby)\n",
302+
" timer.report(items)\n",
303+
" with Timer() as timer:\n",
304+
" items = list(\n",
305+
" pgstac_client.search(\n",
306+
" collections=[\"naip\"], sortby=sortby, max_items=2000, limit=2000\n",
307+
" ).items_as_dicts()\n",
308+
" )\n",
309+
" print(\"pgstac\", sortby)\n",
310+
" timer.report(items)\n",
311+
"\n",
312+
" print()"
313+
]
314+
},
315+
{
316+
"cell_type": "markdown",
317+
"metadata": {},
318+
"source": [
319+
"## Fields\n",
320+
"\n",
321+
"One of the \"sells\" of (geo)parquet is that you don't need to fetch the entirety of the data, if you only need a few of the fields.\n",
322+
"For example, if you're only visualizing the STAC items, you might just return the `id` and the `geometry`.\n",
323+
"How do the two backends perform in this scenario?\n",
324+
"Let's also test against the direct access (without the API server)."
325+
]
326+
},
327+
{
328+
"cell_type": "code",
329+
"execution_count": null,
330+
"metadata": {},
331+
"outputs": [
332+
{
333+
"name": "stderr",
334+
"output_type": "stream",
335+
"text": [
336+
"/Users/gadomski/Code/developmentseed/labs-375-stac-geoparquet-backend/.venv/lib/python3.12/site-packages/pystac_client/item_search.py:480: DoesNotConformTo: Server does not conform to FIELDS\n",
337+
" warnings.warn(DoesNotConformTo(\"FIELDS\"))\n"
338+
]
339+
},
340+
{
341+
"name": "stdout",
342+
"output_type": "stream",
343+
"text": [
344+
"geoparquet\n",
345+
"Retrieved 2000 in 2.97s (672.97 items/s)\n",
346+
"pgstac\n",
347+
"Retrieved 2000 in 1.60s (1251.75 items/s)\n",
348+
"Retrieved 2000 in 1.12s (1778.71 items/s)\n"
349+
]
350+
}
351+
],
352+
"source": [
353+
"geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n",
354+
"pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
355+
"duckdb_client = DuckdbClient()\n",
356+
"duckdb_client.execute(\"CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN)\")\n",
357+
"\n",
358+
"with Timer() as timer:\n",
359+
" items = list(\n",
360+
" geoparquet_client.search(\n",
361+
" collections=[\"naip\"], fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n",
362+
" ).items_as_dicts()\n",
363+
" )\n",
364+
" print(\"geoparquet\")\n",
365+
" timer.report(items)\n",
366+
"\n",
367+
"with Timer() as timer:\n",
368+
" items = list(\n",
369+
" pgstac_client.search(\n",
370+
" collections=[\"naip\"], fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n",
371+
" ).items_as_dicts()\n",
372+
" )\n",
373+
" print(\"pgstac\")\n",
374+
" timer.report(items)\n",
375+
"\n",
376+
"with Timer() as timer:\n",
377+
" items = duckdb_client.search(\n",
378+
" NAIP_GEOPARQUET_URI, fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n",
379+
" )\n",
380+
" print(\"duckdb\")\n",
179381
" timer.report(items)"
180382
]
181383
}

docs/katas/labs_375.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
from types import TracebackType
77
from typing import Any
88

9-
STAC_FASTAPI_GEOPARQUET_URI = "https://4y16a90iwk.execute-api.us-west-2.amazonaws.com/"
10-
NAIP_GEOPARQUET_URI = "s3://stac-fastapi-geoparquet-devseed/naip.parquet"
9+
STAC_FASTAPI_GEOPARQUET_URI = "https://1sotk6vb0d.execute-api.us-west-2.amazonaws.com/"
10+
STAC_FASTAPI_PGSTAC_URI = "https://31ukqsqah7.execute-api.us-west-2.amazonaws.com/"
11+
NAIP_GEOPARQUET_URI = "s3://stac-fastapi-geoparquet-labs-375/naip.parquet"
1112

1213

1314
class Timer:

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@ version = "0.0.0"
44
description = "Test the performance of stac-fastapi-geoparquet"
55
readme = "README.md"
66
requires-python = ">=3.12"
7-
dependencies = ["stac-fastapi-geoparquet", "rustac"]
7+
dependencies = [
8+
"stac-fastapi-geoparquet",
9+
"rustac",
10+
"pypgstac>=0.9.6",
11+
"psycopg[pool]>=3.2.6",
12+
]
813

914
[project.optional-dependencies]
1015
lambda = ["mangum==0.19.0"]

0 commit comments

Comments
 (0)