Skip to content
This repository was archived by the owner on Jul 21, 2025. It is now read-only.

Commit 9602eb6

Browse files
committed
--wip-- [skip ci]
1 parent 58d8bb2 commit 9602eb6

File tree

3 files changed

+227
-0
lines changed

3 files changed

+227
-0
lines changed

docs/katas/1_needle_heystack.ipynb

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "29d14024",
6+
"metadata": {},
7+
"source": [
8+
"# Kata 1: needle in a haystack\n",
9+
"\n",
10+
"Another common use (some would say the primary use) of STAC is to search for things.\n",
11+
"Let's compare the performance of a few different searches against two backends: stac-fastapi-geoparquet and stac-fastapi-pgstac."
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 5,
17+
"id": "a540a58e",
18+
"metadata": {},
19+
"outputs": [
20+
{
21+
"name": "stdout",
22+
"output_type": "stream",
23+
"text": [
24+
"Retrieved 1 in 0.19s (5.32 items/s)\n",
25+
"Retrieved 1 in 0.09s (10.80 items/s)\n"
26+
]
27+
}
28+
],
29+
"source": [
30+
"from pystac_client import Client\n",
31+
"\n",
32+
"from labs_375 import STAC_FASTAPI_GEOPARQUET_URI, STAC_FASTAPI_PGSTAC_URI, Timer\n",
33+
"\n",
34+
"LONGMONT = {\"type\": \"Point\", \"coordinates\": [-105.1019, 40.1672]}\n",
35+
"\n",
36+
"geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n",
37+
"pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
38+
"\n",
39+
"with Timer() as timer:\n",
40+
" items = list(\n",
41+
" geoparquet_client.search(\n",
42+
" collections=[\"naip\"], intersects=LONGMONT\n",
43+
" ).items_as_dicts()\n",
44+
" )\n",
45+
" timer.report(items)\n",
46+
"\n",
47+
"with Timer() as timer:\n",
48+
" items = list(\n",
49+
" pgstac_client.search(collections=[\"naip\"], intersects=LONGMONT).items_as_dicts()\n",
50+
" )\n",
51+
" timer.report(items)\n"
52+
]
53+
},
54+
{
55+
"cell_type": "markdown",
56+
"id": "b9a4f67c",
57+
"metadata": {},
58+
"source": [
59+
"Intersects is pretty slow; let's see how we do on `id`.\n",
60+
"We'll first do a full scan to get the first and last IDs."
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 6,
66+
"id": "9cbddddc",
67+
"metadata": {},
68+
"outputs": [
69+
{
70+
"name": "stdout",
71+
"output_type": "stream",
72+
"text": [
73+
"Retrieved 1 in 0.25s (4.06 items/s)\n",
74+
"Retrieved 1 in 0.26s (3.90 items/s)\n",
75+
"Retrieved 1 in 0.11s (8.99 items/s)\n",
76+
"Retrieved 1 in 0.07s (13.42 items/s)\n"
77+
]
78+
}
79+
],
80+
"source": [
81+
"items = list(\n",
82+
" geoparquet_client.search(\n",
83+
" collections=[\"naip\"], max_items=10000, limit=10000\n",
84+
" ).items_as_dicts()\n",
85+
")\n",
86+
"first_id = items[0][\"id\"]\n",
87+
"last_id = items[-1][\"id\"]\n",
88+
"\n",
89+
"with Timer() as timer:\n",
90+
" items = list(\n",
91+
" geoparquet_client.search(collections=[\"naip\"], ids=[first_id]).items_as_dicts()\n",
92+
" )\n",
93+
" timer.report(items)\n",
94+
"\n",
95+
"with Timer() as timer:\n",
96+
" items = list(\n",
97+
" geoparquet_client.search(collections=[\"naip\"], ids=[last_id]).items_as_dicts()\n",
98+
" )\n",
99+
" timer.report(items)\n",
100+
"\n",
101+
"with Timer() as timer:\n",
102+
" items = list(\n",
103+
" pgstac_client.search(collections=[\"naip\"], ids=[first_id]).items_as_dicts()\n",
104+
" )\n",
105+
" timer.report(items)\n",
106+
"\n",
107+
"with Timer() as timer:\n",
108+
" items = list(\n",
109+
" pgstac_client.search(collections=[\"naip\"], ids=[last_id]).items_as_dicts()\n",
110+
" )\n",
111+
" timer.report(items)"
112+
]
113+
},
114+
{
115+
"cell_type": "markdown",
116+
"id": "425c9650",
117+
"metadata": {},
118+
"source": [
119+
"OK, geoparquet is not as good as **pgstac** on `id` lookups, but not terrible.\n",
120+
"Let's try a CQL2 filter."
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": 8,
126+
"id": "c8d74047",
127+
"metadata": {},
128+
"outputs": [
129+
{
130+
"name": "stdout",
131+
"output_type": "stream",
132+
"text": [
133+
"Retrieved 279 in 5.12s (54.44 items/s)\n"
134+
]
135+
}
136+
],
137+
"source": [
138+
"from cql2 import Expr\n",
139+
"\n",
140+
"# with Timer() as timer:\n",
141+
"# items = list(\n",
142+
"# geoparquet_client.search(\n",
143+
"# collections=[\"naip\"], filter=\"naip:year = '2022'\"\n",
144+
"# ).items_as_dicts()\n",
145+
"# )\n",
146+
"# timer.report(items)\n",
147+
"\n",
148+
"expr = Expr(\"naip:year = '2022'\").to_json()\n",
149+
"with Timer() as timer:\n",
150+
" items = list(\n",
151+
" pgstac_client.search(\n",
152+
" collections=[\"naip\"], filter=expr\n",
153+
" ).items_as_dicts()\n",
154+
" )\n",
155+
" timer.report(items)\n"
156+
]
157+
}
158+
],
159+
"metadata": {
160+
"kernelspec": {
161+
"display_name": ".venv",
162+
"language": "python",
163+
"name": "python3"
164+
},
165+
"language_info": {
166+
"codemirror_mode": {
167+
"name": "ipython",
168+
"version": 3
169+
},
170+
"file_extension": ".py",
171+
"mimetype": "text/x-python",
172+
"name": "python",
173+
"nbconvert_exporter": "python",
174+
"pygments_lexer": "ipython3",
175+
"version": "3.12.6"
176+
}
177+
},
178+
"nbformat": 4,
179+
"nbformat_minor": 5
180+
}

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ dev = [
3232
katas = [
3333
"adlfs>=2024.12.0",
3434
"dask[dataframe]>=2025.4.1",
35+
"cql2>=0.3.6",
3536
"duckdb>=1.2.2",
3637
"fsspec>=2025.3.2",
3738
"geopandas>=1.0.1",

uv.lock

Lines changed: 46 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)