feat: some updates to the full scan (#79)

gadomski · web-flow · commit 569065f3c37e · 2025-04-25T16:01:27.000Z
diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ cdk diff # to show any differences
 
 ### pgstac
 
-The **pgstac** database's connection parameters live in the `pgstac-db > db > Secret` resource in the `stac-fastapi-geoparquet-labs-375-pgstac` CloudFormation stack.
+The **pgstac** database's connection parameters live in the `pgstac-db > db > Secret` resource in the `stac-fastapi-geoparquet-labs-375-infra` CloudFormation stack.
 
 ## Releasing and deploying
 
diff --git a/docs/katas/0_full_scan.ipynb b/docs/katas/0_full_scan.ipynb
@@ -13,12 +13,12 @@
     "## Baby steps\n",
     "\n",
     "First, though, we want to explore the performance characteristics of our API over page size.\n",
-    "Let's start with the default page size (10)."
+    "Let's start with the default page size."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,14 +29,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Retrieved 100 in 32.19s (3.11 items/s)\n"
+      "Retrieved 100 in 17.31s (5.78 items/s)\n"
      ]
     }
    ],
@@ -53,19 +53,19 @@
    "metadata": {},
    "source": [
     "That's not excellent.\n",
-    "Let's try bumping it all the way up."
+    "Let's try bumping it up."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Retrieved 100 in 3.25s (30.73 items/s)\n"
+      "Retrieved 100 in 1.69s (59.16 items/s)\n"
      ]
     }
    ],
@@ -89,14 +89,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Retrieved 2000 in 4.63s (432.17 items/s)\n"
+      "Retrieved 2000 in 3.04s (656.88 items/s)\n",
+      "Retrieved 5000 in 4.96s (1008.75 items/s)\n"
      ]
     }
    ],
@@ -106,6 +107,12 @@
     "    items = list(\n",
     "        client.search(collections=[\"naip\"], max_items=2000, limit=2000).items_as_dicts()\n",
     "    )\n",
+    "    timer.report(items)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(\n",
+    "        client.search(collections=[\"naip\"], max_items=5000, limit=5000).items_as_dicts()\n",
+    "    )\n",
     "    timer.report(items)"
    ]
   },
@@ -120,14 +127,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Retrieved 10000 in 10.64s (939.65 items/s)\n"
+      "Retrieved 10000 in 8.23s (1215.79 items/s)\n"
      ]
     }
    ],
@@ -147,7 +154,7 @@
    "metadata": {},
    "source": [
     "One neat feature of **stac-geoparquet** is that we can query it directly using **DuckDB** from our client.\n",
-    "[stacrs](https://stac-utils.github.io/stacrs/) is a relatively new Python library that can do that.\n",
+    "[stacrs](https://stac-utils.github.io/stacrs/) can do that.\n",
     "What happens when we hit our **stac-geoparquet** in an s3 bucket directly?\n",
     "\n",
     "!!! note \"You need to configure your AWS account, either w/ access to the bucket via the eoAPI sub-account, or with requestor pays\""
@@ -162,7 +169,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Retrieved 10000 in 1.60s (6239.00 items/s)\n"
+      "Retrieved 10000 in 1.40s (7137.57 items/s)\n"
      ]
     }
    ],
@@ -172,10 +179,205 @@
     "from labs_375 import NAIP_GEOPARQUET_URI\n",
     "\n",
     "client = DuckdbClient()\n",
+    "client.execute(\"CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN)\")\n",
     "with Timer() as timer:\n",
     "    items = client.search(\n",
     "        NAIP_GEOPARQUET_URI,\n",
-    "    )[\"features\"]\n",
+    "    )\n",
+    "    timer.report(items)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Comparison with pgstac\n",
+    "\n",
+    "We've got the same items loaded into a [pgstac](https://github.com/stac-utils/pgstac) database, with a [stac-fastapi-pgstac](https://github.com/stac-utils/stac-fastapi-pgstac) serving them over HTTP.\n",
+    "Let's try the same tests against that server, except for the full scan case — that one times out."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Retrieved 100 in 1.01s (99.10 items/s)\n",
+      "Retrieved 100 in 0.21s (484.68 items/s)\n",
+      "Retrieved 2000 in 2.72s (734.03 items/s)\n",
+      "Retrieved 5000 in 6.96s (718.13 items/s)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from labs_375 import STAC_FASTAPI_PGSTAC_URI\n",
+    "\n",
+    "client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(client.search(collections=[\"naip\"], max_items=100).items_as_dicts())\n",
+    "    timer.report(items)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(\n",
+    "        client.search(collections=[\"naip\"], max_items=100, limit=100).items_as_dicts()\n",
+    "    )\n",
+    "    timer.report(items)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(\n",
+    "        client.search(collections=[\"naip\"], max_items=2000, limit=2000).items_as_dicts()\n",
+    "    )\n",
+    "    timer.report(items)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(\n",
+    "        client.search(collections=[\"naip\"], max_items=5000, limit=5000).items_as_dicts()\n",
+    "    )\n",
+    "    timer.report(items)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Sorting\n",
+    "\n",
+    "It looks like there's about equal performance in the 2000 item case, so let's use that point to explore how sorting affects performance.\n",
+    "Our best guess is that **pgstac** will perform better, since it's a database!\n",
+    "Let's see."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/gadomski/Code/developmentseed/labs-375-stac-geoparquet-backend/.venv/lib/python3.12/site-packages/pystac_client/item_search.py:442: DoesNotConformTo: Server does not conform to SORT\n",
+      "  warnings.warn(DoesNotConformTo(\"SORT\"))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "geoparquet datetime\n",
+      "Retrieved 2000 in 3.03s (660.35 items/s)\n",
+      "pgstac datetime\n",
+      "Retrieved 2000 in 2.98s (672.20 items/s)\n",
+      "\n",
+      "geoparquet -datetime\n",
+      "Retrieved 2000 in 2.78s (718.80 items/s)\n",
+      "pgstac -datetime\n",
+      "Retrieved 2000 in 2.90s (688.56 items/s)\n",
+      "\n",
+      "geoparquet naip:year\n",
+      "Retrieved 2000 in 2.80s (714.57 items/s)\n",
+      "pgstac naip:year\n",
+      "Retrieved 2000 in 3.08s (650.32 items/s)\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n",
+    "pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
+    "\n",
+    "for sortby in [\"datetime\", \"-datetime\", \"naip:year\"]:\n",
+    "    with Timer() as timer:\n",
+    "        items = list(\n",
+    "            geoparquet_client.search(\n",
+    "                collections=[\"naip\"], sortby=sortby, max_items=2000, limit=2000\n",
+    "            ).items_as_dicts()\n",
+    "        )\n",
+    "        print(\"geoparquet\", sortby)\n",
+    "        timer.report(items)\n",
+    "    with Timer() as timer:\n",
+    "        items = list(\n",
+    "            pgstac_client.search(\n",
+    "                collections=[\"naip\"], sortby=sortby, max_items=2000, limit=2000\n",
+    "            ).items_as_dicts()\n",
+    "        )\n",
+    "        print(\"pgstac\", sortby)\n",
+    "        timer.report(items)\n",
+    "\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fields\n",
+    "\n",
+    "One of the selling points of (geo)parquet is that you don't need to fetch the entirety of the data if you only need a few of the fields.\n",
+    "For example, if you're only visualizing the STAC items, you might just return the `id` and the `geometry`.\n",
+    "How do the two backends perform in this scenario?\n",
+    "Let's also test against the direct access (without the API server)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/gadomski/Code/developmentseed/labs-375-stac-geoparquet-backend/.venv/lib/python3.12/site-packages/pystac_client/item_search.py:480: DoesNotConformTo: Server does not conform to FIELDS\n",
+      "  warnings.warn(DoesNotConformTo(\"FIELDS\"))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "geoparquet\n",
+      "Retrieved 2000 in 2.97s (672.97 items/s)\n",
+      "pgstac\n",
+      "Retrieved 2000 in 1.60s (1251.75 items/s)\n",
+      "Retrieved 2000 in 1.12s (1778.71 items/s)\n"
+     ]
+    }
+   ],
+   "source": [
+    "geoparquet_client = Client.open(STAC_FASTAPI_GEOPARQUET_URI)\n",
+    "pgstac_client = Client.open(STAC_FASTAPI_PGSTAC_URI)\n",
+    "duckdb_client = DuckdbClient()\n",
+    "duckdb_client.execute(\"CREATE SECRET (TYPE S3, PROVIDER CREDENTIAL_CHAIN)\")\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(\n",
+    "        geoparquet_client.search(\n",
+    "            collections=[\"naip\"], fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n",
+    "        ).items_as_dicts()\n",
+    "    )\n",
+    "    print(\"geoparquet\")\n",
+    "    timer.report(items)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = list(\n",
+    "        pgstac_client.search(\n",
+    "            collections=[\"naip\"], fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n",
+    "        ).items_as_dicts()\n",
+    "    )\n",
+    "    print(\"pgstac\")\n",
+    "    timer.report(items)\n",
+    "\n",
+    "with Timer() as timer:\n",
+    "    items = duckdb_client.search(\n",
+    "        NAIP_GEOPARQUET_URI, fields=[\"id\", \"geometry\"], max_items=2000, limit=2000\n",
+    "    )\n",
+    "    print(\"duckdb\")\n",
     "    timer.report(items)"
    ]
   }
diff --git a/docs/katas/labs_375.py b/docs/katas/labs_375.py
@@ -6,8 +6,9 @@
 from types import TracebackType
 from typing import Any
 
-STAC_FASTAPI_GEOPARQUET_URI = "https://4y16a90iwk.execute-api.us-west-2.amazonaws.com/"
-NAIP_GEOPARQUET_URI = "s3://stac-fastapi-geoparquet-devseed/naip.parquet"
+STAC_FASTAPI_GEOPARQUET_URI = "https://1sotk6vb0d.execute-api.us-west-2.amazonaws.com/"
+STAC_FASTAPI_PGSTAC_URI = "https://31ukqsqah7.execute-api.us-west-2.amazonaws.com/"
+NAIP_GEOPARQUET_URI = "s3://stac-fastapi-geoparquet-labs-375/naip.parquet"
 
 
 class Timer:
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,12 @@ version = "0.0.0"
 description = "Test the performance of stac-fastapi-geoparquet"
 readme = "README.md"
 requires-python = ">=3.12"
-dependencies = ["stac-fastapi-geoparquet", "rustac"]
+dependencies = [
+    "stac-fastapi-geoparquet",
+    "rustac",
+    "pypgstac>=0.9.6",
+    "psycopg[pool]>=3.2.6",
+]
 
 [project.optional-dependencies]
 lambda = ["mangum==0.19.0"]
diff --git a/scripts/pgstac-ingest b/scripts/pgstac-ingest
diff --git a/uv.lock b/uv.lock