Skip to content

Commit 2581b47

Browse files
jasonpriem and claude
committed
#49.5 separate-vector-index: update ES targets to works-v33
sync_works.ipynb: ELASTIC_INDEX works-v32 → works-v33; sync_vector_index.ipynb: comment references works-v32 → works-v33. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8814f8c commit 2581b47

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

notebooks/elastic/sync_vector_index.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
"Sync Vector Index (works-vectors-v1)\n",
2121
"\n",
2222
"Bulk loads embeddings + 14 flat filter fields into a dedicated lightweight\n",
23-
"ES index optimized for kNN vector search (12 shards vs 72 on works-v32).\n",
23+
"ES index optimized for kNN vector search (12 shards vs 72 on works-v33).\n",
2424
"\n",
25-
"Two-phase semantic search: kNN here returns IDs → mget full docs from works-v32.\n",
25+
"Two-phase semantic search: kNN here returns IDs → mget full docs from works-v33.\n",
2626
"\n",
2727
"Run modes:\n",
2828
"- is_full_sync=true: Load all ~413M embeddings (initial load, ~6-8 hours)\n",

notebooks/elastic/sync_works.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
}
4040
},
4141
"outputs": [],
42-
"source": "import uuid\nfrom datetime import datetime\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.types import *\nfrom dataclasses import dataclass\n\nfrom elasticsearch import Elasticsearch, helpers\nimport logging\nimport json\n\nlogging.basicConfig(level=logging.WARNING, format='[%(asctime)s]: %(message)s')\nlog = logging.getLogger(__name__)\n\nELASTIC_INDEX = \"works-v32\"\nELASTIC_URL = dbutils.secrets.get(scope=\"elastic\", key=\"elastic_url\")\nMAX_LENGTH = 32000 # Slightly below the 32766 limit\n\nIS_FULL_SYNC = dbutils.widgets.get(\"is_full_sync\").lower() == \"true\" # default is incremental\n\nprint(f\"IS_FULL_SYNC: {IS_FULL_SYNC}\")"
42+
"source": "import uuid\nfrom datetime import datetime\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.types import *\nfrom dataclasses import dataclass\n\nfrom elasticsearch import Elasticsearch, helpers\nimport logging\nimport json\n\nlogging.basicConfig(level=logging.WARNING, format='[%(asctime)s]: %(message)s')\nlog = logging.getLogger(__name__)\n\nELASTIC_INDEX = \"works-v33\"\nELASTIC_URL = dbutils.secrets.get(scope=\"elastic\", key=\"elastic_url\")\nMAX_LENGTH = 32000 # Slightly below the 32766 limit\n\nIS_FULL_SYNC = dbutils.widgets.get(\"is_full_sync\").lower() == \"true\" # default is incremental\n\nprint(f\"IS_FULL_SYNC: {IS_FULL_SYNC}\")"
4343
},
4444
{
4545
"cell_type": "code",

0 commit comments

Comments (0)