Skip to content

Commit e40ee3d

Browse files
caseydm authored and claude committed
Reduce ES sync workers to 1 and add FWCI batch parameters
Halves concurrent ES pressure by dropping from 2 to 1 worker nodes. Adds fwci_batch/fwci_total_batches params to process FWCI sync in ~50M-record chunks via id modulo, allowing incremental runs with progress preserved by ES upsert.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 65e16f3 commit e40ee3d

File tree

2 files changed

+7
-3
lines changed

2 files changed

+7
-3
lines changed

jobs/sync_all_works_to_elasticsearch.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ resources:
2929
enable_elastic_disk: true
3030
data_security_mode: SINGLE_USER
3131
runtime_engine: STANDARD
32-
num_workers: 2
32+
num_workers: 1
3333
- job_cluster_key: es_sync_daily
3434
new_cluster:
3535
cluster_name: ""
@@ -54,3 +54,7 @@ resources:
5454
default: "true"
5555
- name: changed_fwci_sync
5656
default: "false"
57+
- name: fwci_batch
58+
default: "0"
59+
- name: fwci_total_batches
60+
default: "6"

notebooks/elastic/sync_works.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
}
4040
},
4141
"outputs": [],
42-
"source": "import uuid\nfrom datetime import datetime\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.types import *\nfrom dataclasses import dataclass\n\nfrom elasticsearch import Elasticsearch, helpers\nimport logging\nimport json\n\nlogging.basicConfig(level=logging.WARNING, format='[%(asctime)s]: %(message)s')\nlog = logging.getLogger(__name__)\n\nELASTIC_INDEX = \"works-v33\"\nELASTIC_URL = dbutils.secrets.get(scope=\"elastic\", key=\"elastic_url\")\nMAX_LENGTH = 32000 # Slightly below the 32766 limit\n\ndbutils.widgets.text(\"is_full_sync\", \"false\")\ndbutils.widgets.text(\"changed_fwci_sync\", \"false\")\n\nIS_FULL_SYNC = dbutils.widgets.get(\"is_full_sync\").lower() == \"true\" # default is incremental\nIS_FWCI_SYNC = dbutils.widgets.get(\"changed_fwci_sync\").lower() == \"true\"\n\nprint(f\"IS_FULL_SYNC: {IS_FULL_SYNC}\")\nif IS_FWCI_SYNC:\n print(\"FWCI sync mode: syncing all FWCI-eligible works\")"
42+
"source": "import uuid\nfrom datetime import datetime\nfrom pyspark.sql import functions as F\nfrom pyspark.sql.types import *\nfrom dataclasses import dataclass\n\nfrom elasticsearch import Elasticsearch, helpers\nimport logging\nimport json\n\nlogging.basicConfig(level=logging.WARNING, format='[%(asctime)s]: %(message)s')\nlog = logging.getLogger(__name__)\n\nELASTIC_INDEX = \"works-v33\"\nELASTIC_URL = dbutils.secrets.get(scope=\"elastic\", key=\"elastic_url\")\nMAX_LENGTH = 32000 # Slightly below the 32766 limit\n\ndbutils.widgets.text(\"is_full_sync\", \"false\")\ndbutils.widgets.text(\"changed_fwci_sync\", \"false\")\ndbutils.widgets.text(\"fwci_batch\", \"0\")\ndbutils.widgets.text(\"fwci_total_batches\", \"6\")\n\nIS_FULL_SYNC = dbutils.widgets.get(\"is_full_sync\").lower() == \"true\" # default is incremental\nIS_FWCI_SYNC = dbutils.widgets.get(\"changed_fwci_sync\").lower() == \"true\"\nFWCI_BATCH = int(dbutils.widgets.get(\"fwci_batch\")) # 0 = all, 1-N = specific batch\nFWCI_TOTAL_BATCHES = int(dbutils.widgets.get(\"fwci_total_batches\"))\n\nprint(f\"IS_FULL_SYNC: {IS_FULL_SYNC}\")\nif IS_FWCI_SYNC:\n print(\"FWCI sync mode: syncing all FWCI-eligible works\")\n if FWCI_BATCH > 0:\n print(f\" Batch {FWCI_BATCH} of {FWCI_TOTAL_BATCHES} (id % {FWCI_TOTAL_BATCHES} = {FWCI_BATCH - 1})\")\n else:\n print(\" Processing ALL batches (no batching)\")"
4343
},
4444
{
4545
"cell_type": "code",
@@ -81,7 +81,7 @@
8181
}
8282
},
8383
"outputs": [],
84-
"source": "if IS_FWCI_SYNC:\n # Sync all FWCI-eligible works (used after inline FWCI calculation change)\n FWCI_WHERE = \"\"\"primary_topic.subfield.id IS NOT NULL\n AND (type IN ('article', 'book', 'review', 'book-chapter')\n OR (type = 'article' AND primary_location.source.type = 'conference'))\"\"\"\n SQL_QUERY = f\"SELECT * FROM openalex.works.openalex_works WHERE {FWCI_WHERE}\"\n COUNT_QUERY = f\"SELECT COUNT(*) as cnt FROM openalex.works.openalex_works WHERE {FWCI_WHERE}\"\nelif IS_FULL_SYNC:\n SQL_QUERY = \"\"\"SELECT * FROM openalex.works.openalex_works\"\"\"\n COUNT_QUERY = None # Full sync doesn't need count-based optimization\nelse:\n SQL_QUERY = \"\"\"SELECT * FROM openalex.works.openalex_works\nWHERE updated_date >= current_date() - INTERVAL 2 days\n\"\"\"\n COUNT_QUERY = \"\"\"SELECT COUNT(*) as cnt FROM openalex.works.openalex_works\nWHERE updated_date >= current_date() - INTERVAL 2 days\n\"\"\"\n\n# Get record count BEFORE loading data (lightweight SQL count, no transformations)\nrecord_count = None\nif not IS_FULL_SYNC and COUNT_QUERY:\n record_count = spark.sql(COUNT_QUERY).collect()[0].cnt\n print(f\"Record count for sync: {record_count:,}\")\n\ndf = (\n spark.sql(SQL_QUERY)\n .withColumn(\"display_name\", F.col(\"title\"))\n # First cast to date/timestamp\n .withColumn(\"created_date\", F.to_timestamp(\"created_date\"))\n .withColumn(\"updated_date\", F.to_timestamp(\"updated_date\"))\n .withColumn(\"publication_date\", F.to_date(\"publication_date\"))\n .withColumn(\n \"concepts\",\n F.transform(\n F.col(\"concepts\"),\n lambda c: F.struct(\n F.concat(F.lit(\"https://openalex.org/C\"), c.id).alias(\"id\"),\n c.wikidata.alias(\"wikidata\"),\n c.display_name.alias(\"display_name\"),\n c.level.alias(\"level\"),\n c.score.alias(\"score\")\n )\n )\n )\n # Apply range checks using BETWEEN\n .withColumn(\n \"created_date\",\n F.when(\n F.col(\"created_date\").between(F.lit(\"1000-01-01\"), F.lit(\"9999-12-31\")),\n F.col(\"created_date\")\n 
).otherwise(F.lit(None).cast(\"timestamp\"))\n )\n .withColumn(\n \"updated_date\",\n F.when(\n F.col(\"updated_date\").between(F.lit(\"1000-01-01\"), F.lit(\"9999-12-31\")),\n F.col(\"updated_date\")\n ).otherwise(F.lit(None).cast(\"timestamp\"))\n )\n .withColumn(\n \"publication_date\",\n F.when(\n F.col(\"publication_date\").between(F.lit(\"1000-01-01\"), F.lit(\"2050-12-31\")),\n F.col(\"publication_date\")\n ).otherwise(F.lit(None).cast(\"date\"))\n )\n .filter(F.col(\"id\").isNotNull())\n)\n\n# Dynamic partitioning based on record volume\n# Only apply partition optimization for non-full syncs\nif not IS_FULL_SYNC and record_count is not None:\n # Calculate optimal partition count:\n # - Small updates (<500k): use fewer partitions for efficiency\n # - Medium updates (500k-5M): moderate partitions \n # - Large updates (5M-20M): many partitions like full sync\n # - Very large updates (>20M): use repartitionByRange for even distribution\n RECORDS_PER_PARTITION = 10000 # Target ~10k records per partition\n \n if record_count < 2_000_000:\n # Small daily update - coalesce to reduce overhead\n optimal_partitions = max(64, record_count // RECORDS_PER_PARTITION)\n df = df.coalesce(optimal_partitions)\n print(f\"Small update: coalesced to {optimal_partitions} partitions\")\n elif record_count < 10_000_000:\n # Medium daily update - use more partitions\n optimal_partitions = max(1024, record_count // RECORDS_PER_PARTITION)\n df = df.repartition(optimal_partitions)\n print(f\"Medium update: repartitioned to {optimal_partitions} partitions\")\n elif record_count < 20_000_000:\n # Large daily update - repartition for better distribution\n optimal_partitions = min(4096, record_count // RECORDS_PER_PARTITION)\n df = df.repartition(optimal_partitions)\n print(f\"Large update: repartitioned to {optimal_partitions} partitions\")\n else:\n # Very large update - use repartitionByRange like full sync\n df = df.repartitionByRange(8096, \"id\")\n print(f\"Very large update: using 
repartitionByRange with 8096 partitions\")\n\nprint(f\"SQL query:\\n{SQL_QUERY}\")"
84+
"source": "if IS_FWCI_SYNC:\n # Sync all FWCI-eligible works (used after inline FWCI calculation change)\n FWCI_WHERE = \"\"\"primary_topic.subfield.id IS NOT NULL\n AND (type IN ('article', 'book', 'review', 'book-chapter')\n OR (type = 'article' AND primary_location.source.type = 'conference'))\"\"\"\n if FWCI_BATCH > 0:\n FWCI_WHERE += f\"\\n AND id % {FWCI_TOTAL_BATCHES} = {FWCI_BATCH - 1}\"\n SQL_QUERY = f\"SELECT * FROM openalex.works.openalex_works WHERE {FWCI_WHERE}\"\n COUNT_QUERY = f\"SELECT COUNT(*) as cnt FROM openalex.works.openalex_works WHERE {FWCI_WHERE}\"\nelif IS_FULL_SYNC:\n SQL_QUERY = \"\"\"SELECT * FROM openalex.works.openalex_works\"\"\"\n COUNT_QUERY = None # Full sync doesn't need count-based optimization\nelse:\n SQL_QUERY = \"\"\"SELECT * FROM openalex.works.openalex_works\nWHERE updated_date >= current_date() - INTERVAL 2 days\n\"\"\"\n COUNT_QUERY = \"\"\"SELECT COUNT(*) as cnt FROM openalex.works.openalex_works\nWHERE updated_date >= current_date() - INTERVAL 2 days\n\"\"\"\n\n# Get record count BEFORE loading data (lightweight SQL count, no transformations)\nrecord_count = None\nif not IS_FULL_SYNC and COUNT_QUERY:\n record_count = spark.sql(COUNT_QUERY).collect()[0].cnt\n print(f\"Record count for sync: {record_count:,}\")\n\ndf = (\n spark.sql(SQL_QUERY)\n .withColumn(\"display_name\", F.col(\"title\"))\n # First cast to date/timestamp\n .withColumn(\"created_date\", F.to_timestamp(\"created_date\"))\n .withColumn(\"updated_date\", F.to_timestamp(\"updated_date\"))\n .withColumn(\"publication_date\", F.to_date(\"publication_date\"))\n .withColumn(\n \"concepts\",\n F.transform(\n F.col(\"concepts\"),\n lambda c: F.struct(\n F.concat(F.lit(\"https://openalex.org/C\"), c.id).alias(\"id\"),\n c.wikidata.alias(\"wikidata\"),\n c.display_name.alias(\"display_name\"),\n c.level.alias(\"level\"),\n c.score.alias(\"score\")\n )\n )\n )\n # Apply range checks using BETWEEN\n .withColumn(\n \"created_date\",\n F.when(\n 
F.col(\"created_date\").between(F.lit(\"1000-01-01\"), F.lit(\"9999-12-31\")),\n F.col(\"created_date\")\n ).otherwise(F.lit(None).cast(\"timestamp\"))\n )\n .withColumn(\n \"updated_date\",\n F.when(\n F.col(\"updated_date\").between(F.lit(\"1000-01-01\"), F.lit(\"9999-12-31\")),\n F.col(\"updated_date\")\n ).otherwise(F.lit(None).cast(\"timestamp\"))\n )\n .withColumn(\n \"publication_date\",\n F.when(\n F.col(\"publication_date\").between(F.lit(\"1000-01-01\"), F.lit(\"2050-12-31\")),\n F.col(\"publication_date\")\n ).otherwise(F.lit(None).cast(\"date\"))\n )\n .filter(F.col(\"id\").isNotNull())\n)\n\n# Dynamic partitioning based on record volume\n# Only apply partition optimization for non-full syncs\nif not IS_FULL_SYNC and record_count is not None:\n # Calculate optimal partition count:\n # - Small updates (<500k): use fewer partitions for efficiency\n # - Medium updates (500k-5M): moderate partitions \n # - Large updates (5M-20M): many partitions like full sync\n # - Very large updates (>20M): use repartitionByRange for even distribution\n RECORDS_PER_PARTITION = 10000 # Target ~10k records per partition\n \n if record_count < 2_000_000:\n # Small daily update - coalesce to reduce overhead\n optimal_partitions = max(64, record_count // RECORDS_PER_PARTITION)\n df = df.coalesce(optimal_partitions)\n print(f\"Small update: coalesced to {optimal_partitions} partitions\")\n elif record_count < 10_000_000:\n # Medium daily update - use more partitions\n optimal_partitions = max(1024, record_count // RECORDS_PER_PARTITION)\n df = df.repartition(optimal_partitions)\n print(f\"Medium update: repartitioned to {optimal_partitions} partitions\")\n elif record_count < 20_000_000:\n # Large daily update - repartition for better distribution\n optimal_partitions = min(4096, record_count // RECORDS_PER_PARTITION)\n df = df.repartition(optimal_partitions)\n print(f\"Large update: repartitioned to {optimal_partitions} partitions\")\n else:\n # Very large update - use 
repartitionByRange like full sync\n df = df.repartitionByRange(8096, \"id\")\n print(f\"Very large update: using repartitionByRange with 8096 partitions\")\n\nprint(f\"SQL query:\\n{SQL_QUERY}\")"
8585
},
8686
{
8787
"cell_type": "code",

0 commit comments

Comments (0)