
Commit a0bfda2

caseydm and claude committed
Add taxicab rescrape queue for generic URL rescraping
Adds a rescrape_queue table and a rescrape_queue_only parameter to the taxicab notebook. When enabled, the notebook reads native_id/namespace pairs from the queue, resolves their URLs (DOIs are constructed directly; everything else is looked up from taxicab_results), scrapes them, and truncates the queue. A new TaxiCab_Rescrape job (manual trigger only) runs in this mode.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 70533a6 commit a0bfda2
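
For context, a minimal sketch of how the queue might be populated ahead of a manual run, assuming a Databricks notebook where `spark` is in scope. The table and column names come from this commit; the two IDs below are made-up illustrations, and created_date is left to the column default declared in the diff.

# Hypothetical enqueue step; the IDs are illustrative, not real records.
# 'doi' rows get their URL built as https://doi.org/<native_id>; other
# namespaces are resolved against openalex.taxicab.taxicab_results.
spark.sql("""
    INSERT INTO openalex.taxicab.rescrape_queue (native_id, native_id_namespace)
    VALUES
        ('10.1234/example-doi', 'doi'),
        ('oai:repo.example.org:record/42', 'pmh')
""")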

File tree

3 files changed (+63, -22 lines)


databricks.yml

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ include:
   - jobs/pubmed.yaml
   - jobs/repo.yaml
   - jobs/taxicab.yaml
+  - jobs/taxicab_rescrape.yaml
   - jobs/parse_pdfs.yaml
   - jobs/walden_end2end.yaml
   - jobs/wunpaywall_weekly_data_feed.yaml

jobs/taxicab_rescrape.yaml

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+resources:
+  jobs:
+    TaxiCab_Rescrape:
+      name: TaxiCab_Rescrape
+      email_notifications:
+        on_failure:
+          - casey@ourresearch.org
+      max_concurrent_runs: 1
+      tasks:
+        - task_key: run_taxicab_rescrape
+          notebook_task:
+            notebook_path: notebooks/scraping/taxicab
+            source: GIT
+            base_parameters:
+              lookback_days: "3"
+              url_limit: "1000000"
+              rescrape_queue_only: "true"
+          job_cluster_key: Taxicab_Rescrape_job_cluster
+      job_clusters:
+        - job_cluster_key: Taxicab_Rescrape_job_cluster
+          new_cluster:
+            cluster_name: ""
+            spark_version: 16.4.x-scala2.13
+            spark_conf:
+              spark.master: local[*, 4]
+              spark.databricks.cluster.profile: singleNode
+            aws_attributes:
+              first_on_demand: 1
+              availability: SPOT_WITH_FALLBACK
+              zone_id: us-east-1f
+              spot_bid_price_percent: 100
+            node_type_id: m5d.4xlarge
+            driver_node_type_id: m5d.4xlarge
+            custom_tags:
+              ResourceClass: SingleNode
+            enable_elastic_disk: true
+            data_security_mode: SINGLE_USER
+            runtime_engine: STANDARD
+            num_workers: 0
+      git_source:
+        git_url: https://github.com/ourresearch/openalex-walden.git
+        git_provider: gitHub
+        git_branch: main
+      parameters:
+        - name: lookback_days
+          default: "3"
+        - name: url_limit
+          default: "1000000"
+        - name: rescrape_queue_only
+          default: "true"

notebooks/scraping/taxicab.ipynb

Lines changed: 12 additions & 22 deletions
@@ -64,7 +64,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 0,
+  "execution_count": null,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
@@ -80,24 +80,14 @@
     }
    },
   "outputs": [],
-  "source": [
-   "%sql\n",
-   "CREATE TABLE IF NOT EXISTS openalex.taxicab.taxicab_results (\n",
-   " taxicab_id STRING,\n",
-   " url STRING,\n",
-   " resolved_url STRING,\n",
-   " status_code INT,\n",
-   " content_type STRING,\n",
-   " native_id STRING,\n",
-   " native_id_namespace STRING,\n",
-   " s3_path STRING,\n",
-   " is_soft_block BOOLEAN,\n",
-   " created_date TIMESTAMP,\n",
-   " processed_date TIMESTAMP,\n",
-   " error STRING\n",
-   ")\n",
-   "USING DELTA;"
-  ]
+  "source": "%sql\nCREATE TABLE IF NOT EXISTS openalex.taxicab.taxicab_results (\n taxicab_id STRING,\n url STRING,\n resolved_url STRING,\n status_code INT,\n content_type STRING,\n native_id STRING,\n native_id_namespace STRING,\n s3_path STRING,\n is_soft_block BOOLEAN,\n created_date TIMESTAMP,\n processed_date TIMESTAMP,\n error STRING\n)\nUSING DELTA;"
+ },
+ {
+  "cell_type": "code",
+  "source": "%sql\nCREATE TABLE IF NOT EXISTS openalex.taxicab.rescrape_queue (\n native_id STRING,\n native_id_namespace STRING,\n created_date TIMESTAMP DEFAULT current_timestamp()\n)\nUSING DELTA;",
+  "metadata": {},
+  "execution_count": null,
+  "outputs": []
  },
  {
   "cell_type": "code",
@@ -123,7 +113,7 @@
    }
   },
   "outputs": [],
-  "source": "dbutils.widgets.text(\"lookback_days\", \"3\", \"Lookback window (days)\")\nlookback_days = int(dbutils.widgets.get(\"lookback_days\"))\n\nlast_processed_date = datetime.datetime.now(timezone.utc) - datetime.timedelta(days=lookback_days)\nprint(f\"Looking back {lookback_days} days from: {last_processed_date}\")"
+  "source": "dbutils.widgets.text(\"lookback_days\", \"3\", \"Lookback window (days)\")\ndbutils.widgets.text(\"rescrape_queue_only\", \"false\", \"Rescrape queue only (true/false)\")\n\nrescrape_queue_only = dbutils.widgets.get(\"rescrape_queue_only\").strip().lower() == \"true\"\nlookback_days = int(dbutils.widgets.get(\"lookback_days\"))\n\nif rescrape_queue_only:\n print(\"RESCRAPE MODE: processing queue table\")\nelse:\n last_processed_date = datetime.datetime.now(timezone.utc) - datetime.timedelta(days=lookback_days)\n print(f\"Looking back {lookback_days} days from: {last_processed_date}\")"
  },
  {
   "cell_type": "code",
@@ -142,7 +132,7 @@
    }
   },
   "outputs": [],
-  "source": "# urls to scrape\n\ndbutils.widgets.text(\"url_limit\", \"250000\", \"Max URLs to process\")\nurl_limit = int(dbutils.widgets.get(\"url_limit\"))\n\n# Source 1: Crossref works\nrecent_crossref_works = (\n spark.read\n .table(\"openalex.crossref.crossref_works\")\n .filter(F.col(\"created_date\") >= F.lit(last_processed_date))\n .select(\n \"native_id\",\n \"native_id_namespace\",\n F.expr(\"get(filter(urls, x -> x.url like '%doi.org%'), 0).url\").alias(\"url\"),\n F.to_timestamp(\"created_date\").alias(\"source_created_date\"),\n )\n)\n\n# Source 2: Repo works\nrecent_repo_works = (\n spark.read.table(\"openalex.repo.repo_works\")\n .filter(F.col(\"created_date\") >= F.lit(last_processed_date))\n .select(\n \"native_id\",\n \"native_id_namespace\",\n F.slice(\"urls\", 1, 3).alias(\"urls\"),\n F.to_timestamp(\"created_date\").alias(\"source_created_date\"),\n )\n .filter(F.col(\"urls\").isNotNull())\n .select(\"*\", F.explode(\"urls\").alias(\"url_struct\"))\n .select(\n \"native_id\",\n \"native_id_namespace\",\n \"source_created_date\",\n F.col(\"url_struct.url\").alias(\"url\")\n )\n .filter(~F.col(\"url\").contains(\"doi.org\"))\n)\n\n# Source 3: Landing page PDF URLs\nrecent_pdf_works = (\n spark.read\n .table(\"openalex.landing_page.landing_page_works\")\n .filter(F.col(\"created_date\") >= F.lit(last_processed_date))\n .select(\n \"ids\",\n \"native_id\",\n \"native_id_namespace\",\n F.expr(\"get(filter(urls, x -> x.content_type = 'pdf'), 0).url\").alias(\"url\"),\n F.to_timestamp(\"created_date\").alias(\"source_created_date\"),\n )\n .withColumn(\"pmh_id\", F.expr(\"get(filter(ids, x -> x.namespace = 'pmh'), 0).id\"))\n .withColumn(\"doi_id\", F.expr(\"get(filter(ids, x -> x.namespace = 'doi'), 0).id\"))\n # Set priority: PMH first, then DOI, then original\n .withColumn(\"final_native_id\", \n F.when(F.col(\"pmh_id\").isNotNull(), F.col(\"pmh_id\"))\n .when(F.col(\"doi_id\").isNotNull(), F.col(\"doi_id\"))\n .otherwise(F.col(\"native_id\")))\n .withColumn(\"final_namespace\", \n F.when(F.col(\"pmh_id\").isNotNull(), F.lit(\"pmh\"))\n .when(F.col(\"doi_id\").isNotNull(), F.lit(\"doi\"))\n .otherwise(F.col(\"native_id_namespace\")))\n .select(\n F.col(\"final_native_id\").alias(\"native_id\"),\n F.col(\"final_namespace\").alias(\"native_id_namespace\"),\n \"url\",\n \"source_created_date\",\n )\n .filter(F.col(\"url\").isNotNull())\n)\n\n# Union all sources, clean native_id, dedup, order newest first, then drop the ordering column\ntaxicab_results = spark.table(\"openalex.taxicab.taxicab_results\").select(\"url\")\n\nall_urls = (\n recent_crossref_works\n .unionByName(recent_repo_works)\n .unionByName(recent_pdf_works)\n .withColumn(\"native_id\", F.regexp_replace(\"native_id\", \"^https://doi\\\\.org/\", \"\"))\n .join(taxicab_results, [\"url\"], \"left_anti\")\n .orderBy(F.col(\"source_created_date\").desc())\n .limit(url_limit)\n .drop(\"source_created_date\")\n)\n\nall_urls_pd = all_urls.toPandas()\n\njsonUrls = [\n {\n \"url\": row[\"url\"],\n \"native_id\": row.get(\"native_id\", \"\"),\n \"native_id_namespace\": row.get(\"native_id_namespace\", \"\")\n }\n for row in all_urls_pd.to_dict('records')\n if row[\"url\"] is not None\n]\n\ntotal_urls = len(jsonUrls)\npdf_urls = sum(1 for url in jsonUrls if '.pdf' in url['url'].lower())\ndoi_urls = sum(1 for url in jsonUrls if 'doi.org' in url['url'].lower())\nother_urls = total_urls - pdf_urls - doi_urls\n\nprint(f\"Harvesting {total_urls} URLs ({pdf_urls} PDFs, {doi_urls} DOIs, {other_urls} other URLs)\")"
+  "source": "# urls to scrape\n\ndbutils.widgets.text(\"url_limit\", \"250000\", \"Max URLs to process\")\nurl_limit = int(dbutils.widgets.get(\"url_limit\"))\n\nif rescrape_queue_only:\n queue_df = spark.read.table(\"openalex.taxicab.rescrape_queue\")\n queue_count = queue_df.count()\n if queue_count == 0:\n dbutils.notebook.exit(\"Queue empty — nothing to rescrape\")\n\n print(f\"Rescrape queue has {queue_count} records\")\n\n # DOIs: construct URL directly\n doi_urls = (\n queue_df.filter(F.col(\"native_id_namespace\") == \"doi\")\n .withColumn(\"url\", F.concat(F.lit(\"https://doi.org/\"), F.col(\"native_id\")))\n .select(\"native_id\", \"native_id_namespace\", \"url\")\n )\n\n # Non-DOIs: look up most recent URL from taxicab_results\n non_doi_urls = (\n queue_df.filter(F.col(\"native_id_namespace\") != \"doi\")\n .join(\n spark.read.table(\"openalex.taxicab.taxicab_results\")\n .select(\"native_id\", \"native_id_namespace\", \"url\")\n .dropDuplicates([\"native_id\", \"native_id_namespace\", \"url\"]),\n [\"native_id\", \"native_id_namespace\"], \"inner\"\n )\n .select(\"native_id\", \"native_id_namespace\", \"url\")\n )\n\n all_urls = doi_urls.unionByName(non_doi_urls).limit(url_limit)\n\n all_urls_pd = all_urls.toPandas()\n\n jsonUrls = [\n {\n \"url\": row[\"url\"],\n \"native_id\": row.get(\"native_id\", \"\"),\n \"native_id_namespace\": row.get(\"native_id_namespace\", \"\")\n }\n for row in all_urls_pd.to_dict('records')\n if row[\"url\"] is not None\n ]\n\nelse:\n # Source 1: Crossref works\n recent_crossref_works = (\n spark.read\n .table(\"openalex.crossref.crossref_works\")\n .filter(F.col(\"created_date\") >= F.lit(last_processed_date))\n .select(\n \"native_id\",\n \"native_id_namespace\",\n F.expr(\"get(filter(urls, x -> x.url like '%doi.org%'), 0).url\").alias(\"url\"),\n F.to_timestamp(\"created_date\").alias(\"source_created_date\"),\n )\n )\n\n # Source 2: Repo works\n recent_repo_works = (\n spark.read.table(\"openalex.repo.repo_works\")\n .filter(F.col(\"created_date\") >= F.lit(last_processed_date))\n .select(\n \"native_id\",\n \"native_id_namespace\",\n F.slice(\"urls\", 1, 3).alias(\"urls\"),\n F.to_timestamp(\"created_date\").alias(\"source_created_date\"),\n )\n .filter(F.col(\"urls\").isNotNull())\n .select(\"*\", F.explode(\"urls\").alias(\"url_struct\"))\n .select(\n \"native_id\",\n \"native_id_namespace\",\n \"source_created_date\",\n F.col(\"url_struct.url\").alias(\"url\")\n )\n .filter(~F.col(\"url\").contains(\"doi.org\"))\n )\n\n # Source 3: Landing page PDF URLs\n recent_pdf_works = (\n spark.read\n .table(\"openalex.landing_page.landing_page_works\")\n .filter(F.col(\"created_date\") >= F.lit(last_processed_date))\n .select(\n \"ids\",\n \"native_id\",\n \"native_id_namespace\",\n F.expr(\"get(filter(urls, x -> x.content_type = 'pdf'), 0).url\").alias(\"url\"),\n F.to_timestamp(\"created_date\").alias(\"source_created_date\"),\n )\n .withColumn(\"pmh_id\", F.expr(\"get(filter(ids, x -> x.namespace = 'pmh'), 0).id\"))\n .withColumn(\"doi_id\", F.expr(\"get(filter(ids, x -> x.namespace = 'doi'), 0).id\"))\n # Set priority: PMH first, then DOI, then original\n .withColumn(\"final_native_id\", \n F.when(F.col(\"pmh_id\").isNotNull(), F.col(\"pmh_id\"))\n .when(F.col(\"doi_id\").isNotNull(), F.col(\"doi_id\"))\n .otherwise(F.col(\"native_id\")))\n .withColumn(\"final_namespace\", \n F.when(F.col(\"pmh_id\").isNotNull(), F.lit(\"pmh\"))\n .when(F.col(\"doi_id\").isNotNull(), F.lit(\"doi\"))\n .otherwise(F.col(\"native_id_namespace\")))\n .select(\n F.col(\"final_native_id\").alias(\"native_id\"),\n F.col(\"final_namespace\").alias(\"native_id_namespace\"),\n \"url\",\n \"source_created_date\",\n )\n .filter(F.col(\"url\").isNotNull())\n )\n\n # Union all sources, clean native_id, dedup, order newest first, then drop the ordering column\n taxicab_results = spark.table(\"openalex.taxicab.taxicab_results\").select(\"url\")\n\n all_urls = (\n recent_crossref_works\n .unionByName(recent_repo_works)\n .unionByName(recent_pdf_works)\n .withColumn(\"native_id\", F.regexp_replace(\"native_id\", \"^https://doi\\\\.org/\", \"\"))\n .join(taxicab_results, [\"url\"], \"left_anti\")\n .orderBy(F.col(\"source_created_date\").desc())\n .limit(url_limit)\n .drop(\"source_created_date\")\n )\n\n all_urls_pd = all_urls.toPandas()\n\n jsonUrls = [\n {\n \"url\": row[\"url\"],\n \"native_id\": row.get(\"native_id\", \"\"),\n \"native_id_namespace\": row.get(\"native_id_namespace\", \"\")\n }\n for row in all_urls_pd.to_dict('records')\n if row[\"url\"] is not None\n ]\n\ntotal_urls = len(jsonUrls)\npdf_urls = sum(1 for url in jsonUrls if '.pdf' in url['url'].lower())\ndoi_urls_count = sum(1 for url in jsonUrls if 'doi.org' in url['url'].lower())\nother_urls = total_urls - pdf_urls - doi_urls_count\n\nprint(f\"Harvesting {total_urls} URLs ({pdf_urls} PDFs, {doi_urls_count} DOIs, {other_urls} other URLs)\")"
  },
  {
   "cell_type": "code",
@@ -199,7 +189,7 @@
    }
   },
   "outputs": [],
-  "source": "# run it all\nresults = process_urls_with_threadpool(jsonUrls, max_workers=120)\n\nnow = datetime.datetime.now(timezone.utc)\n\nfor result in results:\n result[\"created_date\"] = now\n result[\"processed_date\"] = now\n\n# create DataFrame directly from results and save to table\nresults_df = spark.createDataFrame(results, schema=results_schema)\nresults_df.write.mode(\"append\").format(\"delta\").saveAsTable(\"openalex.taxicab.taxicab_results\")\n\nprint(f\"Updated {results_df.count()} records in the results table\")"
+  "source": "# run it all\nresults = process_urls_with_threadpool(jsonUrls, max_workers=120)\n\nnow = datetime.datetime.now(timezone.utc)\n\nfor result in results:\n result[\"created_date\"] = now\n result[\"processed_date\"] = now\n\n# create DataFrame directly from results and save to table\nresults_df = spark.createDataFrame(results, schema=results_schema)\nresults_df.write.mode(\"append\").format(\"delta\").saveAsTable(\"openalex.taxicab.taxicab_results\")\n\nprint(f\"Updated {results_df.count()} records in the results table\")\n\nif rescrape_queue_only:\n spark.sql(\"TRUNCATE TABLE openalex.taxicab.rescrape_queue\")\n print(\"Rescrape queue cleared\")"
  },
 ],
 "metadata": {
