|
233 | 233 | " # populate the in-memory data structure\n", |
234 | 234 | " inflight_configuration_data[crawler_oid][\"domains_temp\"][\n", |
235 | 235 | " domain_oid\n", |
236 | | - " ] = temp_domain_conf" |
| 236 | + " ] = temp_domain_conf\n", |
| 237 | + " print()" |
237 | 238 | ] |
238 | 239 | }, |
239 | 240 | { |
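
For context, the nested structure this cell populates looks roughly like the sketch below; only the two oid levels and the `domains_temp` key come from the diff itself, the inner domain fields are placeholders.

```python
# Sketch of the in-memory shape the cell above populates. The two oid
# levels and the "domains_temp" key appear in the diff; the contents of
# each domain entry are placeholders.
inflight_configuration_data = {
    "crawler-oid-1": {
        "domains_temp": {
            "domain-oid-1": {},  # temp_domain_conf is stored here
        },
    },
}
```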
|
258 | 259 | " _source=[\"configuration_oid\", \"domain_oid\", \"rules\", \"url_filters\"],\n", |
259 | 260 | ")\n", |
260 | 261 | "\n", |
261 | | - "extr_count = 0\n", |
| 262 | + "extr_count = 1\n", |
262 | 263 | "for exr_rule in extraction_rules[\"hits\"][\"hits\"]:\n", |
263 | 264 | " source = exr_rule[\"_source\"]\n", |
264 | 265 | "\n", |
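
The loop walks a standard Elasticsearch search response, where matching documents sit under `hits.hits` and the requested fields under each hit's `_source`. Note that the counter now starts at 1 rather than 0 because, in the next hunk, it is printed as a 1-based list index before being incremented. A minimal sketch of the response shape, with field names taken from the `_source` filter above:

```python
# Minimal sketch of the search response the loop consumes; the field
# names mirror the _source filter in the query above, and the values
# are placeholders.
extraction_rules = {
    "hits": {
        "hits": [
            {
                "_source": {
                    "configuration_oid": "crawler-oid-1",
                    "domain_oid": "domain-oid-1",
                    "rules": [],
                    "url_filters": [],
                }
            }
        ]
    }
}

extr_count = 1  # printed before incrementing, so output reads "1.), 2.), ..."
for exr_rule in extraction_rules["hits"]["hits"]:
    source = exr_rule["_source"]
```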
|
306 | 307 | " \"rules\": ruleset,\n", |
307 | 308 | " }\n", |
308 | 309 | " ]\n", |
| 310 | + "\n", |
| 311 | + " print(\n", |
| 312 | + " f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", |
| 313 | + " )\n", |
309 | 314 | " extr_count += 1\n", |
310 | 315 | " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", |
311 | 316 | " \"extraction_rulesets\"\n", |
312 | | - " ] = temp_extraction_rulesets\n", |
313 | | - "\n", |
314 | | - "print(f\"{extr_count} total extraction rules found!\")" |
| 317 | + " ] = temp_extraction_rulesets" |
315 | 318 | ] |
316 | 319 | }, |
317 | 320 | { |
|
348 | 351 | "\n", |
349 | 352 | " # ? comes from Quartz Cron, regular cron doesn't handle it well\n", |
350 | 353 | " repackaged_definition = repackaged_definition.replace(\"?\", \"*\")\n", |
351 | | - " print(repackaged_definition)\n", |
352 | 354 | " return repackaged_definition\n", |
353 | 355 | "\n", |
354 | 356 | "\n", |
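
Quartz cron uses `?` to mean "no specific value" in the day-of-month and day-of-week fields, a token that plain cron parsers reject, so the function maps it to the closest standard equivalent, `*`. A worked example of that substitution (other repackaging steps, such as field-count handling, are assumed to happen earlier in the function):

```python
# "?" is valid in Quartz cron (day-of-month / day-of-week positions)
# but not in classic cron, so it is swapped for "*".
quartz_definition = "0 0 12 ? * MON-FRI"
print(quartz_definition.replace("?", "*"))  # -> 0 0 12 * * MON-FRI
```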
|
402 | 404 | "source": [ |
403 | 405 | "# Final transform of the in-memory data structure to a form we can dump to YAML\n", |
404 | 406 | "# for each crawler, collect all of its domain configurations into a list\n", |
405 | | - "for crawler_config in inflight_configuration_data.values():\n", |
| 407 | + "for crawler_oid, crawler_config in inflight_configuration_data.items():\n", |
406 | 408 | " all_crawler_domains = []\n", |
407 | 409 | "\n", |
408 | 410 | " for domain_config in crawler_config[\"domains_temp\"].values():\n", |
409 | 411 | " all_crawler_domains.append(domain_config)\n", |
410 | 412 | " # create a new key called \"domains\" that points to a list of domain configs only - no domain_oid values as keys\n", |
411 | 413 | " crawler_config[\"domains\"] = all_crawler_domains\n", |
412 | 414 | " # delete the temporary domain key\n", |
413 | | - " del crawler_config[\"domains_temp\"]" |
| 415 | + " del crawler_config[\"domains_temp\"]\n", |
| 416 | + " print(f\"Transform for {crawler_oid} complete!\")" |
414 | 417 | ] |
415 | 418 | }, |
416 | 419 | { |
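
The effect of this final transform is to turn the oid-keyed `domains_temp` mapping into a plain `domains` list, which serializes to YAML as a sequence rather than a mapping. A condensed sketch with placeholder data:

```python
# Condensed version of the transform above; only the key names come
# from the diff, the domain payloads are placeholders.
inflight_configuration_data = {
    "crawler-oid-1": {
        "domains_temp": {
            "domain-oid-1": {"name": "example.com"},
            "domain-oid-2": {"name": "example.org"},
        },
    },
}

for crawler_oid, crawler_config in inflight_configuration_data.items():
    # "domains" becomes a list of domain configs; the oid keys are dropped
    crawler_config["domains"] = list(crawler_config["domains_temp"].values())
    del crawler_config["domains_temp"]
    print(f"Transform for {crawler_oid} complete!")
```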
|
518 | 521 | " \"--------------------------------------------------------------------------------\"\n", |
519 | 522 | " )" |
520 | 523 | ] |
| 524 | + }, |
| 525 | + { |
| 526 | + "cell_type": "code", |
| 527 | + "execution_count": null, |
| 528 | + "id": "7aaee4e8-c388-4b22-a8ad-a657550d92c7", |
| 529 | + "metadata": {}, |
| 530 | + "outputs": [], |
| 531 | + "source": [] |
521 | 532 | } |
522 | 533 | ], |
523 | 534 | "metadata": { |
|