|
26 | 26 | " - [Find most inefficient jobs hoarding node RAM based on `ram_hoarding_fraction_diff`](#toc1_5_2_1_) \n", |
27 | 27 | " - [Find most inefficient jobs hoarding CPU cores based on `core_hoarding_fraction_diff`](#toc1_5_2_2_) \n", |
28 | 28 | " - [User-Level Analysis](#toc1_6_) \n", |
29 | | - " - [Find Inefficient Users based on `requested_vram_efficiency_score`](#toc1_6_1_) \n", |
30 | | - " - [Generate all hoarding analysis metrics for users:](#toc1_6_2_) \n", |
31 | | - " - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_6_2_1_) \n", |
32 | | - " - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_6_2_2_) \n", |
| 29 | + " - [Find Inefficient Users based on `avg_requested_vram_efficiency_score`](#toc1_6_1_) \n", |
| 30 | + " - [Find Inefficient Users based on `expected_value_requested_vram_efficiency`](#toc1_6_2_) \n", |
| 31 | + " - [Generate all hoarding analysis metrics for users:](#toc1_6_3_) \n", |
| 32 | + " - [Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc1_6_3_1_) \n", |
| 33 | + " - [Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc1_6_3_2_) \n", |
33 | 34 | " - [PI Group Analysis](#toc1_7_) \n", |
34 | 35 | " - [Find Inefficient PIs based on `avg_requested_vram_efficiency_score`](#toc1_7_1_1_) \n", |
35 | 36 | "\n", |
|
325 | 326 | " metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.JOBS_WITH_RESOURCE_HOARDING_METRICS,\n", |
326 | 327 | " sorting_key=\"core_hoarding_fraction_diff\",\n", |
327 | 328 | " ascending=False, # Sort in descending order\n", |
328 | | - " filter_criteria={\"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True}},\n", |
| 329 | + " filter_criteria={\n", |
| 330 | + " \"core_hoarding_fraction_diff\": {\"min\": 0, \"inclusive\": True},\n", |
| 331 | + " \"job_count\": {\"min\": 15, \"inclusive\": True},\n", |
| 332 | + " },\n", |
329 | 333 | ")\n", |
330 | 334 | "\n", |
331 | 335 | "# Plot top inefficient jobs by CPU core hoarding fraction, with CPU core hoarding fraction as labels\n", |
|
360 | 364 | "id": "26", |
361 | 365 | "metadata": {}, |
362 | 366 | "source": [ |
363 | | - "### <a id='toc1_6_1_'></a>[Find Inefficient Users based on `requested_vram_efficiency_score`](#toc0_)" |
| 367 | + "### <a id='toc1_6_1_'></a>[Find Inefficient Users based on `avg_requested_vram_efficiency_score`](#toc0_)" |
364 | 368 | ] |
365 | 369 | }, |
366 | 370 | { |
|
376 | 380 | " ascending=True, # Sort by avg_requested_vram_efficiency_score in ascending order\n", |
377 | 381 | " filter_criteria={\n", |
378 | 382 | " \"avg_requested_vram_efficiency_score\": {\"max\": -10, \"inclusive\": True}, # score threshold\n", |
379 | | - " \"job_count\": {\"min\": 5, \"inclusive\": True}, # minimum job count threshold\n", |
| 383 | + " \"job_count\": {\"min\": 15, \"inclusive\": True}, # minimum job count threshold\n", |
380 | 384 | " },\n", |
381 | 385 | ")\n", |
382 | 386 | "# Plot top inefficient users by Avg Requested VRAM Efficiency Score, with avg_requested_vram_efficiency_score as labels\n", |
|
402 | 406 | ")" |
403 | 407 | ] |
404 | 408 | }, |
| 409 | + { |
| 410 | + "cell_type": "markdown", |
| 411 | + "id": "29", |
| 412 | + "metadata": {}, |
| 413 | + "source": [ |
| 414 | + "### <a id='toc1_6_2_'></a>[Find Inefficient Users based on `expected_value_requested_vram_efficiency`](#toc0_)" |
| 415 | + ] |
| 416 | + }, |
405 | 417 | { |
406 | 418 | "cell_type": "code", |
407 | 419 | "execution_count": null, |
408 | | - "id": "29", |
| 420 | + "id": "30", |
| 421 | + "metadata": {}, |
| 422 | + "outputs": [], |
| 423 | + "source": [ |
| 424 | + "inefficient_users_avg_req_vram_eff_score = analyzer.sort_and_filter_records_with_metrics(\n", |
| 425 | + " metrics_df_name_enum=ResourceHoardingDataFrameNameEnum.USERS,\n", |
| 426 | + " sorting_key=\"expected_value_requested_vram_efficiency\",\n", |
| 427 | + " ascending=True, # Sort by expected_value_requested_vram_efficiency in ascending order\n", |
| 428 | + " filter_criteria={\n", |
| 429 | + " \"job_count\": {\"min\": 15, \"inclusive\": True}, # minimum job count threshold\n", |
| 430 | + " },\n", |
| 431 | + ")\n", |
| 432 | + "# Plot top inefficient users by Expected Value Requested VRAM Efficiency\n", |
| 433 | + "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_avg_req_vram_eff_score.head(10))\n", |
| 434 | + "users_with_metrics_visualizer.visualize(\n", |
| 435 | + " column=\"expected_value_requested_vram_efficiency\",\n", |
| 436 | + " bar_label_columns=[\"vram_hours\", \"job_count\"],\n", |
| 437 | + " figsize=(10, 6),\n", |
| 438 | + " anonymize=True,\n", |
| 439 | + ")" |
| 440 | + ] |
| 441 | + }, |
| 442 | + { |
| 443 | + "cell_type": "code", |
| 444 | + "execution_count": null, |
| 445 | + "id": "31", |
409 | 446 | "metadata": {}, |
410 | 447 | "outputs": [], |
411 | 448 | "source": [ |
|
425 | 462 | }, |
426 | 463 | { |
427 | 464 | "cell_type": "markdown", |
428 | | - "id": "30", |
| 465 | + "id": "32", |
429 | 466 | "metadata": {}, |
430 | 467 | "source": [ |
431 | | - "### <a id='toc1_6_2_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)" |
| 468 | + "### <a id='toc1_6_3_'></a>[Generate all hoarding analysis metrics for users:](#toc0_)" |
432 | 469 | ] |
433 | 470 | }, |
434 | 471 | { |
435 | 472 | "cell_type": "code", |
436 | 473 | "execution_count": null, |
437 | | - "id": "31", |
| 474 | + "id": "33", |
438 | 475 | "metadata": {}, |
439 | 476 | "outputs": [], |
440 | 477 | "source": [ |
|
444 | 481 | }, |
445 | 482 | { |
446 | 483 | "cell_type": "markdown", |
447 | | - "id": "32", |
| 484 | + "id": "34", |
448 | 485 | "metadata": {}, |
449 | 486 | "source": [ |
450 | | - "#### <a id='toc1_6_2_1_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)" |
| 487 | + "#### <a id='toc1_6_3_1_'></a>[Find most inefficient users hoarding node RAM based on `expected_value_ram_hoarding_fraction_diff`](#toc0_)" |
451 | 488 | ] |
452 | 489 | }, |
453 | 490 | { |
454 | 491 | "cell_type": "code", |
455 | 492 | "execution_count": null, |
456 | | - "id": "33", |
| 493 | + "id": "35", |
457 | 494 | "metadata": {}, |
458 | 495 | "outputs": [], |
459 | 496 | "source": [ |
|
476 | 513 | }, |
477 | 514 | { |
478 | 515 | "cell_type": "markdown", |
479 | | - "id": "34", |
| 516 | + "id": "36", |
480 | 517 | "metadata": {}, |
481 | 518 | "source": [ |
482 | | - "#### <a id='toc1_6_2_2_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)" |
| 519 | + "#### <a id='toc1_6_3_2_'></a>[Find most inefficient users hoarding CPU cores based on `expected_value_core_hoarding_fraction_diff`](#toc0_)" |
483 | 520 | ] |
484 | 521 | }, |
485 | 522 | { |
486 | 523 | "cell_type": "code", |
487 | 524 | "execution_count": null, |
488 | | - "id": "35", |
| 525 | + "id": "37", |
489 | 526 | "metadata": {}, |
490 | 527 | "outputs": [], |
491 | 528 | "source": [ |
|
508 | 545 | }, |
509 | 546 | { |
510 | 547 | "cell_type": "markdown", |
511 | | - "id": "36", |
| 548 | + "id": "38", |
512 | 549 | "metadata": {}, |
513 | 550 | "source": [ |
514 | 551 | "## <a id='toc1_7_'></a>[PI Group Analysis](#toc0_)" |
|
517 | 554 | { |
518 | 555 | "cell_type": "code", |
519 | 556 | "execution_count": null, |
520 | | - "id": "37", |
| 557 | + "id": "39", |
521 | 558 | "metadata": {}, |
522 | 559 | "outputs": [], |
523 | 560 | "source": [ |
|
526 | 563 | }, |
527 | 564 | { |
528 | 565 | "cell_type": "markdown", |
529 | | - "id": "38", |
| 566 | + "id": "40", |
530 | 567 | "metadata": {}, |
531 | 568 | "source": [ |
532 | 569 | "#### <a id='toc1_7_1_1_'></a>[Find Inefficient PIs based on `avg_requested_vram_efficiency_score`](#toc0_)" |
|
535 | 572 | { |
536 | 573 | "cell_type": "code", |
537 | 574 | "execution_count": null, |
538 | | - "id": "39", |
| 575 | + "id": "41", |
539 | 576 | "metadata": {}, |
540 | 577 | "outputs": [], |
541 | 578 | "source": [ |
|
0 commit comments