|
5 | 5 | "id": "0", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | | - "# <a id='toc1_'></a>[Analysis of Jobs that Requested and Used VRAM](#toc0_)\n", |
| 8 | + "# <a id='toc1_'></a>[Analysis of GPU Jobs that Requested and Used VRAM](#toc0_)\n", |
9 | 9 | "This notebook generates the analysis for jobs that requested some VRAM and run on partitions that their type is GPU and some GPU VRAM is used. It looks at these jobs, corresponding users, and PI groups." |
10 | 10 | ] |
11 | 11 | }, |
|
15 | 15 | "metadata": {}, |
16 | 16 | "source": [ |
17 | 17 | "**Table of contents**<a id='toc0_'></a> \n", |
18 | | - "- [Analysis of Jobs that Requested and Used VRAM](#toc1_) \n", |
| 18 | + "- [Analysis of GPU Jobs that Requested and Used VRAM](#toc1_) \n", |
19 | 19 | " - [Setup](#toc1_1_) \n", |
20 | 20 | " - [Data Digestion and Preprocessing](#toc1_2_) \n", |
21 | 21 | " - [Narrowing Dataset to Relevant Partition](#toc1_3_) \n", |
|
61 | 61 | "# Import required modules\n", |
62 | 62 | "import sys\n", |
63 | 63 | "from pathlib import Path\n", |
64 | | - "import pandas as pd\n", |
65 | | - "import matplotlib.pyplot as plt\n", |
66 | | - "import seaborn as sns" |
| 64 | + "import pandas as pd" |
67 | 65 | ] |
68 | 66 | }, |
69 | 67 | { |
|
102 | 100 | "\n", |
103 | 101 | "from src.analysis import ResourceHoarding\n", |
104 | 102 | "from src.analysis import efficiency_analysis as ea\n", |
105 | | - "from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer, PIGroupsWithMetricsVisualizer\n", |
| 103 | + "from src.visualization import (\n", |
| 104 | + " JobsWithMetricsVisualizer,\n", |
| 105 | + " UsersWithMetricsVisualizer,\n", |
| 106 | + " PIGroupsWithMetricsVisualizer,\n", |
| 107 | + ")\n", |
106 | 108 | "from src.config.enum_constants import ResourceHoardingDataFrameNameEnum\n", |
107 | | - "from src.config.paths import PI_GROUPS_VISUALIZATION_DATA_DIR" |
| 109 | + "from src.config.paths import (\n", |
| 110 | + " JOBS_VISUALIZATION_DATA_DIR,\n", |
| 111 | + " USERS_VISUALIZATION_DATA_DIR,\n", |
| 112 | + " PI_GROUPS_VISUALIZATION_DATA_DIR,\n", |
| 113 | + ")" |
108 | 114 | ] |
109 | 115 | }, |
110 | 116 | { |
|
239 | 245 | "# Plot top inefficient jobs by requested VRAM efficiency score, with VRAM-hours as labels\n", |
240 | 246 | "jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_vram_hours.head(10))\n", |
241 | 247 | "jobs_with_metrics_visualizer.visualize(\n", |
242 | | - " column=\"requested_vram_efficiency_score\", bar_label_columns=[\"vram_hours\", \"allocated_vram\"], figsize=(10, 6)\n", |
| 248 | + " output_dir_path=JOBS_VISUALIZATION_DATA_DIR,\n", |
| 249 | + " column=\"requested_vram_efficiency_score\",\n", |
| 250 | + " bar_label_columns=[\"vram_hours\", \"allocated_vram\"],\n", |
| 251 | + " figsize=(10, 6),\n", |
243 | 252 | ")" |
244 | 253 | ] |
245 | 254 | }, |
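Note on the `visualize` change above: the call now takes an explicit `output_dir_path` (here `JOBS_VISUALIZATION_DATA_DIR`), which suggests the visualizer persists its figures to disk rather than only rendering them inline. A minimal sketch of that pattern, assuming matplotlib under the hood; `save_bar_chart` is a hypothetical helper, not the repo's actual API:

```python
# Minimal sketch of a plot-and-save routine keyed on output_dir_path.
# Assumptions: matplotlib backs the plotting; save_bar_chart is hypothetical.
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd


def save_bar_chart(df: pd.DataFrame, column: str, output_dir_path: Path, figsize=(10, 6)) -> Path:
    """Plot df[column] as a bar chart and write the figure under output_dir_path."""
    output_dir = Path(output_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)  # create the target directory if needed
    fig, ax = plt.subplots(figsize=figsize)
    df[column].plot.bar(ax=ax)
    ax.set_ylabel(column)
    fig.tight_layout()
    out_file = output_dir / f"{column}.png"
    fig.savefig(out_file, dpi=150)
    plt.close(fig)  # free the figure after saving
    return out_file
```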
|
399 | 408 | "metadata": {}, |
400 | 409 | "outputs": [], |
401 | 410 | "source": [ |
402 | | - "# Distribution of Avg Requested VRAM Efficiency Score (actual values; all are <= 0)\n", |
403 | | - "# We keep scores as-is (negative or zero) and construct bins that respect the skew while\n", |
404 | | - "# still giving higher resolution near zero using log-spaced absolute values mapped back to negatives.\n", |
405 | | - "scores = inefficient_users_avg_req_vram_eff_score[\"avg_requested_vram_efficiency_score\"].dropna()\n", |
406 | | - "print(len(scores), \"scores found for plotting.\")\n", |
407 | | - "if scores.empty:\n", |
408 | | - " print(\"No scores to plot.\")\n", |
409 | | - "else:\n", |
410 | | - " # If all scores are exactly zero, a histogram is not informative\n", |
411 | | - " if (scores != 0).sum() == 0:\n", |
412 | | - " print(\"All scores are zero; histogram not informative.\")\n", |
413 | | - " else:\n", |
414 | | - " import numpy as np\n", |
415 | | - "\n", |
416 | | - " fig, ax = plt.subplots(figsize=(8, 5))\n", |
417 | | - "\n", |
418 | | - " # Separate negatives (expected) from zeros\n", |
419 | | - " neg_scores = scores[scores < 0]\n", |
420 | | - " zero_scores = scores[scores == 0]\n", |
421 | | - "\n", |
422 | | - " min_abs = None # track smallest non-zero absolute value for symlog threshold\n", |
423 | | - "\n", |
424 | | - " # Build bins: if we have negative values, create log-spaced absolute edges then map back\n", |
425 | | - " if not neg_scores.empty:\n", |
426 | | - " n_bins = 100\n", |
427 | | - " min_abs = neg_scores.abs().min()\n", |
428 | | - " max_abs = neg_scores.abs().max()\n", |
429 | | - " if min_abs == max_abs:\n", |
430 | | - " # Degenerate case: all negative values identical -> fall back to linear bins\n", |
431 | | - " bins = np.linspace(neg_scores.min(), 0, 20)\n", |
432 | | - " else:\n", |
433 | | - " abs_edges = np.logspace(np.log10(min_abs), np.log10(max_abs), n_bins)\n", |
434 | | - " # Convert absolute edges to negative edges (descending), then append 0 as the last edge\n", |
435 | | - " neg_edges = -abs_edges[::-1]\n", |
436 | | - " bins = np.unique(np.concatenate([neg_edges, [0]])) # ensure strictly increasing\n", |
437 | | - " else:\n", |
438 | | - " # No negative values (only zeros) already handled earlier; fallback just in case\n", |
439 | | - " bins = 3\n", |
440 | | - "\n", |
441 | | - " sns.histplot(scores, bins=bins, color=\"#1f77b4\", ax=ax)\n", |
442 | | - " ax.set_xlabel(\"Avg Requested VRAM Efficiency Score (<= 0)\")\n", |
443 | | - " ax.set_ylabel(\"Count\")\n", |
444 | | - " ax.set_title(\"Distribution of Avg Requested VRAM Efficiency Scores (Actual Values, Log X)\")\n", |
445 | | - "\n", |
446 | | - " # Apply symmetrical log scale to x-axis to compress the long negative tail while keeping zero.\n", |
447 | | - " # linthresh defines the range around zero that stays linear; choose smallest non-zero magnitude.\n", |
448 | | - " if min_abs is not None and min_abs > 0:\n", |
449 | | - " linthresh = min_abs\n", |
450 | | - " else:\n", |
451 | | - " linthresh = 1e-6 # fallback small threshold\n", |
452 | | - " ax.set_xscale(\"symlog\", linthresh=linthresh, linscale=1.0, base=10)\n", |
453 | | - "\n", |
454 | | - " # Annotation: counts (negative & zero) and total\n", |
455 | | - " neg_count = (scores < 0).sum()\n", |
456 | | - " zero_count = (scores == 0).sum()\n", |
457 | | - " total = len(scores)\n", |
458 | | - " ax.text(\n", |
459 | | - " 0.98,\n", |
460 | | - " 0.95,\n", |
461 | | - " (f\"Counts:\\nNegative: {neg_count}\\nZero: {zero_count}\\n# of Users: {total}\"),\n", |
462 | | - " transform=ax.transAxes,\n", |
463 | | - " ha=\"right\",\n", |
464 | | - " va=\"top\",\n", |
465 | | - " fontsize=9,\n", |
466 | | - " bbox=dict(boxstyle=\"round,pad=0.3\", fc=\"white\", ec=\"gray\", alpha=0.9),\n", |
467 | | - " )\n", |
468 | | - "\n", |
469 | | - " # Cumulative distribution (CDF) over actual score values\n", |
470 | | - " counts, bin_edges = np.histogram(scores, bins=bins)\n", |
471 | | - " cdf = np.cumsum(counts) / counts.sum()\n", |
472 | | - " mids = (bin_edges[1:] + bin_edges[:-1]) / 2\n", |
473 | | - " ax2 = ax.twinx()\n", |
474 | | - " ax2.plot(mids, cdf, color=\"crimson\", marker=\"o\", linestyle=\"-\", linewidth=1, markersize=3)\n", |
475 | | - " ax2.set_ylim(0, 1)\n", |
476 | | - " ax2.set_ylabel(\"Cumulative Fraction\", color=\"crimson\")\n", |
477 | | - " ax2.tick_params(axis=\"y\", colors=\"crimson\")\n", |
478 | | - "\n", |
479 | | - " # Ensure x-axis spans to (slightly) include zero for clarity\n", |
480 | | - " left, right = ax.get_xlim()\n", |
481 | | - " if right < 0:\n", |
482 | | - " ax.set_xlim(left, 0)\n", |
483 | | - "\n", |
484 | | - " plt.tight_layout()\n", |
485 | | - " plt.show()\n", |
486 | | - "\n", |
487 | | - "# Notes:\n", |
488 | | - "# - We plot the actual (negative/zero) scores instead of absolute values.\n", |
489 | | - "# - symlog x-scale provides a log-like compression for large negative magnitudes while keeping zero.\n", |
490 | | - "# - linthresh picks the smallest non-zero magnitude so near-zero structure is visible.\n", |
491 | | - "# - CDF is computed over actual values to show accumulation from most negative toward zero." |
| 411 | + "users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_avg_req_vram_eff_score)\n", |
| 412 | + "users_with_metrics_visualizer.visualize_metric_distribution(\n", |
| 413 | + " output_dir_path=USERS_VISUALIZATION_DATA_DIR, column=\"avg_requested_vram_efficiency_score\", figsize=(8, 5)\n", |
| 414 | + ")" |
492 | 415 | ] |
493 | 416 | }, |
494 | 417 | { |
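The deleted cell's symlog histogram with CDF overlay appears to have moved behind `UsersWithMetricsVisualizer.visualize_metric_distribution`. The method name and arguments come from the diff; the body below is only a reconstruction of the removed notebook code, condensed into one self-contained function, not the repo's implementation:

```python
# Sketch reconstructed from the removed cell: histogram of non-positive scores with
# log-spaced negative bins, a symlog x-axis, and a CDF overlay.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def plot_metric_distribution(scores: pd.Series, figsize=(8, 5)) -> None:
    scores = scores.dropna()
    neg = scores[scores < 0]
    if neg.empty:
        print("No negative scores; histogram not informative.")
        return
    # Log-spaced absolute edges mapped back to negatives keep resolution near zero.
    min_abs, max_abs = neg.abs().min(), neg.abs().max()
    if min_abs == max_abs:
        bins = np.linspace(neg.min(), 0, 20)  # all negatives identical: linear fallback
    else:
        abs_edges = np.logspace(np.log10(min_abs), np.log10(max_abs), 100)
        bins = np.unique(np.concatenate([-abs_edges[::-1], [0.0]]))
    fig, ax = plt.subplots(figsize=figsize)
    sns.histplot(scores, bins=bins, color="#1f77b4", ax=ax)
    # symlog compresses the long negative tail while keeping zero on the axis.
    ax.set_xscale("symlog", linthresh=float(min_abs), linscale=1.0, base=10)
    ax.set_xlabel("Avg Requested VRAM Efficiency Score (<= 0)")
    ax.set_ylabel("Count")
    # CDF over the same bins shows accumulation from most negative toward zero.
    counts, edges = np.histogram(scores, bins=bins)
    ax2 = ax.twinx()
    ax2.plot((edges[1:] + edges[:-1]) / 2, np.cumsum(counts) / counts.sum(), color="crimson")
    ax2.set_ylim(0, 1)
    ax2.set_ylabel("Cumulative Fraction", color="crimson")
    plt.tight_layout()
    plt.show()
```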
|