diff --git a/_config.yml b/_config.yml index 1e1a45eb..89927eb0 100644 --- a/_config.yml +++ b/_config.yml @@ -33,6 +33,11 @@ html: - _static/overlay-label-accessibility.js - _static/search_bar_submit_button.js +sphinx: + config: + html_extra_path: + - _static/ + launch_buttons: thebe : false @@ -119,3 +124,9 @@ execute: - 'short_courses/python_language_features.ipynb' - 'short_courses/python_testing.ipynb' - 'individual_modules/markdown_with_python/liveOutput.ipynb' + - 'individual_modules/intro_to_GPUs/theory.ipynb' + - 'individual_modules/intro_to_GPUs/spack.ipynb' + - 'individual_modules/intro_to_GPUs/slurm.ipynb' + - 'individual_modules/intro_to_GPUs/conways_game_of_life.ipynb' + - 'individual_modules/intro_to_GPUs/profiling.ipynb' + - 'individual_modules/intro_to_GPUs/temperature_diffusion.ipynb' \ No newline at end of file diff --git a/_static/intro_to_gpu_figures/all_methods_hardware.png b/_static/intro_to_gpu_figures/all_methods_hardware.png new file mode 100644 index 00000000..ff490fb4 Binary files /dev/null and b/_static/intro_to_gpu_figures/all_methods_hardware.png differ diff --git a/_static/intro_to_gpu_figures/cupy_across_hardware.png b/_static/intro_to_gpu_figures/cupy_across_hardware.png new file mode 100644 index 00000000..cb0354b3 Binary files /dev/null and b/_static/intro_to_gpu_figures/cupy_across_hardware.png differ diff --git a/_static/intro_to_gpu_figures/game_of_life_example.gif b/_static/intro_to_gpu_figures/game_of_life_example.gif new file mode 100644 index 00000000..c43598f4 Binary files /dev/null and b/_static/intro_to_gpu_figures/game_of_life_example.gif differ diff --git a/_static/intro_to_gpu_figures/naive_across_hardware.png b/_static/intro_to_gpu_figures/naive_across_hardware.png new file mode 100644 index 00000000..57939087 Binary files /dev/null and b/_static/intro_to_gpu_figures/naive_across_hardware.png differ diff --git a/_static/intro_to_gpu_figures/numpy_across_hardware.png b/_static/intro_to_gpu_figures/numpy_across_hardware.png new file mode 100644 index 00000000..d5c9c507 Binary files /dev/null and b/_static/intro_to_gpu_figures/numpy_across_hardware.png differ diff --git a/_static/intro_to_gpu_figures/perf_amd_epyc_7v12_64-core_processor_nv_a100.png b/_static/intro_to_gpu_figures/perf_amd_epyc_7v12_64-core_processor_nv_a100.png new file mode 100644 index 00000000..f275211e Binary files /dev/null and b/_static/intro_to_gpu_figures/perf_amd_epyc_7v12_64-core_processor_nv_a100.png differ diff --git a/_static/intro_to_gpu_figures/perf_amd_epyc_9v84_96-core_processor_nv_h100.png b/_static/intro_to_gpu_figures/perf_amd_epyc_9v84_96-core_processor_nv_h100.png new file mode 100644 index 00000000..254db46f Binary files /dev/null and b/_static/intro_to_gpu_figures/perf_amd_epyc_9v84_96-core_processor_nv_h100.png differ diff --git a/_static/intro_to_gpu_figures/perf_amd_ryzen_9_5950x_16-core_processor_nv_rtx_3070.png b/_static/intro_to_gpu_figures/perf_amd_ryzen_9_5950x_16-core_processor_nv_rtx_3070.png new file mode 100644 index 00000000..f74c5199 Binary files /dev/null and b/_static/intro_to_gpu_figures/perf_amd_ryzen_9_5950x_16-core_processor_nv_rtx_3070.png differ diff --git a/_static/intro_to_gpu_figures/temperature_cube.html b/_static/intro_to_gpu_figures/temperature_cube.html new file mode 100644 index 00000000..0db175ff --- /dev/null +++ b/_static/intro_to_gpu_figures/temperature_cube.html @@ -0,0 +1,18 @@ + + + +
+
+ + \ No newline at end of file diff --git a/_static/intro_to_gpu_figures/temperature_diffusion_timings.csv b/_static/intro_to_gpu_figures/temperature_diffusion_timings.csv new file mode 100644 index 00000000..89b26001 --- /dev/null +++ b/_static/intro_to_gpu_figures/temperature_diffusion_timings.csv @@ -0,0 +1,13 @@ +gpu_name,cpu_name,method,num_timesteps,mean_time_sec,std_dev_sec +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,Pure Python,10,43.698821,0.084230 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,Pure Python,25,110.014916,0.939186 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,Pure Python,50,223.748059,1.287061 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,Pure Python,100,461.348654,7.234776 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,NumPy (CPU),10,4.980454,0.173302 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,NumPy (CPU),25,22.786639,0.205813 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,NumPy (CPU),50,86.931572,0.502695 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,NumPy (CPU),100,347.883550,3.980833 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,CuPy (GPU),10,3.854066,0.140653 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,CuPy (GPU),25,12.885241,0.022276 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,CuPy (GPU),50,46.622783,0.178398 +NVIDIA H100 NVL,AMD EPYC 9V84 96-Core Processor,CuPy (GPU),100,182.261322,0.193946 diff --git a/_static/intro_to_gpu_figures/temperature_diffusion_timings.png b/_static/intro_to_gpu_figures/temperature_diffusion_timings.png new file mode 100644 index 00000000..eeaa61ca Binary files /dev/null and b/_static/intro_to_gpu_figures/temperature_diffusion_timings.png differ diff --git a/_static/intro_to_gpu_figures/temperature_slice.html b/_static/intro_to_gpu_figures/temperature_slice.html new file mode 100644 index 00000000..7e17843f --- /dev/null +++ b/_static/intro_to_gpu_figures/temperature_slice.html @@ -0,0 +1,18 @@ + + + +
+
+ + \ No newline at end of file diff --git a/_static/multiple_regression_hyperplane.html b/_static/multiple_regression_hyperplane.html index e657a871..ef82e7e2 100644 --- a/_static/multiple_regression_hyperplane.html +++ b/_static/multiple_regression_hyperplane.html @@ -2,6 +2,6 @@
-
+
\ No newline at end of file diff --git a/_static/multiple_regression_hyperplane_2.html b/_static/multiple_regression_hyperplane_2.html index eac56f47..1da15288 100644 --- a/_static/multiple_regression_hyperplane_2.html +++ b/_static/multiple_regression_hyperplane_2.html @@ -2,6 +2,6 @@
-
+
\ No newline at end of file diff --git a/_static/workshop_prereqs_advanced_regression_analysis_with_r_.html b/_static/workshop_prereqs_advanced_regression_analysis_with_r_.html index 1c628a68..0d276aff 100644 --- a/_static/workshop_prereqs_advanced_regression_analysis_with_r_.html +++ b/_static/workshop_prereqs_advanced_regression_analysis_with_r_.html @@ -282,8 +282,8 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Regression Analysis with R Adapting to Varied Data Types", "label": "Regression Analysis with R Adapting to Varied Data Types", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Regression Analysis with R Adapting to Varied Data Types\nCourse Pre-reqs: Introduction to Regression with R\nSubsequent Courses: Advanced Regression Analysis With R "}, {"color": "#B0B0B0", "id": "Advanced Regression Analysis With R ", "label": "Advanced Regression Analysis With R ", "level": 4, "shape": "dot", "size": 10, "title": "Course Name: Advanced Regression Analysis With R \nCourse Pre-reqs: Regression Analysis with R Adapting to Varied Data Types\nSubsequent Courses: None"}, {"color": "#B0B0B0", "id": "Introduction to Regression with R", "label": "Introduction to Regression with R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Regression with R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: Regression Analysis with R Adapting to Varied Data Types"}, {"color": "#B0B0B0", "id": "Introduction to R", "label": "Introduction to R", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to R\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Regression with R"}]); - edges = new vis.DataSet([{"arrows": "to", "from": "Regression Analysis with R Adapting to Varied Data Types", "to": "Advanced Regression Analysis With R ", "width": 1}, {"arrows": "to", "from": "Introduction to Regression with R", "to": "Regression Analysis with R Adapting to Varied Data Types", "width": 1}, {"arrows": "to", "from": "Introduction to R", "to": "Introduction to Regression with R", "width": 1}]); + nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Regression Analysis with R Adapting to Varied Data Types", "label": "Regression Analysis with R Adapting to Varied Data Types", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Regression Analysis with R Adapting to Varied Data Types\nCourse Pre-reqs: Introduction to Regression with R\nSubsequent Courses: Advanced Regression Analysis With R "}, {"color": "#B0B0B0", "id": "Advanced Regression Analysis With R ", "label": "Advanced Regression Analysis With R ", "level": 4, "shape": "dot", "size": 10, "title": "Course Name: Advanced Regression Analysis With R \nCourse Pre-reqs: Regression Analysis with R Adapting to Varied Data Types\nSubsequent Courses: None"}, {"color": "#B0B0B0", "id": "Introduction to R", "label": "Introduction to R", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to R\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Regression with R"}, {"color": "#B0B0B0", "id": "Introduction to Regression with R", "label": "Introduction to Regression with R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Regression with R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: Regression Analysis with R Adapting to Varied Data Types"}]); + edges = new vis.DataSet([{"arrows": "to", "from": "Regression Analysis with R Adapting to Varied Data Types", "to": "Advanced Regression Analysis With R ", "width": 1}, {"arrows": "to", "from": "Introduction to R", "to": "Introduction to Regression with R", "width": 1}, {"arrows": "to", "from": "Introduction to Regression with R", "to": "Regression Analysis with R Adapting to Varied Data Types", "width": 1}]); nodeColors = {}; allNodes = nodes.get({ returnType: "Object" 
}); diff --git a/_static/workshop_prereqs_introduction_to_machine_learning.html b/_static/workshop_prereqs_introduction_to_machine_learning.html index e0e3f03b..1ae88bbc 100644 --- a/_static/workshop_prereqs_introduction_to_machine_learning.html +++ b/_static/workshop_prereqs_introduction_to_machine_learning.html @@ -282,8 +282,8 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Python for Data Analysis", "label": "Python for Data Analysis", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Python for Data Analysis\nCourse Pre-reqs: Introduction to Python\nSubsequent Courses: Introduction to Machine Learning"}, {"color": "#B0B0B0", "id": "Introduction to Machine Learning", "label": "Introduction to Machine Learning", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Machine Learning\nCourse Pre-reqs: Python for Data Analysis\nSubsequent Courses: None"}, {"color": "#B0B0B0", "id": "Introduction to Python", "label": "Introduction to Python", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Python\nCourse Pre-reqs: None\nSubsequent Courses: Python for Data Analysis"}]); - edges = new vis.DataSet([{"arrows": "to", "from": "Python for Data Analysis", "to": "Introduction to Machine Learning", "width": 1}, {"arrows": "to", "from": "Introduction to Python", "to": "Python for Data Analysis", "width": 1}]); + nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Introduction to Python", "label": "Introduction to Python", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Python\nCourse Pre-reqs: None\nSubsequent Courses: Python for Data Analysis"}, {"color": "#B0B0B0", "id": "Python for Data Analysis", "label": "Python for Data Analysis", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Python for Data Analysis\nCourse Pre-reqs: Introduction to Python\nSubsequent Courses: Introduction to Machine Learning"}, {"color": "#B0B0B0", "id": "Introduction to Machine Learning", "label": "Introduction to Machine Learning", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Machine Learning\nCourse Pre-reqs: Python for Data Analysis\nSubsequent Courses: None"}]); + edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to Python", "to": "Python for Data Analysis", "width": 1}, {"arrows": "to", "from": "Python for Data Analysis", "to": "Introduction to Machine Learning", "width": 1}]); nodeColors = {}; allNodes = nodes.get({ returnType: "Object" }); diff --git a/_static/workshop_prereqs_parallel_computing.html b/_static/workshop_prereqs_parallel_computing.html index 54885b27..fe8a9c41 100644 --- a/_static/workshop_prereqs_parallel_computing.html +++ b/_static/workshop_prereqs_parallel_computing.html @@ -282,8 +282,8 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Introduction to Unix", "label": "Introduction to Unix", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Unix\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to HPC"}, {"color": "#B0B0B0", "id": "Introduction to HPC", "label": "Introduction to HPC", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to HPC\nCourse Pre-reqs: Introduction to Unix\nSubsequent Courses: Parallel Computing"}, {"color": "#B0B0B0", "id": "Parallel Computing", "label": "Parallel Computing", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Parallel Computing\nCourse Pre-reqs: Introduction to HPC\nSubsequent Courses: None"}]); - edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to Unix", "to": "Introduction to HPC", "width": 1}, {"arrows": "to", "from": "Introduction to HPC", "to": "Parallel Computing", "width": 1}]); + nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Introduction to HPC", "label": "Introduction to HPC", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to HPC\nCourse Pre-reqs: Introduction to Unix\nSubsequent Courses: Parallel Computing"}, {"color": "#B0B0B0", "id": "Parallel Computing", "label": "Parallel Computing", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Parallel Computing\nCourse Pre-reqs: Introduction to HPC\nSubsequent Courses: None"}, {"color": "#B0B0B0", "id": "Introduction to Unix", "label": "Introduction to Unix", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Unix\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to HPC"}]); + edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to HPC", "to": "Parallel Computing", "width": 1}, {"arrows": "to", "from": "Introduction to Unix", "to": "Introduction to HPC", "width": 1}]); nodeColors = {}; allNodes = nodes.get({ returnType: "Object" }); diff --git a/_static/workshop_prereqs_regression_analysis_with_r_adapting_to_varied_data_types.html b/_static/workshop_prereqs_regression_analysis_with_r_adapting_to_varied_data_types.html index c2443843..ebaa7777 100644 --- a/_static/workshop_prereqs_regression_analysis_with_r_adapting_to_varied_data_types.html +++ b/_static/workshop_prereqs_regression_analysis_with_r_adapting_to_varied_data_types.html @@ -282,8 +282,8 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Introduction to Regression with R", "label": "Introduction to Regression with R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Regression with R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: Regression Analysis with R Adapting to Varied Data Types"}, {"color": "#B0B0B0", "id": "Regression Analysis with R Adapting to Varied Data Types", "label": "Regression Analysis with R Adapting to Varied Data Types", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Regression Analysis with R Adapting to Varied Data Types\nCourse Pre-reqs: Introduction to Regression with R\nSubsequent Courses: None"}, {"color": "#B0B0B0", "id": "Introduction to R", "label": "Introduction to R", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to R\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Regression with R"}]); - edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to Regression with R", "to": "Regression Analysis with R Adapting to Varied Data Types", "width": 1}, {"arrows": "to", "from": "Introduction to R", "to": "Introduction to Regression with R", "width": 1}]); + nodes = new vis.DataSet([{"color": "#B0B0B0", "id": "Introduction to R", "label": "Introduction to R", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to R\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Regression with R"}, {"color": "#B0B0B0", "id": "Introduction to Regression with R", "label": "Introduction to Regression with R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Regression with R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: Regression Analysis with R Adapting to Varied Data Types"}, {"color": "#B0B0B0", "id": "Regression Analysis with R Adapting to Varied Data Types", "label": "Regression Analysis with R Adapting to Varied Data Types", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Regression Analysis with R Adapting to Varied Data Types\nCourse Pre-reqs: Introduction to Regression with R\nSubsequent Courses: None"}]); + edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to R", "to": "Introduction to Regression with R", "width": 1}, {"arrows": "to", "from": "Introduction to Regression with R", "to": "Regression Analysis with R Adapting to Varied Data Types", "width": 1}]); nodeColors = {}; allNodes = nodes.get({ returnType: "Object" }); diff --git a/_static/workshops_network_python_ds.html b/_static/workshops_network_python_ds.html index d430b69f..0a25753a 100644 --- a/_static/workshops_network_python_ds.html +++ b/_static/workshops_network_python_ds.html @@ -282,7 +282,7 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"color": "#FFD700", "id": "Introduction to Version Control with Git and GitHub", "label": "Introduction to Version Control with Git and GitHub", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Version Control with Git and GitHub\nCourse Pre-reqs: Introduction to Unix\nSubsequent Courses: Intermediate Version Control"}, {"color": "#FFD700", "id": "Intermediate Version Control", "label": "Intermediate Version Control", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Intermediate Version Control\nCourse Pre-reqs: Introduction to Version Control with Git and GitHub\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Introduction to Unix", "label": "Introduction to Unix", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Unix\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Version Control with Git and GitHub"}, {"color": "#FFD700", "id": "Introduction to Python", "label": "Introduction to Python", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Python\nCourse Pre-reqs: None\nSubsequent Courses: Python for Data Analysis, Using Markdown for Python"}, {"color": "#FFD700", "id": "Python for Data Analysis", "label": "Python for Data Analysis", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Python for Data Analysis\nCourse Pre-reqs: Introduction to Python\nSubsequent Courses: Introduction to Machine Learning"}, {"color": "#FFD700", "id": "Using Markdown for Python", "label": "Using Markdown for Python", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Using Markdown for Python\nCourse Pre-reqs: Introduction to Python\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Introduction to Machine Learning", "label": "Introduction to Machine Learning", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Machine Learning\nCourse Pre-reqs: Python for Data Analysis\nSubsequent Courses: None"}]); + nodes = new vis.DataSet([{"color": "#4682B4", "id": "Introduction to Version Control with Git and GitHub", "label": "Introduction to Version Control with Git and GitHub", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Version Control with Git and GitHub\nCourse Pre-reqs: Introduction to Unix\nSubsequent Courses: Intermediate Version Control"}, {"color": "#4682B4", "id": "Intermediate Version Control", "label": "Intermediate Version Control", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Intermediate Version Control\nCourse Pre-reqs: Introduction to Version Control with Git and GitHub\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Introduction to Unix", "label": "Introduction to Unix", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Unix\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Version Control with Git and GitHub"}, {"color": "#4682B4", "id": "Introduction to Python", "label": "Introduction to Python", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Python\nCourse Pre-reqs: None\nSubsequent Courses: Python for Data Analysis, Using Markdown for Python"}, {"color": "#4682B4", "id": "Python for Data Analysis", "label": "Python for Data Analysis", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Python for Data Analysis\nCourse Pre-reqs: Introduction to Python\nSubsequent Courses: Introduction to Machine 
Learning"}, {"color": "#4682B4", "id": "Using Markdown for Python", "label": "Using Markdown for Python", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Using Markdown for Python\nCourse Pre-reqs: Introduction to Python\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Introduction to Machine Learning", "label": "Introduction to Machine Learning", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Machine Learning\nCourse Pre-reqs: Python for Data Analysis\nSubsequent Courses: None"}]); edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to Version Control with Git and GitHub", "to": "Intermediate Version Control", "width": 1}, {"arrows": "to", "from": "Introduction to Unix", "to": "Introduction to Version Control with Git and GitHub", "width": 1}, {"arrows": "to", "from": "Introduction to Python", "to": "Python for Data Analysis", "width": 1}, {"arrows": "to", "from": "Introduction to Python", "to": "Using Markdown for Python", "width": 1}, {"arrows": "to", "from": "Python for Data Analysis", "to": "Introduction to Machine Learning", "width": 1}]); nodeColors = {}; diff --git a/_static/workshops_network_r_ds.html b/_static/workshops_network_r_ds.html index fc6ca45f..73fcc6fe 100644 --- a/_static/workshops_network_r_ds.html +++ b/_static/workshops_network_r_ds.html @@ -282,7 +282,7 @@

// parsing and collecting nodes and edges from the python - nodes = new vis.DataSet([{"color": "#4682B4", "id": "Introduction to R", "label": "Introduction to R", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to R\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Regression with R, Working With Data In R, Improve Your R Code, Introduction to Markdown in R"}, {"color": "#4682B4", "id": "Introduction to Regression with R", "label": "Introduction to Regression with R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Regression with R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: Regression Analysis with R Adapting to Varied Data Types"}, {"color": "#4682B4", "id": "Working With Data In R", "label": "Working With Data In R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Working With Data In R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Improve Your R Code", "label": "Improve Your R Code", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Improve Your R Code\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Introduction to Markdown in R", "label": "Introduction to Markdown in R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Markdown in R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Regression Analysis with R Adapting to Varied Data Types", "label": "Regression Analysis with R Adapting to Varied Data Types", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Regression Analysis with R Adapting to Varied Data Types\nCourse Pre-reqs: Introduction to Regression with R\nSubsequent Courses: Advanced Regression Analysis With R "}, {"color": "#4682B4", "id": "Advanced Regression Analysis With R ", "label": "Advanced Regression Analysis With R ", "level": 4, "shape": "dot", "size": 10, "title": "Course Name: Advanced Regression Analysis With R \nCourse Pre-reqs: Regression Analysis with R Adapting to Varied Data Types\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Introduction to Version Control with Git and GitHub", "label": "Introduction to Version Control with Git and GitHub", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Version Control with Git and GitHub\nCourse Pre-reqs: Introduction to Unix\nSubsequent Courses: Intermediate Version Control"}, {"color": "#4682B4", "id": "Intermediate Version Control", "label": "Intermediate Version Control", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Intermediate Version Control\nCourse Pre-reqs: Introduction to Version Control with Git and GitHub\nSubsequent Courses: None"}, {"color": "#4682B4", "id": "Introduction to Unix", "label": "Introduction to Unix", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Unix\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Version Control with Git and GitHub"}]); + nodes = new vis.DataSet([{"color": "#FFD700", "id": "Introduction to R", "label": "Introduction to R", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to R\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Regression with R, Working With Data In R, Improve Your R Code, Introduction to Markdown in R"}, {"color": "#FFD700", "id": "Introduction to Regression with R", "label": "Introduction to Regression with R", "level": 2, "shape": 
"dot", "size": 10, "title": "Course Name: Introduction to Regression with R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: Regression Analysis with R Adapting to Varied Data Types"}, {"color": "#FFD700", "id": "Working With Data In R", "label": "Working With Data In R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Working With Data In R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Improve Your R Code", "label": "Improve Your R Code", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Improve Your R Code\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Introduction to Markdown in R", "label": "Introduction to Markdown in R", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Markdown in R\nCourse Pre-reqs: Introduction to R\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Regression Analysis with R Adapting to Varied Data Types", "label": "Regression Analysis with R Adapting to Varied Data Types", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Regression Analysis with R Adapting to Varied Data Types\nCourse Pre-reqs: Introduction to Regression with R\nSubsequent Courses: Advanced Regression Analysis With R "}, {"color": "#FFD700", "id": "Advanced Regression Analysis With R ", "label": "Advanced Regression Analysis With R ", "level": 4, "shape": "dot", "size": 10, "title": "Course Name: Advanced Regression Analysis With R \nCourse Pre-reqs: Regression Analysis with R Adapting to Varied Data Types\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Introduction to Version Control with Git and GitHub", "label": "Introduction to Version Control with Git and GitHub", "level": 2, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Version Control with Git and GitHub\nCourse Pre-reqs: Introduction to Unix\nSubsequent Courses: Intermediate Version Control"}, {"color": "#FFD700", "id": "Intermediate Version Control", "label": "Intermediate Version Control", "level": 3, "shape": "dot", "size": 10, "title": "Course Name: Intermediate Version Control\nCourse Pre-reqs: Introduction to Version Control with Git and GitHub\nSubsequent Courses: None"}, {"color": "#FFD700", "id": "Introduction to Unix", "label": "Introduction to Unix", "level": 1, "shape": "dot", "size": 10, "title": "Course Name: Introduction to Unix\nCourse Pre-reqs: None\nSubsequent Courses: Introduction to Version Control with Git and GitHub"}]); edges = new vis.DataSet([{"arrows": "to", "from": "Introduction to R", "to": "Introduction to Regression with R", "width": 1}, {"arrows": "to", "from": "Introduction to R", "to": "Working With Data In R", "width": 1}, {"arrows": "to", "from": "Introduction to R", "to": "Improve Your R Code", "width": 1}, {"arrows": "to", "from": "Introduction to R", "to": "Introduction to Markdown in R", "width": 1}, {"arrows": "to", "from": "Introduction to Regression with R", "to": "Regression Analysis with R Adapting to Varied Data Types", "width": 1}, {"arrows": "to", "from": "Regression Analysis with R Adapting to Varied Data Types", "to": "Advanced Regression Analysis With R ", "width": 1}, {"arrows": "to", "from": "Introduction to Version Control with Git and GitHub", "to": "Intermediate Version Control", "width": 1}, {"arrows": "to", "from": "Introduction to Unix", "to": "Introduction to Version Control with Git and GitHub", "width": 1}]); nodeColors = {}; diff --git a/_toc.yml b/_toc.yml index 
2c759b51..10229002 100644
--- a/_toc.yml
+++ b/_toc.yml
@@ -96,6 +96,9 @@ parts:
     sections:
     - file: where_is_my_understanding/intro_to_version_control
     - file: where_is_my_understanding/intermediate_version_control
+    - file: where_is_my_understanding/GPUs
+      sections:
+      - file: where_is_my_understanding/introduction_to_GPUs
     - file: pathways/related_courses
       sections:
       - file: pathways/hpc
@@ -328,6 +331,17 @@ parts:
       - file: individual_modules/intermediate_version_control/rewriting_history
       - file: individual_modules/intermediate_version_control/further_topics
       - file: individual_modules/intermediate_version_control/resources
+    - file: course_homepages/GPUs
+      sections:
+      - file: individual_modules/section_landing_pages/introduction_to_GPUs
+        sections:
+        - file: individual_modules/intro_to_GPUs/setup
+        - file: individual_modules/intro_to_GPUs/theory
+        - file: individual_modules/intro_to_GPUs/spack
+        - file: individual_modules/intro_to_GPUs/slurm
+        - file: individual_modules/intro_to_GPUs/conways_game_of_life
+        - file: individual_modules/intro_to_GPUs/profiling
+        - file: individual_modules/intro_to_GPUs/temperature_diffusion
     - file: individual_modules/bibliography
 - caption: Short Courses
   chapters:
diff --git a/course_homepages/GPUs.ipynb b/course_homepages/GPUs.ipynb
new file mode 100644
index 00000000..c7710c92
--- /dev/null
+++ b/course_homepages/GPUs.ipynb
@@ -0,0 +1,41 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4cd0d89f-b3b7-4360-8ae3-f47c15958c8e",
+   "metadata": {},
+   "source": [
+    "# GPUs\n",
+    "\n",
+    "The following set of courses discusses GPUs. The details for each course can be found on its respective page.\n",
+    "\n",
+    "## Introduction to GPUs Self Study Notes\n",
+    "\n",
+    "[Clickable Link to Self Study Notes](../individual_modules/section_landing_pages/introduction_to_GPUs.md)\n",
+    "\n",
+    "This course provides all the essential tools to leverage GPUs effectively, guiding participants through GPU programming, software management, and performance optimisation while offering direct comparisons with CPU implementations. Through hands-on examples such as Conway's Game of Life and temperature diffusion, learners explore practical high-performance computing techniques, including profiling, Slurm job scheduling, and Spack-based environment setup. The material bridges theory and practice, equipping participants to translate computational concepts into efficient, scalable workflows on modern HPC systems."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/individual_modules/intro_to_GPUs/conways_game_of_life.ipynb b/individual_modules/intro_to_GPUs/conways_game_of_life.ipynb
new file mode 100644
index 00000000..71b3b102
--- /dev/null
+++ b/individual_modules/intro_to_GPUs/conways_game_of_life.ipynb
@@ -0,0 +1,531 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1461acc1-00d3-4cf0-8a2a-5f201a7840b1",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "# Project: Conway's Game of Life - CPU vs GPU Implementation\n",
+    "\n",
+    "## Resource Files\n",
+    "\n",
+    "The job submission scripts specifically configured for use on the University of Exeter ISCA HPC system are available [here](../intro_to_GPUs/zip_files/exeter_isca_slurm_submission_scripts.zip).\n",
+    "\n",
+    "General-purpose job submission scripts, which can serve as a starting point for use on other HPC systems (with minor modifications required for this course), are available [here](../intro_to_GPUs/zip_files/slurm_submission_scripts.zip).\n",
+    "\n",
+    "The Python scripts used in this course can be downloaded [here](../intro_to_GPUs/zip_files/scripts.zip).\n",
+    "\n",
+    "All supplementary files required for the course are available [here](../intro_to_GPUs/zip_files/files.zip).\n",
+    "\n",
+    "The presentation slides for this course can be accessed [here](../intro_to_GPUs/slides/GPU_Training_Day_Slides.pptx).\n",
+    "\n",
+    "## Overview\n",
+    "In this project, we are going to see implementations of **Conway's Game of Life**, a classic cellular automaton, in three ways: a pure Python approach (run on the CPU), a vectorised approach using NumPy (also run on the CPU), and a CuPy approach (run on the GPU). We'll also visualise the evolution of the Game of Life grid to see the computation in action.\n",
+    "\n",
+    "## What is Conway's Game of Life?\n",
+    "\n",
+    "It's a zero-player game devised by John Conway, where you have a grid of cells that live or die based on a few simple rules:\n",
+    "- Each cell can be \"alive\" (1) or \"dead\" (0).\n",
+    "- At each time step (generation), the following rules apply to every cell simultaneously:\n",
+    "  - Any live cell with fewer than 2 live neighbours dies (underpopulation).\n",
+    "  - Any live cell with 2 or 3 live neighbours lives on to the next generation (survival).\n",
+    "  - Any live cell with more than 3 live neighbours dies (overpopulation).\n",
+    "  - Any dead cell with exactly 3 live neighbours becomes a live cell (reproduction).\n",
+    "- Neighbours are the 8 cells touching a given cell horizontally, vertically, or diagonally.\n",
+    "- From these simple rules, a lot of interesting behaviour emerges – stable patterns, oscillators, spaceships (patterns that move), and so on. It's a good example of a grid-based simulation that can benefit from parallel computation, because the state of each cell in the next generation can be computed independently (based on the current generation).\n",
+    "\n",
+    "## Visualisation of Game of Life\n",
+    "\n",
+    "To make this project more visually engaging, below is an **animated GIF** showing an example of a Game of Life simulation starting from a random initial configuration. White pixels represent live cells, and black pixels represent dead cells. You can see patterns forming, moving, and changing over time. The animation shows 50 timesteps on a 100x100 grid.\n",
+    "\n",
+    "![Conway's Game of Life](files/game_of_life_figures/game_of_life_example.gif)\n",
+    "\n",
+    "*An example evolution of Conway's Game of Life over a few generations (white = alive, black = dead).* The animation demonstrates how random initial clusters of cells can evolve into interesting patterns. Notice that some cells blink on and off while others form moving patterns.\n",
+    "\n",
+    "## Implementations\n",
+    "\n",
+    "All three implementations (pure Python, NumPy, and CuPy) are contained in `game_of_life.py`, inside the `scripts` folder that can be downloaded at the top of this page.\n",
+    "\n",
+    "To run the different versions of the code, you can use:\n",
+    "\n",
+    "**Naïve Python Version**\n",
+    "\n",
+    "```bash\n",
+    "python game_of_life.py run_life_naive --size 100 --timesteps 50\n",
+    "```\n",
+    "\n",
+    "which will produce a file called `game_of_life_naive.gif`.\n",
+    "\n",
+    "**CPU-Vectorised Version**\n",
+    "\n",
+    "```bash\n",
+    "python game_of_life.py run_life_numpy --size 100 --timesteps 50\n",
+    "```\n",
+    "\n",
+    "which will produce a file called `game_of_life_cpu.gif`.\n",
+    "\n",
+    "**GPU-Accelerated Version**\n",
+    "\n",
+    "```bash\n",
+    "python game_of_life.py run_life_cupy --size 100 --timesteps 50\n",
+    "```\n",
+    "\n",
+    "which will produce a file called `game_of_life_gpu.gif`.\n",
+    "\n",
+    "## Naive Implementation\n",
+    "\n",
+    "The core computation performed by the naive implementation is:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "def life_step_naive(grid: np.ndarray) -> np.ndarray:\n",
+    "    N, M = grid.shape\n",
+    "    new = np.zeros((N, M), dtype=int)\n",
+    "    for i in range(N):\n",
+    "        for j in range(M):\n",
+    "            cnt = 0\n",
+    "            # Count the eight neighbours, wrapping around the edges.\n",
+    "            for di in (-1, 0, 1):\n",
+    "                for dj in (-1, 0, 1):\n",
+    "                    if di == 0 and dj == 0:\n",
+    "                        continue\n",
+    "                    ni, nj = (i + di) % N, (j + dj) % M\n",
+    "                    cnt += grid[ni, nj]\n",
+    "            if grid[i, j] == 1:\n",
+    "                new[i, j] = 1 if (cnt == 2 or cnt == 3) else 0\n",
+    "            else:\n",
+    "                new[i, j] = 1 if (cnt == 3) else 0\n",
+    "    return new\n",
+    "\n",
+    "def simulate_life_naive(N: int, timesteps: int, p_alive: float = 0.2):\n",
+    "    grid = np.random.choice([0, 1], size=(N, N), p=[1-p_alive, p_alive])\n",
+    "    history = []\n",
+    "    for _ in range(timesteps):\n",
+    "        history.append(grid.copy())\n",
+    "        grid = life_step_naive(grid)\n",
+    "    return history\n",
+    "```\n",
+    "### Explanation\n",
+    "\n",
+    "There are a number of reasons that the naive implementation runs slowly, including:\n",
+    "\n",
+    "- **Nested Python loops**: Instead of eight `np.roll` calls and one `np.where`, we make two loops over `i, j` (10^4 iterations for a 100x100 grid) and two more loops over `di, dj` (9 checks each), for roughly 9x10^4 Python-level operations per step.\n",
\n", + "- **Manual edge-wrapping logic**: Branching (`if ni < 0 … elif …`) for each neighbour check, instead of the single fast shift that `np.roll` does in C. \n", + "- **Per-cell rule application** The game of life rule is applied with Python `if/else` instead of the single vectorised Boolean mask. \n", + "- **Rebuilding a new NumPy array element-by-element**: writing into `new_grid[i, j]` in Python is orders of magnitude slower than one-shot `np.where`. \n", + "\n", + "Together, these overheads make this version run very slow, particularly as `N` begins to increase, and would not leverage any low-level C loops or GPU acceleration. \n", + "\n", + "## CPU-Vectorised Implementation \n", + "\n", + "```python\n", + "def life_step_numpy(grid: np.ndarray) -> np.ndarray:\n", + " neighbours = (\n", + " np.roll(np.roll(grid, 1, axis=0), 1, axis=1) +\n", + " np.roll(np.roll(grid, 1, axis=0), -1, axis=1) +\n", + " np.roll(np.roll(grid, -1, axis=0), 1, axis=1) +\n", + " np.roll(np.roll(grid, -1, axis=0), -1, axis=1) +\n", + " np.roll(grid, 1, axis=0) +\n", + " np.roll(grid, -1, axis=0) +\n", + " np.roll(grid, 1, axis=1) +\n", + " np.roll(grid, -1, axis=1)\n", + " )\n", + " return np.where((neighbours == 3) | ((grid == 1) & (neighbours == 2)), 1, 0)\n", + "\n", + "\n", + "def simulate_life_numpy(N: int, timesteps: int, p_alive: float = 0.2):\n", + " grid = np.random.choice([0, 1], size=(N, N), p=[1-p_alive, p_alive])\n", + " history = []\n", + " for _ in range(timesteps):\n", + " history.append(grid.copy())\n", + " grid = life_step_numpy(grid)\n", + " return history\n", + "```\n", + "\n", + "### Explanation\n", + "\n", + "#### From Per-Cell Loops to Whole-Array Operations \n", + "\n", + "In the **naive** version, every one of the NxN cells in Python was traversed within two nested loops; then, for each cell, two more loops over the offsets `di` and `dj` counted its eight neighbours by computing. `(i + di) % N` and `(j + dj) % M` in pure Python. \n", + "**Cost**: ~9·N² Python-level iterations per generation, including branching and modulo arithmetic.\n", + "**Drawback** Thousands of interpreter calls and non-contiguous memory access. \n", + "In the **NumPy** version, no Python loops over individual cells occur. Instead, eight calls to `np.roll` shift the entire grid array (up, down, left, right and on diagonals), automatically handling wrap-around in one C-level operation. Summing those eight arrays gives a full neighbour count in a single, optimised pass. \n", + "\n", + "#### Manual `if/else` vs Vectorised Mask \n", + "\n", + "In the **naive** implementation, after counting neighbours, each cell's fate is determined with a Python `if grid[i,j] == 1: ... else: ...` and assigned via `new[i,j] = ...`. \n", + "In the **NumPy** implementation a single expression of `(neighbours == 3) | ((grid == 1) & (neighbours == 2))` produces an NxN Boolean mask of *cells alive next*. Converting that mask to integers with `np.where(mask, 1, 0)` builds the entire next-generation grid in one C-level operation, resulting in no per-element Python overhead. \n", + "\n", + "#### Automatic Wrap-Around vs Manual Modulo Logic\n", + "\n", + "In the **naive** version, every neighbour checks does: \n", + "\n", + "```python \n", + "ni = (i + di) % N\n", + "nj = (j + dj) % M\n", + "```\n", + "\n", + "with Python-level branching and modulo arithmetic on each of the 9 checks per cell. The associated **cost** is thousands of modulo (`%`) operations and branch instructions per generation. 
\n", + "\n", + "In the **NumPy** version, a single call to \n", + "\n", + "```python\n", + "np.roll(grid, shift, axis=)\n", + "```\n", + "\n", + "automatically wraps the entire array in one C-level operation. The **benefit** is that all per-cell `%` operations and branching are eliminated, being replaced by a single optimised memory shift over the whole grid. \n", + "\n", + "## GPU-Accelerated Implementation \n", + "\n", + "```python\n", + "def life_step_gpu(grid: cp.ndarray) -> cp.ndarray:\n", + " neighbours = (\n", + " cp.roll(cp.roll(grid, 1, axis=0), 1, axis=1) +\n", + " cp.roll(cp.roll(grid, 1, axis=0), -1, axis=1) +\n", + " cp.roll(cp.roll(grid, -1, axis=0), 1, axis=1) +\n", + " cp.roll(cp.roll(grid, -1, axis=0), -1, axis=1) +\n", + " cp.roll(grid, 1, axis=0) +\n", + " cp.roll(grid, -1, axis=0) +\n", + " cp.roll(grid, 1, axis=1) +\n", + " cp.roll(grid, -1, axis=1)\n", + " )\n", + " return cp.where((neighbours == 3) | ((grid == 1) & (neighbours == 2)), 1, 0)\n", + "\n", + "\n", + "def simulate_life_cupy(N: int, timesteps: int, p_alive: float = 0.2):\n", + " grid_gpu = (cp.random.random((N, N)) < p_alive).astype(cp.int32)\n", + " history = []\n", + " for _ in range(timesteps):\n", + " history.append(cp.asnumpy(grid_gpu))\n", + " grid_gpu = life_step_gpu(grid_gpu)\n", + " return history\n", + "\n", + "```\n", + "\n", + "### CuPy vs NumPy: What's Changed.\n", + "\n", + "The power of **CuPy** lies in its near drop-in compatibility with **NumPy**: arrays live on the GPU, and computations run in parallel on the Device, yet the code looks almost identical. \n", + "\n", + "#### Imports \n", + "\n", + "The first change you will need to make is to use CuPy rather than NumPy. \n", + "**NumPy**: \n", + "```Python \n", + "import numpy as np\n", + "```\n", + "\n", + "**CuPy**: \n", + "```Python \n", + "import cupy as cp\n", + "```\n", + "\n", + "#### Random initialisation \n", + "\n", + "**NumPy**: \n", + "```Python \n", + "grid = np.random.choice([0,1], size=(N,N), p=[1-p, p])\n", + "```\n", + "\n", + "**CuPy**: \n", + "```Python \n", + "grid_gpu = (cp.random.random((N,N)) < p_alive).astype(cp.int32)\n", + "```\n", + "\n", + "#### Data Transfer\n", + "\n", + "**CuPy**: \n", + "\n", + "```Python \n", + "cp.asnumpy(grid_gpu) # bring a CuPy array back to NumPy\n", + "```\n", + "\n", + "### Which to use?\n", + "\n", + "**Large grids (e.g. N ≥ 500) or many timesteps**: GPU's parallel throughput outweighs kernel-launch and transfer overhead.\n", + "**Small grids (e.g. 10×10)**: GPU overhead may dominate, so you may want to stick with NumPy.\n", + "\n", + "### Why is this quicker?\n", + "\n", + "When a computation can be expressed as the same operation applied independently across many data elements, like counting neighbours on every cell of a large Game of Life grid, GPUs often deliver dramatic speedups compared to CPUs. 
+    "When a computation can be expressed as the same operation applied independently across many data elements, like counting neighbours on every cell of a large Game of Life grid, GPUs often deliver dramatic speedups compared to CPUs. This advantage stems from several architectural and compiler-related factors that we discussed earlier in the section on theory, including:\n",
+    "\n",
+    "- **Massive Data Parallelism**\n",
+    "  - **CPU**: A few (4–16) powerful cores optimised for sequential tasks and complex control flow.\n",
+    "  - **GPU**: Hundreds to thousands of simpler cores running in lock-step.\n",
+    "  - **Result**: A Game of Life update, which is identical work on each of N² cells, can be dispatched as thousands of parallel GPU threads, sweeping through the grid in a single kernel launch instead of looping in software.\n",
+    "- **Throughput-Oriented Design**\n",
+    "  - **CPU cores** focus on single-thread performance (high clock speeds, deep pipelines, branch prediction).\n",
+    "  - **GPU cores** sacrifice single-thread speed in favour of raw arithmetic throughput and memory bandwidth across many threads.\n",
+    "  - **Result**: When you need to process millions of cell updates, the GPU's aggregate arithmetic and memory bandwidth far outstrips what a CPU can deliver.\n",
+    "- **Specialised Memory Hierarchy**\n",
+    "  - **CPU**: Large multi-level caches and direct access to system RAM.\n",
+    "  - **GPU**: High-bandwidth device memory (VRAM), with its own caches and shared memory for thread blocks.\n",
+    "  - **Result**: Once the grid is transferred to GPU memory, all subsequent neighbour-count rolls and mask evaluations occur on-device, benefiting from coalesced global reads and fast on-chip scratchpads.\n",
+    "- **Compiled GPU Kernels vs. Interpreted Loops**\n",
+    "  - **CPU code** that uses Python loops incurs per-iteration interpreter overhead, and even NumPy's C loops run on a single core.\n",
+    "  - **GPU kernels**, compiled ahead of time by NVCC or generated at runtime, execute the same inner logic entirely in device code without returning to Python between elements.\n",
+    "  - **Result**: You replace thousands of Python bytecode dispatches, or even C-loop iterations, with just a few kernel launches and a handful of device-resident function calls.\n",
+    "\n",
+    "**In summary**, for problems like Conway's Game of Life, where the same simple computation is applied independently across a large array, GPUs excel by running thousands of data-parallel threads in hardware, backed by specialised memory systems and aggressive compiler optimisations. Offloading to the GPU transforms an O(N²) loop of Python or C iterations into just a handful of highly parallel kernel launches, yielding orders-of-magnitude speedups on sufficiently large grids.\n",
+    "\n",
+    "### How much quicker?\n",
+    "\n",
+    "Each implementation exhibits a different overall runtime, as you have probably noticed when running them from the command line. We can use the built-in UNIX command-line tool `time` to measure how long the code takes to run, e.g. `time python game_of_life.py run_life_numpy --size 100 --timesteps 50`. The `time` command is a simple profiler that reports three primary metrics:\n",
+    "\n",
+    "- **real**: The \"wall-clock\" time elapsed from start to finish (i.e. actual elapsed time).\n",
+    "- **user**: CPU time spent in user mode (your program's own computations).\n",
+    "- **sys**: CPU time spent in kernel mode (system calls on behalf of your program)."
+   ]
+  },
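+  {
+   "cell_type": "markdown",
+   "id": "in-process-timing-sketch",
+   "metadata": {},
+   "source": [
+    "Note that `time` measures the whole process, including interpreter start-up and GIF writing. To time just the simulation loops from inside Python, a minimal sketch along the following lines can be used. It assumes the `simulate_life_*` functions shown above can be imported from `game_of_life.py` (in the downloaded script they may sit behind a command-line entry point), and it synchronises the GPU before stopping the clock, since CuPy kernels launch asynchronously:\n",
+    "\n",
+    "```python\n",
+    "import time\n",
+    "\n",
+    "import cupy as cp\n",
+    "\n",
+    "from game_of_life import simulate_life_naive, simulate_life_numpy, simulate_life_cupy\n",
+    "\n",
+    "def wall_time(fn, *args):\n",
+    "    # Wall-clock seconds for one call, waiting for any pending GPU work to finish.\n",
+    "    start = time.perf_counter()\n",
+    "    fn(*args)\n",
+    "    cp.cuda.Device().synchronize()\n",
+    "    return time.perf_counter() - start\n",
+    "\n",
+    "N, timesteps = 100, 50\n",
+    "for name, fn in [('naive', simulate_life_naive),\n",
+    "                 ('numpy', simulate_life_numpy),\n",
+    "                 ('cupy', simulate_life_cupy)]:\n",
+    "    print(name, wall_time(fn, N, timesteps), 'seconds')\n",
+    "```"
+   ]
+  },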
\n", + "\n", + " \n", + "
\n", + " \"AMD\n", + "
\n", + "

AMD 64-Core + NV A100

\n", + "

\n", + " CuPy pays a ~2.7 s launch cost but then stays nearly flat up to 2.5 k cells, after which it grows to ~42.9 s at 40 k. \n", + " NumPy starts faster on very small grids (∼2.1 s at 250) but surpasses CuPy by 500–1 k cells and balloons to ~239 s at 10 k. \n", + " The naive triple loop is usable only below 250×250—beyond that you hit >130 s at 500 and ~560 s at 1 k.\n", + "

\n", + "

\n", + " Takeaway: On the A100, CuPy becomes the clear winner past ~600 cells per side. NumPy is fine for a few hundred but uncompetitive beyond 1 k, and pure Python loops become unusable quickly.\n", + "

\n", + "
\n", + "
\n", + "\n", + " \n", + "
\n", + " \"AMD\n", + "
\n", + "

AMD 96-Core + NV H100

\n", + "

\n", + " CuPy’s overhead shrinks slightly (~2.5 s) and its growth to ~26.1 s at 40 k is gentler than on the A100. \n", + " NumPy remains sub-3 s until 1 k but then rises to ~222 s at 10 k, faster than the 64-core host only for the very largest sizes. \n", + " Naive loops on this 96-core machine still hit ~62 s at 500 and ~265 s at 1 k.\n", + "

\n", + "

\n", + " Takeaway: The H100 host’s extra cores give NumPy ~10–20% speed-up over the 64-core EPYC, but CuPy is still >8× faster than NumPy past 2 k, and >100× faster than naive loops at 1 k.\n", + "

\n", + "
\n", + "
\n", + "\n", + " \n", + "
\n", + " \"AMD\n", + "
\n", + "

AMD 16-Core + NV RTX 3070

\n", + "

\n", + " The midrange RTX 3070 shows ~3.9 s overhead up to 1 k, then climbs to ~14.2 s at 10 k—still well below NumPy’s ~331.9 s at 10 k on 16 cores. \n", + " However, because it has fewer SMs, the plateau appears earlier (around 5 k). \n", + " Naive loops cross ~70 s at 500 and ~290 s at 1 k, so even here Python loops are unusable.\n", + "

\n", + "

\n", + " Takeaway: On consumer-grade hardware, CuPy wins beyond ~400 cells per side; NumPy can handle a few thousand but then spirals into minutes, and naive loops are only for toy problems.\n", + "

\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "#### Machine Comparison on Each Framework\n", + "\n", + "
\n", + "\n", + " \n", + "
\n", + " \"CuPy\n", + "
\n", + "

CuPy Across Hardware

\n", + "

\n", + " All three GPUs show a ~2.5–4 s startup cost. \n", + " The H100 host is the fastest at scale (∼26 s @40 k), followed by the A100 (∼43 s) and the RTX 3070 (∼14 s @10 k). \n", + " Their scaling is roughly quadratic in grid side length, but the higher-end cards pull away as problem size grows.\n", + "

\n", + "

\n", + " Insight: If your grid exceeds ~1 k per side, even a midrange 3070 will beat any CPU—just be mindful of that initial launch overhead.\n", + "

\n", + "
\n", + "
\n", + "\n", + " \n", + "
\n", + " \"NumPy\n", + "
\n", + "

NumPy Across Hardware

\n", + "

\n", + " Vectorised CPU performance scales with core count: the 96-core H100 host does ~2.9 s @1 k and ~222 s @10 k, \n", + " the 64-core EPYC hits ~3.9 s @1 k and ~239 s @10 k, \n", + " and the 16-core Ryzen only ~5.2 s @1 k but then ~332 s @10 k.\n", + "

\n", + "

\n", + " Insight: Doubling cores cuts runtime roughly in half, but you never beat the GPU’s parallelism for large areas.\n", + "

\n", + "
\n", + "
\n", + "\n", + " \n", + "
\n", + " \"Naive\n", + "
\n", + "

Naive Across Hardware

\n", + "

\n", + " Pure Python loops are orders of magnitude slower across the board: \n", + " even on the H100 host the run times are ~62 s @500 and ~265 s @1 k. \n", + " On the 3070 machine it’s ~70 s @500, and on the A100 host ~132 s @500.\n", + "

\n", + "

\n", + " Insight: Triple-nested loops simply don’t scale. They’re only viable for extremely small toy problems (<250×250).\n", + "

\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "\n", + "#### Inter-Machine-Framework Comparison\n", + "\n", + "
\n", + " \"All\n", + "\n", + "
\n", + "

All Methods & Hardware

\n", + "

\n", + " Across all three machines, CuPy becomes faster than NumPy at roughly 500–1 k grid size, \n", + " while naive loops are unusable past ~250. \n", + " GPU dispatch overhead (~3 s) means tiny grids still favor NumPy or CPU, \n", + " but anything beyond a few thousand cells per side is best done with CuPy on a modern accelerator.\n", + "

\n", + "
\n", + "
\n", + "````" + ] + }, + { + "cell_type": "markdown", + "id": "a147f851-ff6c-4a77-bc4e-338bc6de3d01", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Exercise: Generate and Visualise Game-of-Life Performance Data\n", + "\n", + "In this exercise, you will:\n", + "\n", + "1. **Run** a SLURM job to generate timing data for Conway’s Game of Life on your platform. \n", + "2. **Modify or extend** the provided plotting script to visualize that data. \n", + "3. **Interpret** and discuss the performance trends you observe.\n", + "\n", + "> **Note:** This is an open-ended assignment—feel free to experiment with new plot styles, additional metrics, or entirely custom analyses.\n", + "\n", + "#### Generate Timing Data\n", + "- Open the `game_of_life.slurm` slurm script.\n", + "- Edit the script to match your cluster’s configuration (e.g. partition, module loads). \n", + "- Submit the job:\n", + "\n", + "```bash\n", + "sbatch game_of_life.slurm\n", + "```\n", + "\n", + "After completion, you’ll find CSV files in data/ named like:\n", + "\n", + "```bash\n", + "gol_timings___ts100.csv\n", + "```\n", + "\n", + "#### Review and Adapt the Starter Code\n", + "The file `content/game_of_life_create_plots.py` contains:\n", + "- CSV loading logic (from `data/`)\n", + "- Function for:\n", + " - Per-machine “Framework vs Grid Size” plots\n", + " - Per-framework “Machine Comparison” plots \n", + " - Combined “All Methods & Hardware” overlay\n", + "\n", + "You may wish to use them as they are, or you could:\n", + "- Modify these functions (styles, scales, annotations).\n", + "- Rewrite your own plotting script or Jupyter notebook.\n", + "\n", + "#### Visualisations\n", + "Visualisations you could create include: \n", + "- **Framework Comparison on Each Machine**: One figure per machine (e.g. A100, H100, RTX 3070) showing NumPy, Naive, CuPy timings vs. grid size.\n", + "- **Machine Comparison on Each Framework**: One figure per framework, overlaying the three machines.\n", + "- **All Methods & Hardware**: A combined overlay with all nine curves in a single plot.\n", + "- **Any other plot you can think of!**\n", + "\n", + "#### Potential Enhancements\n", + "To extend the work you are done, you could:\n", + "- Calculate Speedup ratios (e.g. CPU_time / GPU_time).\n", + "- Efficiency metrics such as time per cell or memory throughput.\n", + "- Annotations marking crossover points or “break-even” grid sizes.\n", + "- Interactive or animated plots (e.g. showing performance evolution as grid size increases).\n", + "\n", + "\n", + "### Finer Grained Timing Information\n", + "Our next step is to quantify those differences by measuring exactly how long each stage takes (pure computation, data transfers, grid initialisation, etc.) and to pinpoint where the bulk of the time is spent. The following section will address these questions by introducing profiling techniques." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98d6747f-c236-4504-985e-7b4d233a54f1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/cuda_check.slurm b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/cuda_check.slurm
new file mode 100644
index 00000000..0225ab1e
--- /dev/null
+++ b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/cuda_check.slurm
@@ -0,0 +1,56 @@
+#!/bin/bash
+#
+# Example SLURM batch script for running a CUDA-based check via Poetry in a Spack environment
+#
+
+#SBATCH --job-name=cuda_check          # A short, descriptive name for your job
+#SBATCH --output=%x-%j.out.log         # STDOUT -> <jobname>-<jobid>.out.log
+#SBATCH --error=%x-%j.err.log          # STDERR -> <jobname>-<jobid>.err.log
+#SBATCH --partition=gpu                # Partition/queue name (e.g., GPU partition)
+#SBATCH --nodes=1                      # Number of nodes to allocate
+#SBATCH --ntasks=4                     # Total number of MPI tasks (not used here, but SLURM requires it)
+#SBATCH --cpus-per-task=1              # CPU cores per task (for any threading or OpenMP)
+#SBATCH --gres=gpu:1                   # Number of GPUs to reserve
+#SBATCH -A Research_Project-RSATeam    # Research Project
+#SBATCH --time=00:10:00                # Designated Wall Time
+
+
+####-------------------------------------------------------------------####
+#### 1) Load Required Modules (Without Spack)
+####-------------------------------------------------------------------####
+module load nvidia-cuda/12.1.1
+module load Python/3.11.3
+
+####-------------------------------------------------------------------####
+#### 2) Poetry virtualenv & project setup
+####-------------------------------------------------------------------####
+echo "===== Poetry setup ====="
+# Ensure your local bin (where Poetry is installed) is on PATH
+export PATH="$HOME/.local/bin:$PATH"
+
+# Show which Poetry executable we're invoking (sanity check)
+echo "Using poetry from: $(command -v poetry || echo '<not found>')"
+
+# Exit if Poetry isn't available
+if ! command -v poetry &> /dev/null; then
+  echo "ERROR: poetry not found in ~/.local/bin; please verify installation"
+  exit 1
+fi
+
+# Install dependencies without interactive prompts or ANSI color codes
+poetry install --no-interaction --no-ansi
+
+# Print the path to the active virtualenv for confirmation
+echo "Poetry venv: $(poetry env info --path)"
+echo
+
+####-------------------------------------------------------------------####
+#### 3) Run the CUDA check
+####-------------------------------------------------------------------####
+echo "===== Running CUDA check ====="
+# This should print how many CUDA devices CuPy sees
+poetry run cuda_check
+
+# Capture and echo the exit code for easy debugging
+echo "cuda_check exit code: $?"
+echo
diff --git a/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/game_of_life.slurm b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/game_of_life.slurm
new file mode 100644
index 00000000..c46e4aae
--- /dev/null
+++ b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/game_of_life.slurm
@@ -0,0 +1,93 @@
+#!/bin/bash
+#
+# SLURM batch script to run the Game of Life experiment end-to-end
+#
+# Usage:
+#   sbatch game_of_life.slurm
+#
+# What it does:
+#   1) Loads the required CUDA and Python modules
+#   2) Installs Python deps via Poetry
+#   3) Verifies GPU access with a quick cuda_check + nvidia-smi dump
+#   4) Executes the Game of Life benchmark script
+#
+# Outputs:
+#   - STDOUT → game_of_life-.out.log
+#   - STDERR → game_of_life-.err.log
+#
+
+#SBATCH --job-name=game_of_life  # Job name shown in squeue
+#SBATCH --output=%x-%j.out.log   # STDOUT → -.out.log
+#SBATCH --error=%x-%j.err.log    # STDERR → -.err.log
+#SBATCH --partition=gpu          # GPU-enabled compute partition
+#SBATCH --nodes=1                # Number of nodes
+#SBATCH --ntasks=16              # Total MPI tasks (unused here but required)
+#SBATCH --cpus-per-task=1        # Threads per task (for Python/IO)
+#SBATCH --gres=gpu:1             # Number of GPUs to reserve
+#SBATCH -A Research_Project-RSATeam  # Research Project
+#SBATCH --time=0:10:00           # Designated Wall Time
+
+####-------------------------------------------------------------------####
+#### 1) Load Required Modules (Without Spack)
+####-------------------------------------------------------------------####
+module load nvidia-cuda/12.1.1
+module load Python/3.11.3
+
+####-------------------------------------------------------------------####
+#### 2) Poetry venv & dependency installation
+####-------------------------------------------------------------------####
+echo "===== Poetry setup ====="
+# Ensure Poetry (often installed via pipx) is on your PATH
+export PATH="$HOME/.local/bin:$PATH"
+
+# Show which Poetry binary will run
+echo "Using poetry from: $(command -v poetry || echo '')"
+
+# Fail early if Poetry is unavailable
+if ! command -v poetry &> /dev/null; then
+  echo "ERROR: Poetry not found; please install via pipx or official installer"
+  exit 1
+fi
+
+# Move into your project directory (adjust SCRIPT_DIR as needed)
+cd "${SCRIPT_DIR:-$(pwd)}"
+
+# Install all dependencies into Poetry’s virtualenv,
+# suppressing prompts and ANSI color codes
+poetry install --no-interaction --no-ansi
+
+# Display the path to the created venv (for logging/debugging)
+echo "Poetry venv path: $(poetry env info --path)"
+echo
+
+####-------------------------------------------------------------------####
+#### 3) Quick CUDA sanity checks
+####-------------------------------------------------------------------####
+echo "===== Running CUDA check ====="
+# Run your cuda_check script (should print GPU count)
+poetry run cuda_check
+echo "cuda_check exit code: $?"
+echo + +echo "===== GPU details via nvidia-smi =====" +# Print GPU name, driver, memory, utilization +if command -v nvidia-smi &> /dev/null; then + nvidia-smi \ + --query-gpu=index,name,driver_version,memory.total,memory.used,utilization.gpu \ + --format=csv +else + echo "nvidia-smi not found in PATH" +fi +echo + +####-------------------------------------------------------------------#### +#### 4) Run the Game of Life experiment +####-------------------------------------------------------------------#### +echo "===== Running Game of Life Experiment =====" +# Log current directory for traceability +pwd + +# Execute the benchmark script under Poetry’s venv +poetry run python ../content/game_of_life_experiment.py + +echo "===== Experiment complete =====" diff --git a/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/game_of_life_nvidia_nsight_profiling.slurm b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/game_of_life_nvidia_nsight_profiling.slurm new file mode 100644 index 00000000..47e1146b --- /dev/null +++ b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/game_of_life_nvidia_nsight_profiling.slurm @@ -0,0 +1,81 @@ +#!/bin/bash +# +# SLURM batch script to run the Game of Life experiment under NVIDIA Nsight Systems +# + +#SBATCH --job-name=game_of_life_NSight # Job name for easy identification +#SBATCH --output=%x-%j.out.log # STDOUT to -.out.log +#SBATCH --error=%x-%j.err.log # STDERR to -.err.log +#SBATCH --partition=gpu # GPU partition/queue +#SBATCH --nodes=1 # Number of nodes to allocate +#SBATCH --ntasks=16 # Number of tasks (MPI ranks, not used here) +#SBATCH --cpus-per-task=1 # CPU cores per task (for threading) +#SBATCH --gres=gpu:1 # Number of GPUs to reserve +#SBATCH -A Research_Project-RSATeam # Research Project +#SBATCH --time=0:10:00 # Designated Wall Time + +####-------------------------------------------------------------------#### +#### 1) Load Required Modules (Without Spack) +####-------------------------------------------------------------------#### +module load nvidia-cuda/12.1.1 +module load Python/3.11.3 + +####-------------------------------------------------------------------#### +#### 2) Poetry virtualenv & install dependencies +####-------------------------------------------------------------------#### +echo "===== Poetry setup =====" +# Ensure Poetry is on the PATH (user-local install) +export PATH="$HOME/.local/bin:$PATH" + +# Sanity check: which Poetry will run? +echo "Using poetry from: $(command -v poetry || echo '')" + +# Exit early if Poetry is missing +if ! command -v poetry &> /dev/null; then + echo "ERROR: poetry not found; please install via pipx or official installer" + exit 1 +fi + +# Install project dependencies into the Poetry venv (non-interactive) +poetry install --no-interaction --no-ansi +# Show which virtualenv Poetry created/activated +echo "Poetry venv: $(poetry env info --path)" +echo + +####-------------------------------------------------------------------#### +#### 3) Quick CUDA sanity checks +####-------------------------------------------------------------------#### +echo "===== Running CUDA check =====" +# Run your earlier cuda_check command to verify GPU visibility +poetry run cuda_check +echo "cuda_check exit code: $?" 
+echo + +echo "===== GPU info (nvidia-smi) =====" +# Print out GPU state (driver, memory, utilization) +if command -v nvidia-smi &> /dev/null; then + nvidia-smi --query-gpu=index,name,driver_version,memory.total,memory.used,utilization.gpu \ + --format=csv +else + echo "nvidia-smi not found in PATH" +fi +echo + +####-------------------------------------------------------------------#### +#### 4) Profile the Game of Life experiment with Nsight Systems +####-------------------------------------------------------------------#### +echo "===== Running Game of Life Experiment under Nsight Systems =====" +# Print current directory for debugging +pwd + +# Launch Nsight Systems profiler: +# - sample=none : only collect traces, no statistical sampling +# - trace=cuda,nvtx : capture CUDA API calls and NVTX ranges +# - -o : base name for the .qdrep/.nsys-rep files +# +# We wrap the profiled command with `poetry run` to use the project venv. +echo "../output/NVIDIA_NSight_exp_report_${SLURM_JOB_ID}" +nsys profile --sample=none --trace=cuda,nvtx -o "../output/NVIDIA_NSight_exp_report_${SLURM_JOB_ID}" poetry run game_of_life_experiment_profiled --profile-gpu --profile-cpu + + +echo "=== Nsight Systems profiling complete ===" diff --git a/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/system_info.slurm b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/system_info.slurm new file mode 100644 index 00000000..c3f09037 --- /dev/null +++ b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/system_info.slurm @@ -0,0 +1,63 @@ +#!/bin/bash +# +# SLURM batch script to dump allocation, system, and GPU information +# +# Usage: +# sbatch info.sh +# Outputs: +# - STDOUT → info-.out.log +# - STDERR → info-.err.log +# + +#SBATCH --job-name=info # Descriptive name for this job +#SBATCH --output=%x-%j.out.log # STDOUT → -.out.log +#SBATCH --error=%x-%j.err.log # STDERR → -.err.log +#SBATCH --partition=gpu # Partition/queue to submit to +#SBATCH --nodes=1 # Number of nodes requested +#SBATCH --ntasks=4 # Number of tasks (MPI ranks) +#SBATCH --cpus-per-task=1 # CPU cores per task +#SBATCH --gres=gpu:1 # Number of GPUs to reserve +#SBATCH -A Research_Project-RSATeam # Research Project +#SBATCH --time=00:10:00 # Designated Wall Time + +####-------------------------------------------------------------------#### +#### 1) Print SLURM job allocation details +####-------------------------------------------------------------------#### +echo "===== Job allocation info =====" +# Show key SLURM environment variables (or placeholders if unset) +echo "Job ID: ${SLURM_JOB_ID:-}" +echo "Job name: ${SLURM_JOB_NAME:-}" +echo "Partition: ${SLURM_JOB_PARTITION:-}" +echo "Nodes: ${SLURM_JOB_NUM_NODES:-}" +echo "Tasks: ${SLURM_NTASKS:-}" +echo "CPUs/task: ${SLURM_CPUS_PER_TASK:-}" +echo "Nodelist: ${SLURM_JOB_NODELIST:-}" +echo "Submit dir: ${SLURM_SUBMIT_DIR:-$(pwd)}" # Directory where sbatch was invoked +echo "Current host: $(hostname)" # Node on which this script runs +echo "Script directory:${SCRIPT_DIR:-}" # Custom env var if you set SCRIPT_DIR +echo + +####-------------------------------------------------------------------#### +#### 2) CPU topology and memory usage +####-------------------------------------------------------------------#### +echo "===== CPU topology & memory =====" +# Print detailed CPU info and topology +lscpu +echo "----- Free memory -----" +# Show available and used RAM +free -h +echo + 
+####-------------------------------------------------------------------#### +#### 3) GPU information +####-------------------------------------------------------------------#### +echo "===== GPU info (nvidia-smi) =====" +# If nvidia-smi is available, query key GPU metrics +if command -v nvidia-smi &> /dev/null; then + nvidia-smi \ + --query-gpu=index,name,driver_version,memory.total,memory.used,utilization.gpu \ + --format=csv +else + echo "nvidia-smi not found in PATH" +fi +echo diff --git a/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/temperature_diffusion.slurm b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/temperature_diffusion.slurm new file mode 100644 index 00000000..0c5bb0db --- /dev/null +++ b/individual_modules/intro_to_GPUs/exeter_isca_slurm_submission_scripts/temperature_diffusion.slurm @@ -0,0 +1,93 @@ +#!/bin/bash +# +# SLURM batch script to run the Temperature Diffusion experiment end-to-end +# +# Usage: +# sbatch run_temperature_diffusion.sh +# +# Steps performed: +# 1) Load Spack environment (`gpu_course`) for compilers/libs +# 2) Install Python dependencies via Poetry +# 3) Verify GPU access with a quick cuda_check + nvidia-smi dump +# 4) Execute the temperature diffusion benchmark script +# +# Outputs: +# - STDOUT → temperature_diffusion-.out.log +# - STDERR → temperature_diffusion-.err.log +# + +#SBATCH --job-name=temperature_diffusion # Job name shown in scheduler +#SBATCH --output=%x-%j.out.log # STDOUT → -.out.log +#SBATCH --error=%x-%j.err.log # STDERR → -.err.log +#SBATCH --partition=gpu # GPU partition/queue +#SBATCH --nodes=1 # Number of nodes to allocate +#SBATCH --ntasks=16 # Number of tasks (unused by this script) +#SBATCH --cpus-per-task=1 # CPU cores per task (for threading/IO) +#SBATCH --gres=gpu:1 # Number of GPUs to reserve +#SBATCH -A Research_Project-RSATeam # Research Project +#SBATCH --time=0:10:00 # Designated Wall Time + +####-------------------------------------------------------------------#### +#### 1) Load Required Modules (Without Spack) +####-------------------------------------------------------------------#### +module load nvidia-cuda/12.1.1 +module load Python/3.11.3 + +####-------------------------------------------------------------------#### +#### 2) Poetry virtualenv & dependency installation +####-------------------------------------------------------------------#### +echo "===== Poetry setup =====" +# Ensure Poetry’s user‐install bin directory is on PATH +export PATH="$HOME/.local/bin:$PATH" + +# Show which Poetry executable will be used +echo "Using poetry from: $(command -v poetry || echo '')" + +# Exit if Poetry is not installed +if ! command -v poetry &> /dev/null; then + echo "ERROR: poetry not found; please install Poetry in ~/.local/bin" + exit 1 +fi + +# Change into project directory (adjust if necessary) +cd "${SCRIPT_DIR:-$(pwd)}" + +# Install project dependencies into the Poetry-managed venv, +# suppressing interactive prompts and ANSI coloring +poetry install --no-interaction --no-ansi + +# Print the path to the active virtual environment for verification +echo "Poetry venv path: $(poetry env info --path)" +echo + +####-------------------------------------------------------------------#### +#### 3) Quick CUDA sanity checks +####-------------------------------------------------------------------#### +echo "===== Running CUDA check =====" +# Verify GPU visibility/count using your cuda_check script +poetry run cuda_check +echo "cuda_check exit code: $?" 
+echo + +echo "===== GPU info (nvidia-smi) =====" +# Dump GPUs’ index, name, driver version, total/used memory, utilization +if command -v nvidia-smi &> /dev/null; then + nvidia-smi \ + --query-gpu=index,name,driver_version,memory.total,memory.used,utilization.gpu \ + --format=csv +else + echo "nvidia-smi not found in PATH" +fi +echo + +####-------------------------------------------------------------------#### +#### 4) Run the Temperature Diffusion experiment +####-------------------------------------------------------------------#### +echo "===== Running Temperature Diffusion Experiment =====" +# Print current working directory for logging/debugging purposes +pwd + +# Execute the diffusion benchmarking script under Poetry’s venv +poetry run python ../content/temperature_diffusion_experiment.py + +echo "===== Experiment complete =====" diff --git a/individual_modules/intro_to_GPUs/figures/cpu_4_regions.png b/individual_modules/intro_to_GPUs/figures/cpu_4_regions.png new file mode 100644 index 00000000..8cb170d2 Binary files /dev/null and b/individual_modules/intro_to_GPUs/figures/cpu_4_regions.png differ diff --git a/individual_modules/intro_to_GPUs/figures/cpu_architecture.png b/individual_modules/intro_to_GPUs/figures/cpu_architecture.png new file mode 100644 index 00000000..5bcd4583 Binary files /dev/null and b/individual_modules/intro_to_GPUs/figures/cpu_architecture.png differ diff --git a/individual_modules/intro_to_GPUs/figures/gpu_2048_regions.png b/individual_modules/intro_to_GPUs/figures/gpu_2048_regions.png new file mode 100644 index 00000000..fd234d25 Binary files /dev/null and b/individual_modules/intro_to_GPUs/figures/gpu_2048_regions.png differ diff --git a/individual_modules/intro_to_GPUs/figures/gpu_architecture.png b/individual_modules/intro_to_GPUs/figures/gpu_architecture.png new file mode 100644 index 00000000..81c5852b Binary files /dev/null and b/individual_modules/intro_to_GPUs/figures/gpu_architecture.png differ diff --git a/individual_modules/intro_to_GPUs/figures/gpu_for_graphics.png b/individual_modules/intro_to_GPUs/figures/gpu_for_graphics.png new file mode 100644 index 00000000..2cccd827 Binary files /dev/null and b/individual_modules/intro_to_GPUs/figures/gpu_for_graphics.png differ diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_cpu_gpu_naive_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_cpu_gpu_naive_ts100.csv new file mode 100644 index 00000000..6e01222c --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_cpu_gpu_naive_ts100.csv @@ -0,0 +1,16 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),50,100,2.063467,0.045987 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),50,100,2.673722,0.065165 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,Naive (CPU),50,100,3.174932,0.073362 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),100,100,2.087588,0.051706 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),100,100,2.667915,0.065814 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,Naive (CPU),100,100,6.653774,0.113325 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),250,100,2.225526,0.100086 
+NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),250,100,2.658933,0.054842 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,Naive (CPU),250,100,30.527410,0.202121 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),500,100,2.415106,0.033757 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),500,100,2.706837,0.069167 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,Naive (CPU),500,100,132.631681,1.444587 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),1000,100,4.010868,0.296557 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),1000,100,2.652861,0.031628 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,Naive (CPU),1000,100,559.772829,11.304633 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_cpu_gpu_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_cpu_gpu_ts100.csv new file mode 100644 index 00000000..ec2f0f0f --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_cpu_gpu_ts100.csv @@ -0,0 +1,17 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),50,100,2.091429,0.075250 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),50,100,2.630615,0.026028 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),100,100,2.072720,0.052241 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),100,100,2.688768,0.059501 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),250,100,2.175943,0.053714 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),250,100,2.644925,0.036298 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),500,100,2.450191,0.075497 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),500,100,2.827237,0.052290 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),1000,100,3.875571,0.007465 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),1000,100,2.691696,0.043639 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),2500,100,18.247786,0.085240 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),2500,100,2.759881,0.110523 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),5000,100,62.562225,0.187020 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),5000,100,3.363562,0.281929 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,NumPy (CPU),10000,100,239.376334,0.764313 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),10000,100,5.186553,0.248861 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_gpu_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_gpu_ts100.csv new file mode 100644 index 00000000..8ea46775 --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_A100-SXM4-80GB_AMD_EPYC_7V12_64-Core_Processor_gpu_ts100.csv @@ -0,0 +1,14 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),50,100,3.373583,0.826936 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy 
(GPU),100,100,2.761768,0.088103 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),250,100,2.800489,0.023046 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),500,100,2.690734,0.057994 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),1000,100,2.753769,0.078095 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),2500,100,2.790142,0.037062 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),5000,100,3.195418,0.053766 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),10000,100,5.027194,0.031042 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),15000,100,8.442919,0.263919 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),20000,100,12.644069,0.093395 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),25000,100,18.554928,0.177289 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),30000,100,25.315868,0.173294 +NVIDIA_A100-SXM4-80GB,AMD_EPYC_7V12_64-Core_Processor,CuPy (GPU),40000,100,42.860808,0.284663 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_cpu_gpu_naive_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_cpu_gpu_naive_ts100.csv new file mode 100644 index 00000000..7ba2eb3d --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_cpu_gpu_naive_ts100.csv @@ -0,0 +1,16 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),50,100,3.640638,0.036798 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),50,100,4.266703,0.177343 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,Naive (CPU),50,100,4.338512,0.111102 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),100,100,3.604704,0.063980 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),100,100,4.218450,0.092314 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,Naive (CPU),100,100,5.897992,0.069724 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),250,100,3.594188,0.045117 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),250,100,4.092836,0.101914 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,Naive (CPU),250,100,18.074257,0.265221 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),500,100,4.050222,0.079530 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),500,100,4.181186,0.128703 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,Naive (CPU),500,100,70.806250,0.382224 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),1000,100,5.090459,0.094832 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),1000,100,4.415727,0.013465 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,Naive (CPU),1000,100,290.000797,2.489353 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_cpu_gpu_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_cpu_gpu_ts100.csv new file mode 100644 index 00000000..a09d46cf --- /dev/null +++ 
b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_cpu_gpu_ts100.csv @@ -0,0 +1,17 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),50,100,3.599059,0.069716 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),50,100,4.192222,0.114750 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),100,100,3.613343,0.075662 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),100,100,4.158486,0.034935 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),250,100,3.587423,0.019483 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),250,100,4.147545,0.045834 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),500,100,3.835621,0.013189 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),500,100,4.204011,0.025557 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),1000,100,5.177155,0.038985 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),1000,100,4.339210,0.021666 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),2500,100,24.733530,0.109393 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),2500,100,5.307009,0.178610 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),5000,100,82.864845,3.439353 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),5000,100,7.100434,0.399275 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,NumPy (CPU),10000,100,331.884689,4.332655 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),10000,100,15.048609,0.088731 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_gpu_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_gpu_ts100.csv new file mode 100644 index 00000000..af3dcb09 --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_GeForce_RTX_3070_AMD_Ryzen_9_5950X_16-Core_Processor_gpu_ts100.csv @@ -0,0 +1,9 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),50,100,3.949557,0.061366 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),100,100,3.976500,0.048919 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),250,100,3.878019,0.001916 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),500,100,3.881949,0.010910 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),1000,100,3.929638,0.021552 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),2500,100,4.407816,0.030216 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),5000,100,6.377364,0.045547 +NVIDIA_GeForce_RTX_3070,AMD_Ryzen_9_5950X_16-Core_Processor,CuPy (GPU),10000,100,14.188796,0.045603 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_cpu_gpu_naive_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_cpu_gpu_naive_ts100.csv new file mode 100644 index 00000000..2c17c2a3 --- /dev/null +++ 
b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_cpu_gpu_naive_ts100.csv @@ -0,0 +1,16 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),50,100,1.983749,0.007513 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),50,100,2.381084,0.068829 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,Naive (CPU),50,100,2.410273,0.049775 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),100,100,1.945175,0.039153 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),100,100,2.336525,0.002403 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,Naive (CPU),100,100,3.904390,0.023506 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),250,100,1.971557,0.060440 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),250,100,2.469172,0.128338 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,Naive (CPU),250,100,14.725007,0.102246 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),500,100,2.184285,0.066509 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),500,100,2.443785,0.112158 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,Naive (CPU),500,100,62.142849,0.301355 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),1000,100,2.938326,0.294434 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),1000,100,2.359441,0.030637 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,Naive (CPU),1000,100,264.626117,13.762658 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_cpu_gpu_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_cpu_gpu_ts100.csv new file mode 100644 index 00000000..89f12eae --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_cpu_gpu_ts100.csv @@ -0,0 +1,17 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),50,100,1.869059,0.043287 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),50,100,2.374159,0.029446 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),100,100,1.911878,0.055914 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),100,100,2.321233,0.027948 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),250,100,1.951020,0.020478 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),250,100,2.380752,0.072261 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),500,100,2.199362,0.094118 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),500,100,2.393087,0.049348 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),1000,100,2.864257,0.115768 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),1000,100,2.364880,0.022514 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),2500,100,16.395034,0.058331 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),2500,100,2.688642,0.313695 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),5000,100,57.519849,0.447362 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),5000,100,2.872334,0.298268 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,NumPy (CPU),10000,100,222.391967,1.363038 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),10000,100,3.946568,0.353731 diff --git 
a/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_gpu_ts100.csv b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_gpu_ts100.csv new file mode 100644 index 00000000..32b52618 --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/game_of_life_data/gol_timings_NVIDIA_H100_NVL_AMD_EPYC_9V84_96-Core_Processor_gpu_ts100.csv @@ -0,0 +1,15 @@ +gpu,cpu,method,grid_size,timesteps,mean_time_sec,std_dev_sec +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),50,100,3.238925,1.033222 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),100,100,2.502330,0.067399 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),250,100,2.453384,0.130417 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),500,100,2.436340,0.049481 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),1000,100,2.437894,0.122810 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),2500,100,2.503792,0.056716 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),5000,100,2.692318,0.040983 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),10000,100,3.809769,0.050915 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),15000,100,5.623580,0.018359 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),20000,100,8.400177,0.074555 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),25000,100,11.584349,0.057047 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),30000,100,15.807539,0.149792 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),40000,100,26.089969,0.200815 +NVIDIA_H100_NVL,AMD_EPYC_9V84_96-Core_Processor,CuPy (GPU),50000,100,40.500881,0.414871 diff --git a/individual_modules/intro_to_GPUs/files/game_of_life_figures/game_of_life_example.gif b/individual_modules/intro_to_GPUs/files/game_of_life_figures/game_of_life_example.gif new file mode 100644 index 00000000..c43598f4 Binary files /dev/null and b/individual_modules/intro_to_GPUs/files/game_of_life_figures/game_of_life_example.gif differ diff --git a/individual_modules/intro_to_GPUs/files/profiling/example_data_file.nsys-rep b/individual_modules/intro_to_GPUs/files/profiling/example_data_file.nsys-rep new file mode 100644 index 00000000..f53ab380 Binary files /dev/null and b/individual_modules/intro_to_GPUs/files/profiling/example_data_file.nsys-rep differ diff --git a/individual_modules/intro_to_GPUs/files/profiling/example_nsys_stats_output.txt b/individual_modules/intro_to_GPUs/files/profiling/example_nsys_stats_output.txt new file mode 100644 index 00000000..a0a444b6 --- /dev/null +++ b/individual_modules/intro_to_GPUs/files/profiling/example_nsys_stats_output.txt @@ -0,0 +1,118 @@ +Generating SQLite file _static/profiling/example_data_file.sqlite from _static/profiling/example_data_file.nsys-rep +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/nvtx_sum.py]... 
+ + ** NVTX Range Summary (nvtx_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Style Range + -------- --------------- --------- ----------- ----------- --------- ---------- ----------- ------- -------------------- + 36.6 8535674999 12 711306249.9 337956117.5 32066518 2154888540 879481300.4 PushPop :simulate_life_naive + 36.0 8398662776 1200 6998885.6 3274825.5 208959 21511028 8419478.3 PushPop :life_step_naive + 20.0 4671705906 12 389308825.5 386720513.0 377014681 414862062 11356633.9 PushPop :simulate_life_cupy + 5.5 1284742952 1200 1070619.1 934097.0 894127 12411105 1001792.4 PushPop :life_step_gpu + 1.2 276921198 12 23076766.5 22136591.0 20319042 27672011 2828715.6 PushPop :simulate_life_numpy + 0.6 144652945 1200 120544.1 115209.5 93330 320779 28252.5 PushPop :life_step_numpy + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/osrt_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain OS Runtime trace data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/cuda_api_sum.py]... + + ** CUDA API Summary (cuda_api_sum): + + Time (%) Total Time (ns) Num Calls Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- ---------- --------- -------- --------- ----------- ---------------------------- + 86.4 1560619502 60 26010325.0 210.0 110 137306387 52472498.8 cudaFree + 6.9 125243240 38436 3258.5 2920.0 2280 71249 962.6 cuLaunchKernel + 2.6 46825640 24 1951068.3 1933764.5 1223737 2732432 721292.2 cudaLaunchKernel + 2.5 45049441 7200 6256.9 6049.5 4820 23620 1171.9 cudaMemcpyAsync + 0.9 15769146 180 87606.4 83960.0 78700 133209 10580.0 cuModuleLoadData + 0.3 4899267 96 51034.0 45400.0 35760 89929 13186.4 cuModuleUnload + 0.2 3247650 24 135318.8 134505.0 103140 201630 24291.5 cuLibraryUnload + 0.1 2010781 12 167565.1 167964.0 163939 168879 1326.3 cudaDeviceSynchronize + 0.1 1722245 102 16884.8 5135.0 2440 109670 32859.3 cudaMalloc + 0.1 1020638 4944 206.4 190.0 60 1150 115.7 cuGetProcAddress_v2 + 0.0 498689 12 41557.4 41500.0 34970 45760 3623.0 cudaMemGetInfo + 0.0 85680 12 7140.0 7145.0 6840 7430 199.5 cudaStreamIsCapturing_v10000 + 0.0 17550 24 731.3 680.0 100 1630 599.0 cuModuleGetLoadingMode + 0.0 17110 12 1425.8 1415.0 1270 1710 140.0 cuInit + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/cuda_gpu_kern_sum.py]... 
+ + ** CUDA GPU Kernel Summary (cuda_gpu_kern_sum): + + Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name + -------- --------------- --------- -------- -------- -------- -------- ----------- ---------------------------------------------------------------------------------------------------- + 56.4 33398172 21384 1561.8 1536.0 1056 2048 251.7 cupy_copy__int64_int64 + 19.9 11784791 8316 1417.1 1440.0 1088 1728 158.9 cupy_add__int64_int64_int64 + 8.3 4895203 3564 1373.5 1408.0 1088 1600 135.8 cupy_equal__int64_int_bool + 3.4 2006447 12 167203.9 167233.0 166466 167681 412.6 void generate_seed_pseudo>(unsigned long long, u… + 2.8 1655408 1200 1379.5 1440.0 1088 1632 154.1 cupy_bitwise_and__bool_bool_bool + 2.8 1654728 1200 1378.9 1424.5 1056 1600 150.5 cupy_where__bool_int_int_int64 + 2.8 1639308 1200 1366.1 1408.0 1056 1600 149.4 cupy_bitwise_or__bool_bool_bool + 2.7 1627787 1200 1356.5 1392.0 1056 1664 144.8 cupy_copy__bool_bool + 0.6 334881 216 1550.4 1472.0 1056 2048 230.6 cupy_copy__int32_int32 + 0.2 120513 84 1434.7 1456.0 1152 1728 153.2 cupy_add__int32_int32_int32 + 0.1 48225 36 1339.6 1376.0 1056 1568 172.6 cupy_equal__int32_int_bool + 0.0 18912 12 1576.0 1616.0 1344 1856 187.6 void gen_sequenced… + 0.0 18016 12 1501.3 1520.0 1280 1632 123.9 cupy_random_x_mod_1 + 0.0 17600 12 1466.7 1504.0 1344 1600 102.0 cupy_less__float64_float_bool + 0.0 16512 12 1376.0 1536.0 1056 1568 211.4 cupy_copy__bool_int32 + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/cuda_gpu_mem_time_sum.py]... + + ** CUDA GPU MemOps Summary (by Time) (cuda_gpu_mem_time_sum): + + Time (%) Total Time (ns) Count Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Operation + -------- --------------- ----- -------- -------- -------- -------- ----------- ------------------------------ + 100.0 9124387 7200 1267.3 1312.0 960 1472 119.7 [CUDA memcpy Device-to-Device] + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/cuda_gpu_mem_size_sum.py]... + + ** CUDA GPU MemOps Summary (by Size) (cuda_gpu_mem_size_sum): + + Total (MB) Count Avg (MB) Med (MB) Min (MB) Max (MB) StdDev (MB) Operation + ---------- ----- -------- -------- -------- -------- ----------- ------------------------------ + 186.837 7200 0.026 0.007 0.000 0.079 0.031 [CUDA memcpy Device-to-Device] + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/openmp_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain OpenMP event data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/opengl_khr_range_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain KHR Extension (KHR_DEBUG) data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/opengl_khr_gpu_range_sum.py]... 
+SKIPPED: _static/profiling/example_data_file.sqlite does not contain GPU KHR Extension (KHR_DEBUG) data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/vulkan_marker_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain Vulkan Debug Extension (Vulkan Debug Util) data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/vulkan_gpu_marker_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain GPU Vulkan Debug Extension (GPU Vulkan Debug markers) data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/dx11_pix_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain DX11 CPU debug markers. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/dx12_gpu_marker_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain DX12 GPU debug markers. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/dx12_pix_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain DX12 CPU debug markers. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/wddm_queue_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain WDDM context data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/um_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/um_total_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/um_cpu_page_faults_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain CUDA Unified Memory CPU page faults data. + +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/openacc_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain OpenACC event data. 
+ +Processing [_static/profiling/example_data_file.sqlite] with [/shared/home/liam.berrisford/spack/opt/spack/linux-zen2/cuda-12.8.0-omla3gzzcerjbpx3pu6h7vywmqsj3rn4/nsight-systems-2024.6.2/host-linux-x64/reports/syscall_sum.py]... +SKIPPED: _static/profiling/example_data_file.sqlite does not contain syscall data. + diff --git a/individual_modules/intro_to_GPUs/files/profiling/snakeviz_output.png b/individual_modules/intro_to_GPUs/files/profiling/snakeviz_output.png new file mode 100644 index 00000000..d8579367 Binary files /dev/null and b/individual_modules/intro_to_GPUs/files/profiling/snakeviz_output.png differ diff --git a/individual_modules/intro_to_GPUs/files/temperature_diffusion/temperature_slice_static.png b/individual_modules/intro_to_GPUs/files/temperature_diffusion/temperature_slice_static.png new file mode 100644 index 00000000..2554fdf7 Binary files /dev/null and b/individual_modules/intro_to_GPUs/files/temperature_diffusion/temperature_slice_static.png differ diff --git a/individual_modules/intro_to_GPUs/profiling.ipynb b/individual_modules/intro_to_GPUs/profiling.ipynb new file mode 100644 index 00000000..44246459 --- /dev/null +++ b/individual_modules/intro_to_GPUs/profiling.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aaa5f08a-6f66-4906-b47d-2d5f3dd855e6", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Profiling and Optimisation of CPU and GPU Code\n", + "\n", + "## Learning Objectives\n", + "\n", + "By the end of this section, learners will be able to:\n", + "\n", + "- Interpret GPU profiling outputs, including kernel execution times, CUDA API calls, and memory transfer operations. \n", + "- Compare the performance of naive Python, NumPy, and CuPy implementations across different problem sizes. \n", + "- Identify performance bottlenecks such as excessive Python loops, implicit synchronisations (e.g. `cudaFree`), and frequent small memory transfers. \n", + "- Distinguish between compute-bound and memory-bound workloads by analyzing profiling data. \n", + "- Explain the impact of kernel launch overhead, device-to-device memory copies, and synchronisation points on GPU performance. \n", + "- Recognize when GPU acceleration provides benefits over CPU execution and determine crossover points where GPU use becomes advantageous. \n", + "- Propose optimisation strategies for both CPU (e.g., vectorisation, efficient libraries, multiprocessing) and GPU (e.g., minimising data transfers, kernel fusion, asynchronous overlap, coalesced memory access). \n", + "- Apply profiling insights to guide real-world optimisation decisions in scientific or machine learning workflows. \n", + "\n", + "\n", + "## Resource Files \n", + "\n", + "The job submission scripts specifically configured for use on the University of Exeter ISCA HPC system are available [here](../intro_to_GPUs/zip_files/exeter_isca_slurm_submission_scripts.zip). \n", + "\n", + "General-purpose job submission scripts, which can serve as a starting point for use on other HPC systems (with minor modifications required for this course), are available [here](../intro_to_GPUs/zip_files/slurm_submission_scripts.zip). \n", + "\n", + "The Python scripts used in this course can be downloaded [here](../intro_to_GPUs/zip_files/scripts.zip). \n", + "\n", + "All supplementary files required for the course are available [here](../intro_to_GPUs/zip_files/files.zip). 
\n",
+ "\n",
+ "The presentation slides for this course can be accessed [here](../intro_to_GPUs/slides/GPU_Training_Day_Slides.pptx).\n",
+ "\n",
+ "## Overview \n",
+ "\n",
+ "Writing working code is only the first challenge; the second, and equally important, especially in GPU computing, is **optimising** that code for performance. Before optimising, you need to know *where* the time is being spent, which is where **profiling** comes in. Profiling means measuring the performance characteristics of your program, typically which parts of the code consume the most time or resources. \n",
+ "\n",
+ "## Profiling Python Code with cProfile (CPU) \n",
+ "Python has a built-in profiler called **cProfile**. It can help you find which functions are taking up the most time in your program. This is a key step before you move to GPU acceleration; sometimes you might find bottlenecks in places you didn't expect, or identify the parts of the code that would benefit the most from being moved to the GPU.\n",
+ "\n",
+ "### How to use cProfile \n",
+ "You can make use of cProfile via the command line: `python -m cProfile -o profile_results.pstats myscript.py`, which will run `myscript.py` under the profiler and output stats to a file. In the following examples we will instead call cProfile directly within our scripts, and use the `pstats` library to create immediate summaries.\n",
+ "\n",
+ "```python\n",
+ "import cProfile\n",
+ "import pstats\n",
+ "import numpy as np\n",
+ "\n",
+ "# ─────────────────────────────────────────────────────────────────────────────\n",
+ "# 1) Naïve Game of Life implementation\n",
+ "# ─────────────────────────────────────────────────────────────────────────────\n",
+ "\n",
+ "def life_step_naive(grid: np.ndarray) -> np.ndarray:\n",
+ "    N, M = grid.shape\n",
+ "    new = np.zeros((N, M), dtype=int)\n",
+ "    for i in range(N):\n",
+ "        for j in range(M):\n",
+ "            cnt = 0\n",
+ "            for di in (-1, 0, 1):\n",
+ "                for dj in (-1, 0, 1):\n",
+ "                    if di == 0 and dj == 0:\n",
+ "                        continue\n",
+ "                    ni, nj = (i + di) % N, (j + dj) % M\n",
+ "                    cnt += grid[ni, nj]\n",
+ "            if grid[i, j] == 1:\n",
+ "                new[i, j] = 1 if (cnt == 2 or cnt == 3) else 0\n",
+ "            else:\n",
+ "                new[i, j] = 1 if (cnt == 3) else 0\n",
+ "    return new\n",
+ "\n",
+ "def simulate_life_naive(N: int, timesteps: int, p_alive: float = 0.2):\n",
+ "    grid = np.random.choice([0, 1], size=(N, N), p=[1-p_alive, p_alive])\n",
+ "    history = []\n",
+ "    for _ in range(timesteps):\n",
+ "        history.append(grid.copy())\n",
+ "        grid = life_step_naive(grid)\n",
+ "    return history\n",
+ "\n",
+ "# ─────────────────────────────────────────────────────────────────────────────\n",
+ "# 2) Profiling using cProfile\n",
+ "# ─────────────────────────────────────────────────────────────────────────────\n",
+ "\n",
+ "N = 200\n",
+ "STEPS = 100\n",
+ "P_ALIVE = 0.2\n",
+ "\n",
+ "profiler = cProfile.Profile()\n",
+ "profiler.enable()   # ── start profiling ────────────────\n",
+ "\n",
+ "# Run the full naïve simulation\n",
+ "simulate_life_naive(N=N, timesteps=STEPS, p_alive=P_ALIVE)\n",
+ "\n",
+ "profiler.disable()  # ── stop profiling ─────────────────\n",
+ "profiler.dump_stats(\"naive.pstat\")  # ── save output ────────────────────\n",
+ "\n",
+ "stats = pstats.Stats(profiler).sort_stats('cumtime')\n",
+ "stats.print_stats(10)  # print top 10 functions by cumulative time\n",
+ "```\n",
+ "\n",
+ "**Interpreting cProfile output**: When you print stats, you'll see a table with columns including: \n",
+ "- **ncalls**: number of calls to the function \n",
+ "- **tottime**: total time spent in the function (excluding sub-function calls) \n",
+ "- **cumtime**: cumulative time spent in the function, including sub-function calls\n",
+ "- The function name\n",
+ "\n",
+ "```bash \n",
+ "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
+ "        1    0.034    0.034    4.312    4.312 4263274180.py:27(simulate_life_naive)\n",
+ "      100    4.147    0.041    4.150    0.041 4263274180.py:9(life_step_naive)\n",
+ "... (other functions)\n",
+ "```\n",
+ "In the table above, `ncalls` (100) tells you `life_step_naive` was invoked 100 times; `tottime` (4.147 s) is the time spent inside `life_step_naive` itself, excluding any functions it calls; and `cumtime` (4.150 s) is the total cumulative time in `life_step_naive` plus any sub-calls it makes. In this example, `life_step_naive` spent about 4.147 s in its own Python loops and an extra ~0.003 s in minor sub-calls (array indexing, % operations, etc.), for a total of 4.150 s. The per-call columns are simply `tottime/ncalls` and `cumtime/ncalls`, and the single call to `simulate_life_naive` shows that its cumulative 4.312 s includes all 100 naive steps plus the list-append overhead.\n",
+ "\n",
+ "### Visualising the Output with SnakeViz \n",
+ "\n",
+ "SnakeViz is a stand-alone tool, available from PyPI, that we can use to visualise the output of cProfile. We can install it with\n",
+ "\n",
+ "``` bash\n",
+ "poetry add snakeviz\n",
+ "```\n",
+ "\n",
+ "We can then use it to visualise a cProfile output, such as the one generated by the snippet above:\n",
+ "\n",
+ "``` bash\n",
+ "poetry run snakeviz naive.pstat\n",
+ "```\n",
+ "\n",
+ "This launches an interactive web app that we can use to explore the profiling timings.\n",
+ "\n",
+ "![Screenshot of SnakeViz](files/profiling/snakeviz_output.png)\n",
+ "\n",
+ "### Finding Bottlenecks \n",
+ "\n",
+ "To pinpoint where your code spends most of its time, look at the cumulative time (`cumtime`) column in the profiler report. This shows the total time in a function plus all of its sub-calls. A high total time (`tottime`) means that the function’s own Python code is heavy, whereas a large gap between `cumtime` and `tottime` reveals significant work in the functions it invokes (array indexing, modulo ops, etc.).\n",
+ "\n",
+ "In our naive Game of Life example:\n",
+ "- `life_step_naive` is called 100 times, with `tottime ≈ 4.147 s` and `cumtime ≈ 4.150 s`.\n",
+ "  - Almost all the work is in its own nested loops and per-cell logic.\n",
+ "  - Only a few milliseconds are spent in its sub-calls (grid indexing, % arithmetic).\n",
+ "- `simulate_life_naive` appears once with `cumtime ≈ 4.312 s`, which covers the single Python loop plus all 100 calls to `life_step_naive`.\n",
+ "\n",
+ "Once you’ve identified the culprit:\n",
+ "- If you have high `tottime` in a Python function, you may want to consider vectorising inner loops (e.g. switching to NumPy’s `np.roll` + `np.where`) or using a compiled extension.\n",
+ "- If you have heavy external calls under your `cumtime`, then you may want to explore hardware acceleration (e.g. GPU via `CuPy`) or more efficient algorithms.\n",
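+ "\n",
+ "Since we saved the profile to disk with `dump_stats`, we can also reload it in a later session and query it again without re-running the simulation. A minimal sketch, assuming the `naive.pstat` file produced above:\n",
+ "\n",
+ "```python\n",
+ "import pstats\n",
+ "\n",
+ "# Reload the saved profile and show the five most expensive functions by own time\n",
+ "stats = pstats.Stats('naive.pstat')\n",
+ "stats.strip_dirs().sort_stats('tottime').print_stats(5)\n",
+ "```\n",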
\n", + "- **Minimal Python overhead**\n", + " - We pay one Python-level call per generation (100 calls total) versus *hundreds of thousands* of Python-loop iterations in the naive version. \n", + " - That drops the Python-layer `tottime` from ~4s (naive) to ~0.03s (NumPy)\n", + "- **Cache and vector-friendly memory access**\n", + " - NumPy works on large contiguous buffers, so the CPU prefetches data and applies vector instructions.\n", + " - The naïve per-cell modulo arithmetic and scattered indexing defeat those hardware optimisations.\n", + "\n", + "Overall, by moving the neighbour counting and rule application into a few large NumPy calls, we cut down Python‐level time from over 4 seconds to under 0.1 seconds for 100 generations on a 200×200 grid." + ] + }, + { + "cell_type": "markdown", + "id": "87dcdf8a-5482-41d2-8488-de76175657b5", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Profiling GPU Code with NVIDIA Nsight Systems \n", + "When we involve GPUs, cProfile alone isn't enough. cProfile will tell us about the Python side, but we also need to know what's happening on the GPU. Does the GPU spend most of its time computing, or is it idle while waiting for data? Are there a few kernel launches that take a long time or many tiny kernel launches?\n", + "\n", + "**NVIDIA Nsight Systems** is a profiler for GPU applications that provides a timeline of CPU and GPU activity. It can show: \n", + "- When your code launched GPU kernels and how long they ran \n", + "- GPU memory transfers between host and device \n", + "- CPU-side functions as well (to correlate CPU and GPU)\n", + "\n", + "### Using Nsight Systems \n", + "Nsight Systems can be used via a GUI or command line. On clusters, you might use the CLI, assuming it's installed. \n", + "\n", + "You will need to run your script under Nsight: \n", + "```bash \n", + "nsys profile -o profile_report python my_gpu_script.py\n", + "```\n", + "\n", + "This will run `my_gpu_script.py` and record profiling data into a data file with the extension `.nsys-rep`, creating in the above case the file `profile_report.nsys-rep`. The file can then be analysed with the following command: \n", + "\n", + "\n", + "```bash\n", + "nsys stats profile_report.nsys-rep\n", + "```\n", + "\n", + "An example `.nsys-rep` file has been included within the GitHub Repo for you to try the command with, at the filepath `files/profiling/example_data_file.nsys-rep`. We will discuss the contents of the file in the section \"Example Output\" after discussing the necessary code changes to generate the file. \n", + "\n", + "### Code Changes \n", + "To get the fine-tuned profiling, we also need to make some changes to the code. 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb4c6cca-87bd-4b9e-8bb1-6f3ac24cdfbe",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "````{div} full-width\n",
+    "## Example Output Grid Sizes 10, 25, 50, 100 Across Naive, NumPy, CuPy \n",
+    "When you run the command `nsys stats` on a `.nsys-rep` file it will generate a text report of the profiling that was conducted. An example of the output produced is located at `files/profiling/example_nsys_stats_output.txt`, but you can run it for yourself with the command: \n",
+    "\n",
+    "```bash\n",
+    "nsys stats files/profiling/example_data_file.nsys-rep\n",
+    "```\n",
+    "\n",
+    "The following subsections detail the different components of the report generated. \n",
+    "\n",
+    "### NVTX Range Summary \n",
+    "\n",
+    "The NVTX ranges bracket your Python/CuPy functions. \n",
\n", + "\n", + "| Time (%) | Total Time (ns) | Instances | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Style | Range |\n", + "| -------- | --------------- | --------- | ------------- | ------------- | ----------- | ------------- | ------------- | ------- | ---------------------- |\n", + "| 36.6 | 8 535 674 999 | 12 | 711 306 249.9 | 337 956 117.5 | 32 066 518 | 2 154 888 540 | 879 481 300.4 | PushPop | `:simulate_life_naive` |\n", + "| 36.0 | 8 398 662 776 | 1200 | 6 998 885.6 | 3 274 825.5 | 208 959 | 21 511 028 | 8 419 478.3 | PushPop | `:life_step_naive` |\n", + "| 20.0 | 4 671 705 906 | 12 | 389 308 825.5 | 386 720 513.0 | 377 014 681 | 414 862 062 | 11 356 633.9 | PushPop | `:simulate_life_cupy` |\n", + "| 5.5 | 1 284 742 952 | 1200 | 1 070 619.1 | 934 097.0 | 894 127 | 12 411 105 | 1 001 792.4 | PushPop | `:life_step_gpu` |\n", + "| 1.2 | 276 921 198 | 12 | 23 076 766.5 | 22 136 591.0 | 20 319 042 | 27 672 011 | 2 828 715.6 | PushPop | `:simulate_life_numpy` |\n", + "| 0.6 | 144 652 945 | 1200 | 120 544.1 | 115 209.5 | 93 330 | 320 779 | 28 252.5 | PushPop | `:life_step_numpy` |\n", + "\n", + "Over 72% of time sits in the **naive** Python loops (`simulate_life_naive` and `life_step_naive`), while the GPU vectorised step (`life_step_gpu`) only accounts for ~5.5%. Interesting in this context NumPy is faster than the CuPy code. The grid sizes used were `[10, 25, 50, 100]`.\n", + "\n", + "### CUDA API Summary \n", + "\n", + "| Time (%) | Total Time (ns) | Num Calls | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Name |\n", + "| -------- | --------------- | --------- | ------------ | ----------- | --------- | ----------- | ------------ | ------------------------------ |\n", + "| 86.4 | 1 560 619 502 | 60 | 26 010 325.0 | 210.0 | 110 | 137 306 387 | 52 472 498.8 | `cudaFree` |\n", + "| 6.9 | 125 243 240 | 38 436 | 3 258.5 | 2 920.0 | 2 280 | 71 249 | 962.6 | `cuLaunchKernel` |\n", + "| 2.6 | 46 825 640 | 24 | 1 951 068.3 | 1 933 764.5 | 1 223 737 | 2 732 432 | 721 292.2 | `cudaLaunchKernel` |\n", + "| 2.5 | 45 049 441 | 7 200 | 6 256.9 | 6 049.5 | 4 820 | 23 620 | 1 171.9 | `cudaMemcpyAsync` |\n", + "| 0.9 | 15 769 146 | 180 | 87 606.4 | 83 960.0 | 78 700 | 133 209 | 10 580.0 | `cuModuleLoadData` |\n", + "| 0.3 | 4 899 267 | 96 | 51 034.0 | 45 400.0 | 35 760 | 89 929 | 13 186.4 | `cuModuleUnload` |\n", + "| 0.2 | 3 247 650 | 24 | 135 318.8 | 134 505.0 | 103 140 | 201 630 | 24 291.5 | `cuLibraryUnload` |\n", + "| 0.1 | 2 010 781 | 12 | 167 565.1 | 167 964.0 | 163 939 | 168 879 | 1 326.3 | `cudaDeviceSynchronize` |\n", + "| 0.1 | 1 722 245 | 102 | 16 884.8 | 5 135.0 | 2 440 | 109 670 | 32 859.3 | `cudaMalloc` |\n", + "| 0.1 | 1 020 638 | 4 944 | 206.4 | 190.0 | 60 | 1 150 | 115.7 | `cuGetProcAddress_v2` |\n", + "| 0.0 | 498 689 | 12 | 41 557.4 | 41 500.0 | 34 970 | 45 760 | 3 623.0 | `cudaMemGetInfo` |\n", + "| 0.0 | 85 680 | 12 | 7 140.0 | 7 145.0 | 6 840 | 7 430 | 199.5 | `cudaStreamIsCapturing_v10000` |\n", + "| 0.0 | 17 550 | 24 | 731.3 | 680.0 | 100 | 1 630 | 599.0 | `cuModuleGetLoadingMode` |\n", + "| 0.0 | 17 110 | 12 | 1 425.8 | 1 415.0 | 1 270 | 1 710 | 140.0 | `cuInit` |\n", + "\n", + "Going into the individual calls being performed within the CUDA API is outside of the scope of this course. However, this table does give a better idea of what is happening with the GPU if you require going into that detail for your optimisations. For example, `cudaFree` is a runtime call that releases device memory allocation. 
+    "\n",
+    "### GPU Kernel Execution \n",
+    "\n",
+    "| Time (%) | Total Time (ns) | Instances | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Name |\n",
+    "| -------- | --------------- | --------- | --------- | --------- | -------- | -------- | ----------- | ------------------------------------------------------------------------------------------------------- |\n",
+    "| 56.4 | 33 398 172 | 21 384 | 1 561.8 | 1 536.0 | 1 056 | 2 048 | 251.7 | `cupy_copy__int64_int64` |\n",
+    "| 19.9 | 11 784 791 | 8 316 | 1 417.1 | 1 440.0 | 1 088 | 1 728 | 158.9 | `cupy_add__int64_int64_int64` |\n",
+    "| 8.3 | 4 895 203 | 3 564 | 1 373.5 | 1 408.0 | 1 088 | 1 600 | 135.8 | `cupy_equal__int64_int_bool` |\n",
+    "| 3.4 | 2 006 447 | 12 | 167 203.9 | 167 233.0 | 166 466 | 167 681 | 412.6 | `void generate_seed_pseudo>(unsigned long long, u…)` |\n",
+    "| 2.8 | 1 655 408 | 1 200 | 1 379.5 | 1 440.0 | 1 088 | 1 632 | 154.1 | `cupy_bitwise_and__bool_bool_bool` |\n",
+    "| 2.8 | 1 654 728 | 1 200 | 1 378.9 | 1 424.5 | 1 056 | 1 600 | 150.5 | `cupy_where__bool_int_int_int64` |\n",
+    "| 2.8 | 1 639 308 | 1 200 | 1 366.1 | 1 408.0 | 1 056 | 1 600 | 149.4 | `cupy_bitwise_or__bool_bool_bool` |\n",
+    "| 2.7 | 1 627 787 | 1 200 | 1 356.5 | 1 392.0 | 1 056 | 1 664 | 144.8 | `cupy_copy__bool_bool` |\n",
+    "| 0.6 | 334 881 | 216 | 1 550.4 | 1 472.0 | 1 056 | 2 048 | 230.6 | `cupy_copy__int32_int32` |\n",
+    "| 0.2 | 120 513 | 84 | 1 434.7 | 1 456.0 | 1 152 | 1 728 | 153.2 | `cupy_add__int32_int32_int32` |\n",
+    "| 0.1 | 48 225 | 36 | 1 339.6 | 1 376.0 | 1 056 | 1 568 | 172.6 | `cupy_equal__int32_int_bool` |\n",
+    "| 0.0 | 18 912 | 12 | 1 576.0 | 1 616.0 | 1 344 | 1 856 | 187.6 | `void gen_sequenced…` |\n",
+    "| 0.0 | 18 016 | 12 | 1 501.3 | 1 520.0 | 1 280 | 1 632 | 123.9 | `cupy_random_x_mod_1` |\n",
+    "| 0.0 | 17 600 | 12 | 1 466.7 | 1 504.0 | 1 344 | 1 600 | 102.0 | `cupy_less__float64_float_bool` |\n",
+    "| 0.0 | 16 512 | 12 | 1 376.0 | 1 536.0 | 1 056 | 1 568 | 211.4 | `cupy_copy__bool_int32` |\n",
+    "\n",
+    "This breakdown shows that over half of all GPU kernel time is spent in the `cupy_copy__int64_int64` kernel (bulk data movement), followed by the `cupy_add__int64_int64_int64` and `cupy_equal__int64_int_bool` compute kernels, each taking roughly 1–1.6 µs per instance. All other kernels, including bitwise ops, conditional selects, and random-seed generation, run in similar microsecond ranges but contribute far less overall, indicating a workload dominated by simple element-wise copy and arithmetic operations. This highlights that the majority of the GPU time is **not** being spent on computation. \n",
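+    "\n",
+    "One way to attack that launch-and-copy overhead is to fuse the element-wise rule application into a single kernel. Below is a minimal sketch using CuPy's `ElementwiseKernel`; the kernel name and integer types are illustrative, and the `cp.roll`-based neighbour sum would still run as separate kernels:\n",
+    "\n",
+    "```python\n",
+    "import cupy as cp\n",
+    "\n",
+    "# One launch replaces the chain of ==, &, | and where element-wise kernels.\n",
+    "apply_rules = cp.ElementwiseKernel(\n",
+    "    'int64 n, int64 g',   # inputs: neighbour count, current cell state\n",
+    "    'int64 out',          # output: next cell state\n",
+    "    'out = (n == 3 || (g == 1 && n == 2)) ? 1 : 0;',\n",
+    "    'life_apply_rules'\n",
+    ")\n",
+    "\n",
+    "# Usage: new_grid = apply_rules(neighbours, grid)\n",
+    "```\n",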
\n", + "\n", + "### GPU Memory Operations \n", + "#### By Time\n", + "| Time (%) | Total Time (ns) | Count | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Operation |\n", + "| -------- | --------------- | ----- | -------- | -------- | -------- | -------- | ----------- | -------------------------------- |\n", + "| 100.0 | 9 124 387 | 7 200 | 1 267.3 | 1 312.0 | 960 | 1 472 | 119.7 | `[CUDA memcpy Device-to-Device]` |\n", + "\n", + "#### By Size\n", + "\n", + "| Total (MB) | Count | Avg (MB) | Med (MB) | Min (MB) | Max (MB) | StdDev (MB) | Operation |\n", + "| ---------- | ----- | -------- | -------- | -------- | -------- | ----------- | -------------------------------- |\n", + "| 186.837 | 7 200 | 0.026 | 0.007 | 0.000 | 0.079 | 0.031 | `[CUDA memcpy Device-to-Device]` |\n", + "\n", + "\n", + "### Key Takeaways\n", + "\n", + "The takeaways that we could take from this include the following:\n", + "- **Python loops severely degrade performance**: Over 72% of run time is in the naive implementations, so vectorisation (NumPy/CuPy) is critical. \n", + "- **Implicit syncs dominate**: `cudaFree` stalls the pipe, and so avoiding per-iteration free calls by reusing buffers is key. \n", + "- **Kernel work is tiny**: Each kernel takes ~1-2µs; orchestration (kernel launches + memops) is the real bottleneck.\n", + "- **Memcopy patterns matter**: 7200 small transfers add up, so we need to use larger batches of copies to reduce the overhead.\n", + "\n", + "## Example Output Grid Sizes 50, 100, 250, 500, 1000 Across Naive, NumPy, CuPy \n", + "\n", + "Provided below are the same tables as above but for the Game of Life ran with grid sizes `[50, 100, 250, 500, 1000]`.\n", + "\n", + "### NVTX Range Summary \n", + "\n", + "| Time (%) | Total Time (ns) | Instances | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Style | Range |\n", + "| -------- | --------------- | --------- | ---------------- | ---------------- | ----------- | --------------- | ----------------- | ------- | ---------------------- |\n", + "| 49.5 | 996 594 758 961 | 15 | 66 439 650 597.4 | 13 121 447 905.0 | 536 057 614 | 261 739 193 353 | 100 890 404 862.4 | PushPop | `:simulate_life_naive` |\n", + "| 49.5 | 996 314 582 960 | 1 500 | 664 209 722.0 | 131 032 064.5 | 5 185 001 | 2 635 408 984 | 974 949 530.9 | PushPop | `:life_step_naive` |\n", + "| 0.6 | 11 719 493 946 | 15 | 781 299 596.4 | 399 051 461.0 | 373 843 763 | 6 218 365 721 | 1 504 209 809.4 | PushPop | `:simulate_life_cupy` |\n", + "| 0.2 | 3 874 165 387 | 15 | 258 277 692.5 | 91 469 695.0 | 22 700 648 | 851 927 462 | 323 227 971.4 | PushPop | `:simulate_life_numpy` |\n", + "| 0.2 | 3 513 311 048 | 1 500 | 2 342 207.4 | 759 883.0 | 112 230 | 10 461 891 | 3 085 684.1 | PushPop | `:life_step_numpy` |\n", + "| 0.1 | 1 633 590 838 | 1 500 | 1 089 060.6 | 940 589.0 | 894 823 | 14 415 246 | 1 105 761.6 | PushPop | `:life_step_gpu` |\n", + "\n", + "\n", + "### CUDA API Summary \n", + "\n", + "\n", + "| Time (%) | Total Time (ns) | Num Calls | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Name |\n", + "| -------- | --------------- | --------- | ------------ | ----------- | --------- | ----------- | ------------- | ------------------------------ |\n", + "| 69.2 | 2 541 019 787 | 75 | 33 880 263.8 | 230.0 | 120 | 733 443 841 | 96 242 335.2 | `cudaFree` |\n", + "| 22.5 | 824 537 275 | 30 | 27 484 575.8 | 2 278 088.5 | 1 190 585 | 495 570 646 | 101 802 539.0 | `cudaLaunchKernel` |\n", + "| 4.3 | 158 809 973 | 48 045 | 3 305.4 | 3 000.0 | 2 300 | 48 351 | 
925.4 | `cuLaunchKernel` |\n", + "| 1.7 | 63 278 947 | 9 000 | 7 031.0 | 7 120.0 | 4 890 | 27 190 | 1 639.3 | `cudaMemcpyAsync` |\n", + "| 0.9 | 31 528 151 | 225 | 140 125.1 | 85 220.0 | 79 071 | 11 819 926 | 782 186.4 | `cuModuleLoadData` |\n", + "| 0.6 | 21 676 197 | 120 | 180 635.0 | 46 030.0 | 34 890 | 15 677 051 | 1 426 563.3 | `cuModuleUnload` |\n", + "| 0.4 | 14 435 308 | 15 | 962 353.9 | 5 191.0 | 4 730 | 14 363 066 | 3 707 195.4 | `cudaStreamIsCapturing_v10000` |\n", + "| 0.2 | 5 899 095 | 132 | 44 690.1 | 6 315.0 | 2 860 | 150 401 | 49 075.1 | `cudaMalloc` |\n", + "| 0.1 | 4 075 165 | 30 | 135 838.8 | 135 280.5 | 103 310 | 190 941 | 21 163.6 | `cuLibraryUnload` |\n", + "| 0.1 | 2 546 761 | 15 | 169 784.1 | 168 900.0 | 166 201 | 172 431 | 2 026.7 | `cudaDeviceSynchronize` |\n", + "| 0.0 | 1 251 918 | 6 180 | 202.6 | 180.0 | 60 | 2 020 | 114.9 | `cuGetProcAddress_v2` |\n", + "| 0.0 | 599 392 | 15 | 39 959.5 | 41 610.0 | 31 850 | 53 191 | 6 136.1 | `cudaMemGetInfo` |\n", + "| 0.0 | 23 950 | 15 | 1 596.7 | 1 460.0 | 1 270 | 3 670 | 592.4 | `cuInit` |\n", + "| 0.0 | 21 130 | 30 | 704.3 | 665.0 | 100 | 1 670 | 582.4 | `cuModuleGetLoadingMode` |\n", + "\n", + "### GPU Kernel Execution \n", + "\n", + "| Time (%) | Total Time (ns) | Instances | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Name |\n", + "| -------- | --------------- | --------- | --------- | --------- | -------- | -------- | ----------- | ------------------------------------------------------------------------------------------------------- |\n", + "| 46.6 | 57 042 704 | 26 730 | 2 134.0 | 1 824.0 | 1 056 | 7 648 | 1 464.2 | `cupy_copy__int64_int64` |\n", + "| 25.4 | 31 075 482 | 10 395 | 2 989.5 | 1 856.0 | 1 440 | 7 328 | 2 067.8 | `cupy_add__int64_int64_int64` |\n", + "| 10.6 | 12 926 455 | 4 455 | 2 901.6 | 1 760.0 | 1 408 | 7 168 | 2 065.3 | `cupy_equal__int64_int_bool` |\n", + "| 3.6 | 4 414 582 | 1 500 | 2 943.1 | 1 792.0 | 1 440 | 7 232 | 2 084.5 | `cupy_bitwise_and__bool_bool_bool` |\n", + "| 3.6 | 4 412 844 | 1 500 | 2 941.9 | 1 792.0 | 1 408 | 7 232 | 2 102.9 | `cupy_bitwise_or__bool_bool_bool` |\n", + "| 3.6 | 4 398 695 | 1 500 | 2 932.5 | 1 792.0 | 1 440 | 7 296 | 2 076.4 | `cupy_where__bool_int_int_int64` |\n", + "| 3.5 | 4 300 049 | 1 500 | 2 866.7 | 1 760.0 | 1 408 | 7 040 | 2 039.9 | `cupy_copy__bool_bool` |\n", + "| 2.1 | 2 535 815 | 15 | 169 054.3 | 167 969.0 | 167 297 | 171 872 | 1 744.7 | `void generate_seed_pseudo>(unsigned long long, u…)` |\n", + "| 0.5 | 570 464 | 270 | 2 112.8 | 1 664.0 | 1 024 | 7 392 | 1 473.7 | `cupy_copy__int32_int32` |\n", + "| 0.3 | 313 762 | 105 | 2 988.2 | 1 856.0 | 1 440 | 7 136 | 2 085.9 | `cupy_add__int32_int32_int32` |\n", + "| 0.1 | 130 881 | 45 | 2 908.5 | 1 792.0 | 1 408 | 7 008 | 2 089.1 | `cupy_equal__int32_int_bool` |\n", + "| 0.1 | 76 928 | 15 | 5 128.5 | 2 464.0 | 1 664 | 14 720 | 5 025.6 | `void gen_sequenced…` |\n", + "| 0.0 | 44 896 | 15 | 2 993.1 | 1 856.0 | 1 504 | 6 976 | 2 121.2 | `cupy_copy__bool_int32` |\n", + "| 0.0 | 44 288 | 15 | 2 952.5 | 1 824.0 | 1 504 | 6 912 | 2 107.8 | `cupy_less__float64_float_bool` |\n", + "| 0.0 | 44 064 | 15 | 2 937.6 | 1 824.0 | 1 568 | 6 816 | 2 040.6 | `cupy_random_x_mod_1` |\n", + "\n", + "\n", + "### GPU Memory Operations \n", + "#### By Time\n", + "\n", + "| Time (%) | Total Time (ns) | Count | Avg (ns) | Med (ns) | Min (ns) | Max (ns) | StdDev (ns) | Operation |\n", + "| -------- | --------------- | ----- | -------- | -------- | -------- | -------- | ----------- | -------------------------------- |\n", + 
"| 100.0 | 29 435 086 | 9 000 | 3 270.6 | 1 600.0 | 1 248 | 17 888 | 2 406.9 | `[CUDA memcpy Device-to-Device]` |\n", + "\n", + "\n", + "#### By Size\n", + "\n", + "| Total (MB) | Count | Avg (MB) | Med (MB) | Min (MB) | Max (MB) | StdDev (MB) | Operation |\n", + "| ---------- | ----- | -------- | -------- | -------- | -------- | ----------- | -------------------------------- |\n", + "| 18 957.377 | 9 000 | 2.106 | 0.498 | 0.010 | 7.992 | 3.014 | `[CUDA memcpy Device-to-Device]` |\n", + "````" + ] + }, + { + "cell_type": "markdown", + "id": "0b343ecc-238e-486d-a494-437f5142d1c9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Exercise: Undertsanding Change in Profiling Data \n", + "\n", + "Now that you have seen the detailed profiling breakdowns for grid sizes **[10, 25, 50, 100]** and **[50, 100, 250, 500, 1000]**, take some time to consider and answer the following:\n", + "\n", + "- **Scaling of Python vs Vectorised Code** \n", + " - How does the **percentage of total run-time** spent in the naive Python loops (`simulate_life_naive` + `life_step_naive`) change as the grid size grows? \n", + " - At what grid size does the **NumPy** implementation begin to out-perform the **naive** Python version? And at what point does **CuPy** start to consistently beat NumPy?\n", + "- **NumPy vs CuPy Overhead** \n", + " - For the smaller grids (10–100), NumPy was faster than CuPy, why? \n", + " - Identify which **CUDA API calls** (e.g. `cudaFree`, `cudaMalloc`, `cudaMemcpyAsync`) dominate the overhead in the CuPy runs. How does this overhead fraction evolve for larger problem sizes?\n", + "- **Kernel vs Memory-Transfer Balance** \n", + " - Examine the GPU **Kernel Execution** tables: what fraction of the total GPU time is spent in compute kernels (e.g. `cupy_add`, `cupy_equal`) versus simple copy kernels (e.g. `cupy_copy`)? \n", + " - How does the ratio of **Device-to-Device memcpy** time to compute time change when moving from small to large grids?\n", + "- **Impact of Implicit Synchronisations** \n", + " - The `cudaFree` call is synchronous and stalls the CPU; how many times is it invoked per iteration, and how much total time does it cost? \n", + " - Propose a strategy to **pre-allocate** and **reuse** GPU buffers across iterations—how many `cudaFree` calls would you eliminate, and roughly how much time would this save?\n", + "- **Optimisation Opportunities** \n", + " - Based on the profiling data across both grid-size ranges, what is the **single biggest bottleneck** you would tackle first? \n", + " - What are some optimisations that could be used?. \n", + "- **Real-World Implications** \n", + " - If this Game of Life kernel were part of a larger simulation pipeline, what lessons can you draw about when and how to offload work to the GPU? \n", + " - At what problem size does GPU acceleration become worthwhile, and how would you detect that programmatically?\n", + "\n", + "```{admonition} Potential Answers!\n", + ":class: dropdown\n", + "### Scaling of Python vs. Vectorized Code\n", + "- **Naive Python loops dominate more** as the grid grows: they consume ~72% of total NVTX‐measured time for N between [10–100], and nearly 99% for N between [50–1000], showing that pure‐Python O(N²) code scales very poorly.\n", + "- **NumPy** is faster than the naive version even at the smallest grid (N=10) thanks to C‐level vectorisation. 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c2187d99-e201-495c-a234-49ac50ca9317",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "## General Optimisation Strategies\n",
+    "Bringing everything together, some strategies include:\n",
+    "\n",
+    "**On the CPU side (Python)**: \n",
+    "- **Vectorise Operations**: We saw this with NumPy; doing things in batches is faster than Python loops. \n",
+    "- **Use efficient libraries**: If a certain computation is slow in Python, see if there is a library (NumPy, SciPy, etc.) that does it in C or another language. \n",
+    "- **Optimise algorithms**: Sometimes, a better algorithm can speed things up more than any amount of low-level tuning. For example, if you find a certain computation is O(N^2) in complexity and it's slow, see if you can make it O(N log N) or similar.\n",
+    "- **Consider multiprocessing or parallelisation**: Use multiple CPU cores (with `multiprocessing`, `joblib` or similar) if appropriate; a sketch follows this list.\n",
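+    "\n",
+    "A minimal sketch of the multiprocessing route for an embarrassingly parallel workload, assuming the `simulate_life_numpy` driver from earlier (the process count and parameters are illustrative, and this pattern works best as a standalone script rather than inside a notebook):\n",
+    "\n",
+    "```python\n",
+    "from multiprocessing import Pool\n",
+    "\n",
+    "def run_one(seed):\n",
+    "    import numpy as np\n",
+    "    np.random.seed(seed)  # give each worker its own random initial grid\n",
+    "    return simulate_life_numpy(N=200, timesteps=100)\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    with Pool(processes=4) as pool:\n",
+    "        histories = pool.map(run_one, range(8))  # 8 independent simulations\n",
+    "```\n",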
+    "\n",
+    "**On the GPU side**:\n",
+    "- **Minimise data transfers**: Once data is on the GPU, try to do as much as possible there. Transferring large arrays back and forth every iteration will kill performance. Consider accumulating results and transferring once at the end, or using pinned memory for faster transfers if you must.\n",
+    "- **Kernel fusion / reducing launch overhead**: Each call (like our multiple `cp.roll` operations) launches a separate kernel. If possible, combining operations into one kernel means the GPU can do it all in one pass. Some libraries or tools do this automatically (for example, CuPy can fuse elementwise operations under the hood, and deep learning frameworks fuse a lot of ops). If not, one can write a custom CUDA kernel to do more work in one go.\n",
+    "- **Asynchronous overlap**: GPUs operate asynchronously relative to the CPU. You can have the CPU queue up work and then do something else (like preparing the next batch of data) while the GPU is processing. Nsight can show whether your CPU and GPU are overlapping or whether one is waiting for the other. Ideally, you overlap communication (PCIe transfers) with computation where possible.\n",
+    "- **Memory access patterns**: This is more advanced, but if you dive into writing custom kernels, coalesced memory access (adjacent threads accessing consecutive memory addresses) is important for performance. Uncoalesced or random access can slow things down even when the arithmetic itself is cheap.\n",
+    "- **Use specialised libraries**: For certain tasks, libraries like cuDNN (deep neural nets), cuBLAS (linear algebra), etc., are heavily optimised. 
Always prefer a library call (e.g., `cp.fft` or `cp.linalg`) over writing your own, if it fits the need, because those are likely tuned for performance.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/individual_modules/intro_to_GPUs/questions/summary_slurm.json b/individual_modules/intro_to_GPUs/questions/summary_slurm.json new file mode 100644 index 00000000..5d8e946d --- /dev/null +++ b/individual_modules/intro_to_GPUs/questions/summary_slurm.json @@ -0,0 +1,132 @@ +[ + { + "question": "Which command is used to submit a batch job script to Slurm?", + "type": "many_choice", + "answers": [ + { + "answer": "sinfo", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "sbatch", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "squeue", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "scancel", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "In a job script, which #SBATCH directive requests a single GPU?", + "type": "many_choice", + "answers": [ + { + "answer": "--cpu-per-task=1", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "--mem=1G", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "--partition=gpu", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "--gres=gpu:1", + "correct": true, + "feedback": "Correct" + } + ] + }, + { + "question": "After submitting a job with sbatch, which Slurm command do you use to check its status in the queue?", + "type": "many_choice", + "answers": [ + { + "answer": "scancel", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "sacct", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "squeue", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "sinfo", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "If you do not specify --output in your script, Slurm writes standard output to a file named by default:", + "type": "many_choice", + "answers": [ + { + "answer": "job_.out", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "slurm-.out", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "output_.txt", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "stdout.log", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Which #SBATCH option sets the maximum wall-clock time your job is allowed to run?", + "type": "many_choice", + "answers": [ + { + "answer": "--time", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "--mem", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "--cpus-per-task", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "--job-name", + "correct": false, + "feedback": "Incorrect" + } + ] + } +] \ No newline at end of file diff --git a/individual_modules/intro_to_GPUs/questions/summary_spack.json b/individual_modules/intro_to_GPUs/questions/summary_spack.json new file mode 100644 index 00000000..c605ead1 --- /dev/null +++ b/individual_modules/intro_to_GPUs/questions/summary_spack.json @@ -0,0 +1,314 @@ +[ + { + "question": 
"In the above spack spec output, what build system is used for the top-level cuda@12.8.0 package?", + "type": "many_choice", + "answers": [ + { + "answer": "Autotools", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "CMake", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Generic", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "Makefile", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "In the above spack spec output, which feature is enabled in the Python build according to the spec?", + "type": "many_choice", + "answers": [ + { + "answer": "tkinter", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "debug symbols", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "sqlite3", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "optimizations", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "In the above spack spec output, what architecture and OS is targeted by all packages in this spec?", + "type": "many_choice", + "answers": [ + { + "answer": "linux-ubuntu20.04-x86_64", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "linux-ubuntu22.04-zen2", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "linux-centos7-x86_64", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "linux-ubuntu22.04-amd64", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "In the above spack spec output, what version of bzip2 is specified as a dependency of Python?", + "type": "many_choice", + "answers": [ + { + "answer": "1.0.6", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "1.0.8", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "1.0.10", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "1.0.4", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "What does \"spack env activate -p gpu_training\" do?", + "type": "many_choice", + "answers": [ + { + "answer": "Installs all packages in the env", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Deactivates the environment", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Activates the environment making its packages visible in your shell, and modifies the prompt", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "Lists the environment's spec", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Which command shows the planned dependency graph before installation?", + "type": "many_choice", + "answers": [ + { + "answer": "spack install", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "spack spec", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "spack find", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "spack info", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "How do you add a specific package version to your environment?", + "type": "many_choice", + "answers": [ + { + "answer": "spack install py-numpy@1.21.2", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "spack add py-numpy@1.21.2", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "spack spec py-numpy@1.21.2", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "spack list py-numpy@1.21.2", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "What symbol 
indicates a package is already installed in the spec output?", + "type": "many_choice", + "answers": [ + { + "answer": "-", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "^", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "+", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "*", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Which file should you edit to load Spack automatically in Bash?", + "type": "many_choice", + "answers": [ + { + "answer": "~/.profile", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "~/.bashrc", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "~/.spack/config.yaml", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "~/.zshrc", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "What does the spec string \"py-numpy@1.21.2%gcc@11.4.1 build_system=python_pip arch=linux-ubuntu20.04-zen2\" include?", + "type": "many_choice", + "answers": [ + { + "answer": "Package name and version only", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Package name, version, compiler and version, build system and architecture", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "dependencies list", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Environment name", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "How do you remove a package from the environment spec before installation?", + "type": "many_choice", + "answers": [ + { + "answer": "spack uninstall py-numpy", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "spack purge py-numpy", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "spack remove py-numpy", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "spack delete py-numpy", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Why is Spack particularly useful in HPC/GPU contexts?", + "type": "many_choice", + "answers": [ + { + "answer": "It’s the only package manager that supports Python", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "It only installs GPU libraries", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "It manages complex dependency trees and multiple versions/variants of compilers and libraries, without needing root access", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "It replaces the system package manager", + "correct": false, + "feedback": "Incorrect" + } + ] + } +] \ No newline at end of file diff --git a/individual_modules/intro_to_GPUs/questions/summary_theory.json b/individual_modules/intro_to_GPUs/questions/summary_theory.json new file mode 100644 index 00000000..cc965e9b --- /dev/null +++ b/individual_modules/intro_to_GPUs/questions/summary_theory.json @@ -0,0 +1,226 @@ +[ + { + "question": "What does \"GPU\" stand for?", + "type": "many_choice", + "answers": [ + { + "answer": "Graphical Processing Unit", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Graphics Processing Unit", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "General Processing Unit", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Graphics Programmer Unit", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Which characteristic best describes a GPU compared to a CPU?", + "type": "many_choice", + "answers": [ + { + "answer": 
"Fewer, more powerful cores", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "One core with very high clock speed", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Hundreds or thousands of simpler cores optimized for parallel throughput", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "No caching, only direct memory access", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "What is VRAM on a GPU?", + "type": "many_choice", + "answers": [ + { + "answer": "The CPU's L1 cache", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Disk-based virtual memory", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "The GPUs dedicated device memory", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "Shared system RAM accessible by CPU and GPU equally", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Which compiler is specifically used to compile NVIDIA CUDA C++ code?", + "type": "many_choice", + "answers": [ + { + "answer": "GCC", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Clang", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "NVCC", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "Intel ICC", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "What is meant by “data transfer overhead” in GPU computing?", + "type": "many_choice", + "answers": [ + { + "answer": "The time to launch a kernel", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "The time spent inside GPU registers", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "The time to move data between host (CPU) memory and device (GPU) memory", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "The time to compile the code", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "What does “parallel overhead” refer to?", + "type": "many_choice", + "answers": [ + { + "answer": "Time to write parallel code", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Cost of launching many small GPU kernels or threads", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "CPU cache misses during parallel loops", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Increased FLOP rate on GPU", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "In the craftsman vs. 
factory analogy, the GPU is like a…", + "type": "many_choice", + "answers": [ + { + "answer": "Skilled craftsman", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Factory with many workers", + "correct": true, + "feedback": "Correct" + } + ] + }, + { + "question": "When should you choose a GPU over a CPU?", + "type": "many_choice", + "answers": [ + { + "answer": "Tasks with heavy conditional logic", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Low-latency single-thread tasks", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Highly data-parallel tasks applying the same operation to large arrays", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "File I/O bound workloads", + "correct": false, + "feedback": "Incorrect" + } + ] + }, + { + "question": "Why is an understanding of compilers important in GPU programming?", + "type": "many_choice", + "answers": [ + { + "answer": "Compilers are not used with GPUs", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "To manually write PTX by hand", + "correct": false, + "feedback": "Incorrect" + }, + { + "answer": "Because they translate high-level code into optimized GPU machine code, affecting performance and compatibility", + "correct": true, + "feedback": "Correct" + }, + { + "answer": "They handle disk I/O optimisations", + "correct": false, + "feedback": "Incorrect" + } + ] + } +] \ No newline at end of file diff --git a/individual_modules/intro_to_GPUs/scripts/game_of_life.py b/individual_modules/intro_to_GPUs/scripts/game_of_life.py new file mode 100644 index 00000000..32096f97 --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/game_of_life.py @@ -0,0 +1,305 @@ +""" +Conway’s Game of Life Simulation + +This module implements core update rules and simulation drivers for Conway’s +Game of Life on 2D grids, with three backends: +- NumPy vectorized updates +- CuPy GPU-accelerated updates +- Naive Python nested loops + +It also provides an animation exporter (GIF via matplotlib) and CLI entry points: +- run_life_numpy() +- run_life_cupy() +- run_life_naive() +""" + +# ------------------------------------------------------------------- +# Library imports +# ------------------------------------------------------------------- +import argparse +from pathlib import Path +import matplotlib.pyplot as plt +import matplotlib.animation as animation +import numpy as np +import cupy as cp + + +# ───────────────────────────────────────────────────────────────────────────── +# 1) Core update functions (no plotting/animation) +# ───────────────────────────────────────────────────────────────────────────── + +def life_step_numpy(grid: np.ndarray) -> np.ndarray: + """ + Compute the next generation of the Game of Life using NumPy. + + Applies the standard 8‐neighbor Conway rules on a toroidal grid: + - A dead cell with exactly 3 neighbors becomes alive. + - A live cell with 2 or 3 neighbors stays alive. + - Otherwise, the cell dies or remains dead. + + Args: + grid (np.ndarray): 2D array of 0s and 1s representing the current state. + + Returns: + np.ndarray: 2D array of the same shape for the next generation. 
+ """ + neighbours = ( + np.roll(np.roll(grid, 1, axis=0), 1, axis=1) + + np.roll(np.roll(grid, 1, axis=0), -1, axis=1) + + np.roll(np.roll(grid, -1, axis=0), 1, axis=1) + + np.roll(np.roll(grid, -1, axis=0), -1, axis=1) + + np.roll(grid, 1, axis=0) + + np.roll(grid, -1, axis=0) + + np.roll(grid, 1, axis=1) + + np.roll(grid, -1, axis=1) + ) + return np.where((neighbours == 3) | ((grid == 1) & (neighbours == 2)), 1, 0) + + +def life_step_gpu(grid: cp.ndarray) -> cp.ndarray: + """ + Compute the next generation of the Game of Life using CuPy on GPU. + + Identical rules to life_step_numpy, but leverages GPU arrays. + + Args: + grid (cp.ndarray): 2D CuPy array of 0s and 1s. + + Returns: + cp.ndarray: Next-generation 2D CuPy array. + """ + neighbours = ( + cp.roll(cp.roll(grid, 1, axis=0), 1, axis=1) + + cp.roll(cp.roll(grid, 1, axis=0), -1, axis=1) + + cp.roll(cp.roll(grid, -1, axis=0), 1, axis=1) + + cp.roll(cp.roll(grid, -1, axis=0), -1, axis=1) + + cp.roll(grid, 1, axis=0) + + cp.roll(grid, -1, axis=0) + + cp.roll(grid, 1, axis=1) + + cp.roll(grid, -1, axis=1) + ) + return cp.where((neighbours == 3) | ((grid == 1) & (neighbours == 2)), 1, 0) + + +def life_step_naive(grid: np.ndarray) -> np.ndarray: + """ + Compute the next generation with a naive Python loop implementation. + + Iterates over each cell and its 8 neighbors, applying wrap-around. + + Args: + grid (np.ndarray): 2D array of 0s and 1s. + + Returns: + np.ndarray: Next-generation 2D array. + """ + N, M = grid.shape + new = np.zeros((N, M), dtype=int) + for i in range(N): + for j in range(M): + cnt = 0 + for di in (-1, 0, 1): + for dj in (-1, 0, 1): + if di == 0 and dj == 0: + continue + ni, nj = (i + di) % N, (j + dj) % M + cnt += grid[ni, nj] + if grid[i, j] == 1: + new[i, j] = 1 if (cnt == 2 or cnt == 3) else 0 + else: + new[i, j] = 1 if (cnt == 3) else 0 + return new + + +# ───────────────────────────────────────────────────────────────────────────── +# 2) Simulation functions (no animation) +# ───────────────────────────────────────────────────────────────────────────── + +def simulate_life_numpy(N: int, timesteps: int, p_alive: float = 0.2, record_history: bool = False): + """ + Run a Game of Life simulation using the NumPy backend. + + Initializes a random NxN grid with alive probability p_alive, + then iterates the specified number of timesteps. + + Args: + N (int): Grid dimension (N × N). + timesteps (int): Number of generations to simulate. + p_alive (float): Probability that a cell starts alive. + record_history (bool): If True, collect each generation in a list. + + Returns: + list[np.ndarray] or None: History of grids if record_history else None. + """ + grid = np.random.choice([0, 1], size=(N, N), p=[1 - p_alive, p_alive]) + history = [] if record_history else None + for _ in range(timesteps): + if record_history: + history.append(grid.copy()) + grid = life_step_numpy(grid) + return history + + +def simulate_life_cupy(N: int, timesteps: int, p_alive: float = 0.2, record_history: bool = False): + """ + Run a Game of Life simulation on GPU using CuPy. + + Args: + N (int): Grid dimension (N × N). + timesteps (int): Number of generations. + p_alive (float): Initial alive probability. + record_history (bool): If True, collect grids (converted to NumPy). + + Returns: + list[np.ndarray] or None: History of grids as NumPy arrays if recorded. 
+ """ + grid_gpu = (cp.random.random((N, N)) < p_alive).astype(cp.int32) + history = [] if record_history else None + for _ in range(timesteps): + if record_history: + history.append(cp.asnumpy(grid_gpu)) + grid_gpu = life_step_gpu(grid_gpu) + return history + + +def simulate_life_naive(N: int, timesteps: int, p_alive: float = 0.2, record_history: bool = False): + """ + Run a Game of Life simulation with the naive Python implementation. + + Args: + N (int): Grid size. + timesteps (int): Number of generations. + p_alive (float): Starting alive probability. + record_history (bool): Whether to collect each generation. + + Returns: + list[np.ndarray] or None: Recorded history if requested. + """ + grid = np.random.choice([0, 1], size=(N, N), p=[1 - p_alive, p_alive]) + history = [] if record_history else None + for _ in range(timesteps): + if record_history: + history.append(grid.copy()) + grid = life_step_naive(grid) + return history + + +# ───────────────────────────────────────────────────────────────────────────── +# 3) Animation/export +# ───────────────────────────────────────────────────────────────────────────── + +def animate_life(history, output_file: Path, interval: int = 200, dpi: int = 80): + """ + Create and save a GIF animation of the Game of Life history. + + Uses matplotlib’s FuncAnimation and the pillow writer. + + Args: + history (list[np.ndarray]): List of 2D grids to animate. + output_file (Path): Path for the output GIF file. + interval (int): Delay between frames in ms. + dpi (int): Resolution of the saved animation. + """ + + + fig, ax = plt.subplots(figsize=(6, 6)) + im = ax.imshow(history[0], cmap='binary') + ax.set_axis_off() + + def _update(idx): + im.set_data(history[idx]) + return [im] + + anim = animation.FuncAnimation( + fig, _update, + frames=len(history), + interval=interval, + blit=True + ) + anim.save(str(output_file), writer='pillow', dpi=dpi) + plt.close(fig) + + +# ───────────────────────────────────────────────────────────────────────────── +# 4) CLI entry-points (only --size, --timesteps, --save-gif) +# ───────────────────────────────────────────────────────────────────────────── + +def run_life_numpy(): + """ + Command‐line entry for NumPy-based Game of Life. + + Parses --size, --timesteps, and --save-gif; runs simulation and optionally saves GIF. + """ + p = argparse.ArgumentParser("Game of Life (NumPy)") + p.add_argument("--size", type=int, default=100, help="Grid dimension (N×N)") + p.add_argument("--timesteps", type=int, default=50, help="Number of generations") + p.add_argument("--save-gif", action="store_true", help="Save GIF animation") + args = p.parse_args() + + print(f"[NumPy] Args received: {args}") + record = args.save_gif and args.size <= 100 + history = simulate_life_numpy(args.size, args.timesteps, record_history=record) + + if args.save_gif: + if record: + output = Path("game_of_life_cpu.gif") + animate_life(history, output) + print(f"Saved CPU GIF to {output}") + else: + print("[NumPy] Problem size > 100: cannot save history or create GIF.") + else: + print("[NumPy] GIF creation skipped; history not saved.") + + +def run_life_cupy(): + """ + Command‐line entry for CuPy-based Game of Life. + + Same arguments as run_life_numpy, but runs on GPU. 
+ """ + p = argparse.ArgumentParser("Game of Life (CuPy)") + p.add_argument("--size", type=int, default=100, help="Grid dimension (N×N)") + p.add_argument("--timesteps", type=int, default=50, help="Number of generations") + p.add_argument("--save-gif", action="store_true", help="Save GIF animation") + args = p.parse_args() + + print(f"[CuPy] Args received: {args}") + record = args.save_gif and args.size <= 100 + history = simulate_life_cupy(args.size, args.timesteps, record_history=record) + + if args.save_gif: + if record: + output = Path("game_of_life_gpu.gif") + animate_life(history, output) + print(f"Saved GPU GIF to {output}") + else: + print("[CuPy] Problem size > 100: cannot save history or create GIF.") + else: + print("[CuPy] GIF creation skipped; history not saved.") + + +def run_life_naive(): + """ + Command‐line entry for the naive Python Game of Life. + + Same CLI interface, uses the nested-loop implementation. + """ + p = argparse.ArgumentParser("Game of Life (Naive)") + p.add_argument("--size", type=int, default=100, help="Grid dimension (N×N)") + p.add_argument("--timesteps", type=int, default=50, help="Number of generations") + p.add_argument("--save-gif", action="store_true", help="Save GIF animation") + args = p.parse_args() + + print(f"[Naive] Args received: {args}") + record = args.save_gif and args.size <= 100 + history = simulate_life_naive(args.size, args.timesteps, record_history=record) + + if args.save_gif: + if record: + output = Path("game_of_life_naive.gif") + animate_life(history, output) + print(f"Saved Naive GIF to {output}") + else: + print("[Naive] Problem size > 100: cannot save history or create GIF.") + else: + print("[Naive] GIF creation skipped; history not saved.") diff --git a/individual_modules/intro_to_GPUs/scripts/game_of_life_create_plots.py b/individual_modules/intro_to_GPUs/scripts/game_of_life_create_plots.py new file mode 100644 index 00000000..a11be936 --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/game_of_life_create_plots.py @@ -0,0 +1,189 @@ +""" +Aggregate and Visualize Game of Life Benchmark Results + +This script loads all benchmarking CSV files matching '../output/gol_timings_*.csv', +processes hardware and method metadata, and generates publication-quality plots: + 1. Performance of each method across different hardware. + 2. Performance of each hardware combination across methods. + 3. Combined overview of all methods and hardware. + +Key steps: +- Load and concatenate CSV data into a pandas DataFrame. +- Map raw GPU/CPU strings to concise, human-readable labels. +- Define marker, color, and linestyle mappings for clarity. +- Plot with matplotlib, saving PNGs to '../output'. 
+""" + +# ───────────────────────────────────────────────────────────────────────────── +# Library imports +# ───────────────────────────────────────────────────────────────────────────── + +import os, glob, re +import pandas as pd +import matplotlib.pyplot as plt + +# ───────────────────────────────────────────────────────────────────────────── +# 1) Load all benchmark CSV files into one DataFrame +# ───────────────────────────────────────────────────────────────────────────── +files = glob.glob('../output/gol_timings_*.csv') +df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) + +# ───────────────────────────────────────────────────────────────────────────── +# 2) Define plotting markers for each method +# ───────────────────────────────────────────────────────────────────────────── +marker_map = { + 'naive': 's', + 'numpy': 'o', + 'cupy': '^', +} + +# ───────────────────────────────────────────────────────────────────────────── +# 3) Shorten and clean hardware labels for legibility +# ───────────────────────────────────────────────────────────────────────────── +def short_gpu(name): + """ + Map raw GPU names to concise labels for plot legends. + + Args: + name: Original GPU string from CSV. + + Returns: + A shorter, human-readable GPU label. + """ + if 'A100' in name: return 'NV A100' + if '3070' in name: return 'NV RTX 3070' + if 'H100' in name: return 'NV H100' + return name + +def short_cpu(name): + """ + Simplify raw CPU strings using regex to extract brand and core count. + + Args: + name: Original CPU string from CSV. + + Returns: + A shorter, human-readable CPU label. + """ + m = re.search(r'(AMD).*?_(\d+)-Core', name, flags=re.IGNORECASE) + if m: + brand, cores = m.group(1).upper(), m.group(2) + return f"{brand} {cores} Core" + return name.replace('_', ' ') + +def base_method(method_str): + """ + Strip any parenthetical notes from method names (e.g. "NumPy (CPU)" → "NumPy"). + + Args: + method_str: Full method label from CSV. + + Returns: + Clean method name without parentheses. + """ + return re.sub(r'\s*\(.*\)', '', method_str).strip() + +# ───────────────────────────────────────────────────────────────────────────── +# 4) Build style map for each unique (GPU, CPU) combination +# ───────────────────────────────────────────────────────────────────────────── +combos = df[['gpu','cpu']].drop_duplicates().values.tolist() + +# Color-blind safe palette (Paul Tol six): +colors = ['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377'] +# Linestyles for up to 6 combos: +linestyles = ['-', '--', '-.', ':', (0, (1,1)), (0, (5,1))] + +style_map = {} +for idx, (gpu, cpu) in enumerate(combos): + style_map[(gpu, cpu)] = { + 'color': colors[idx % len(colors)], + 'linestyle': linestyles[idx % len(linestyles)] + } + +legend_kwargs = dict(fontsize='small') + +# ───────────────────────────────────────────────────────────────────────────── +# 5) Plotting helper function +# ───────────────────────────────────────────────────────────────────────────── +def make_plot(grouped, title, fname, legend_args): + """ + Generate and save a line plot with error bars for grouped benchmark data. + + Args: + grouped: pandas GroupBy object grouping DataFrame by (gpu, cpu, method). + title: Title string for the plot. + fname: Filename (PNG) to save under '../output/'. + legend_args: Extra arguments for ax.legend(). 
+ """ + fig, ax = plt.subplots(figsize=(8,6)) + + for (gpu, cpu, method), g in grouped: + bm = base_method(method) + bm_low = bm.lower() + marker = marker_map[bm_low] + style = style_map[(gpu, cpu)] + label = f"{bm}\nCPU: {short_cpu(cpu)}\nGPU: {short_gpu(gpu)}" + + ax.plot( + g['grid_size'], + g['mean_time_sec'], + marker=marker, + color=style['color'], + linestyle=style['linestyle'], + label=label + ) + + ax.set( + xlabel='Grid Size', + ylabel='Mean Time (s)', + title=title + ) + ax.legend(**legend_kwargs, **legend_args) + fig.tight_layout(rect=[0,0,1,1]) + fig.savefig(f"../output/{fname}", bbox_inches='tight', dpi=300) + plt.close(fig) + +# ───────────────────────────────────────────────────────────────────────────── +# 6) Generate plots: method-specific across hardware +# ───────────────────────────────────────────────────────────────────────────── +for method in df['method'].unique(): + sub = df[df['method']==method].sort_values('grid_size') + grp = sub.groupby(['gpu','cpu','method']) + make_plot( + grp, + title=f"{base_method(method)} Across Hardware", + fname=f"{base_method(method).lower()}_across_hardware.png", + legend_args={'loc':'best', 'ncol':1} + ) + +# ───────────────────────────────────────────────────────────────────────────── +# 7) Hardware-specific across methods +# ───────────────────────────────────────────────────────────────────────────── +for (gpu,cpu), sub in df.groupby(['gpu','cpu']): + grp = sub.sort_values('grid_size').groupby(['gpu','cpu','method']) + cpu_s = cpu.lower().replace(' ','_').replace('/','_') + gpu_s = short_gpu(gpu).replace(' ','_').lower() + make_plot( + grp, + title=f"{short_cpu(cpu)} + {short_gpu(gpu)}", + fname=f"perf_{cpu_s}_{gpu_s}.png", + legend_args={'loc':'best', 'ncol':1} + ) + +# ───────────────────────────────────────────────────────────────────────────── +# 8) Combined plot of all methods & hardware +# ───────────────────────────────────────────────────────────────────────────── +grp_all = df.sort_values(['gpu','cpu','method','grid_size'])\ + .groupby(['gpu','cpu','method']) +make_plot( + grp_all, + title="All Methods & Hardware", + fname="all_methods_hardware.png", + legend_args={ + 'loc':'center left', + 'bbox_to_anchor':(1,0.5), + 'ncol':1 + } +) + +print("All plots saved to ../output") diff --git a/individual_modules/intro_to_GPUs/scripts/game_of_life_experiment.py b/individual_modules/intro_to_GPUs/scripts/game_of_life_experiment.py new file mode 100644 index 00000000..0871418e --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/game_of_life_experiment.py @@ -0,0 +1,184 @@ +""" +Benchmarking Game of Life Implementations Across Grid Sizes + +This script measures execution time for three versions of Conway’s Game +of Life: + - NumPy (CPU vectorized) + - CuPy (GPU-accelerated) + - Naive Python (nested loops) + +For each combination of grid size and number of timesteps, it: + 1. Runs each implementation multiple times. + 2. Records mean and standard deviation of runtimes to a CSV. + 3. Generates an error‐bar plot of time vs. grid size. 
+""" + +# ───────────────────────────────────────────────────────────────────────────── +# Library imports +# ───────────────────────────────────────────────────────────────────────────── +import subprocess +import time +import numpy as np +import os +import csv +import matplotlib.pyplot as plt + +# ───────────────────────────────────────────────────────────────────────────── +# Constants +# ───────────────────────────────────────────────────────────────────────────── + +# Ensure output directory exists +out_dir = "../output" +os.makedirs(out_dir, exist_ok=True) + +def get_gpu_name(): + """ + Query the system GPU name via nvidia-smi. + + Returns: + The first GPU’s name with spaces replaced by underscores, or + 'Unknown_GPU' if the command fails. + """ + try: + out = subprocess.check_output( + ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], + stderr=subprocess.DEVNULL + ).decode().strip().splitlines() + return out[0] + except Exception: + return "Unknown_GPU" + +def get_cpu_name(): + """ + Query the CPU model name via lscpu (Linux). + + Returns: + The CPU model string with spaces replaced by underscores, or + 'Unknown_CPU' if detection fails. + """ + try: + out = subprocess.check_output(["lscpu"], stderr=subprocess.DEVNULL).decode().splitlines() + for line in out: + if line.startswith("Model name:"): + return line.split(":", 1)[1].strip() + except Exception: + pass + return "Unknown_CPU" + +def plot_timings(csv_filename): + """ + Read benchmark CSV and generate an error‐bar plot: execution time vs. grid size. + + The CSV is expected to have columns: + gpu, cpu, method, grid_size, timesteps, mean_time_sec, std_dev_sec + + Args: + csv_filename: Path to the CSV file containing benchmark results. + """ + data = {} + timesteps = None + with open(csv_filename, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + method = row["method"] + size = int(row["grid_size"]) + mean_t = float(row["mean_time_sec"]) + std_t = float(row["std_dev_sec"]) + timesteps = row["timesteps"] + data.setdefault(method, {"sizes": [], "means": [], "stds": []}) + data[method]["sizes"].append(size) + data[method]["means"].append(mean_t) + data[method]["stds"].append(std_t) + + plt.figure(figsize=(10, 6)) + for method, vals in data.items(): + plt.errorbar( + vals["sizes"], + vals["means"], + yerr=vals["stds"], + label=method, + marker="o", + capsize=5 + ) + plt.title(f"Game of Life - Time vs Grid Size (Timesteps = {timesteps})") + plt.xlabel("Grid Size (N x N)") + plt.ylabel("Execution Time (seconds)") + plt.legend() + plt.grid(True) + plt.tight_layout() + + # Save PNG next to CSV + base = os.path.splitext(os.path.basename(csv_filename))[0] + out_png = os.path.join(os.path.dirname(csv_filename), f"{base}.png") + plt.savefig(out_png, dpi=300) + plt.close() + print(f"Saved plot: {out_png}") + + +# ───────────────────────────────────────────────────────────────────────────── +# Main benchmarking loop +# ───────────────────────────────────────────────────────────────────────────── + +gpu_name = get_gpu_name().replace(" ", "_") +cpu_name = get_cpu_name().replace(" ", "_") + +methods = { + "NumPy (CPU)": "game_of_life_cpu", + "CuPy (GPU)": "game_of_life_gpu", + "Naive (CPU)": "game_of_life_naive", +} + +# Build an underscore-joined string of all entry-point names, sorted for consistency +clean_names = [v.replace("game_of_life_", "") for v in methods.values()] +method_ids = "_".join(sorted(clean_names)) + +#grid_sizes = [50, 100, 250, 500, 1000] +grid_sizes = [10, 25] +timesteps_list = [100] # you can 
expand this list +repeats = 3 + +for timesteps in timesteps_list: + # Filename now includes GPU, CPU, all methods, and timesteps + csv_filename = os.path.join( + out_dir, + f"gol_timings_{gpu_name}_{cpu_name}_{method_ids}_ts{timesteps}.csv" + ) + + with open(csv_filename, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow([ + "gpu", "cpu", "method", + "grid_size", "timesteps", + "mean_time_sec", "std_dev_sec" + ]) + + print(f"\n==== Running benchmarks for {timesteps} timesteps ====") + for size in grid_sizes: + for method_name, entry_point in methods.items(): + run_times = [] + for i in range(repeats): + cmd = [ + "poetry", "run", entry_point, + "--size", str(size), + "--timesteps", str(timesteps) + ] + print(f" {method_name:<12} | {size:6}×{size:<6} | run {i+1}/{repeats}") + t0 = time.perf_counter() + subprocess.run(cmd, check=True) + t1 = time.perf_counter() + run_times.append(t1 - t0) + + mean_t = np.mean(run_times) + std_t = np.std(run_times) + writer.writerow([ + gpu_name, cpu_name, + method_name, + size, timesteps, + f"{mean_t:.6f}", + f"{std_t:.6f}" + ]) + + print(f"Saved CSV: {csv_filename}") + + # Generate the plot from that CSV + plot_timings(csv_filename) diff --git a/individual_modules/intro_to_GPUs/scripts/game_of_life_experiment_profiled.py b/individual_modules/intro_to_GPUs/scripts/game_of_life_experiment_profiled.py new file mode 100644 index 00000000..f7e76112 --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/game_of_life_experiment_profiled.py @@ -0,0 +1,196 @@ +""" +Benchmarking Suite for Profiled Game of Life Implementations + +This script runs profiled entry-points for three versions of Conway’s Game of Life: + - NumPy (CPU): game_of_life_cpu_profiled + - CuPy (GPU): game_of_life_gpu_profiled + - Naive (CPU): game_of_life_naive_profiled + +For each combination of grid size and timestep count, it: + 1. Executes each implementation multiple times via Poetry entry-points. + 2. Computes mean and standard deviation of run times. + 3. Writes results to a CSV file. + 4. Generates an error-bar plot (execution time vs. grid size) saved alongside the CSV. +""" + +# ───────────────────────────────────────────────────────────────────────────── +# Library imports +# ───────────────────────────────────────────────────────────────────────────── + +import subprocess +import time +import numpy as np +import os +import csv +import matplotlib.pyplot as plt + +# ───────────────────────────────────────────────────────────────────────────── +# Constants +# ───────────────────────────────────────────────────────────────────────────── + +# Ensure output directory exists +out_dir = "../output" +os.makedirs(out_dir, exist_ok=True) + +def get_gpu_name(): + """ + Query the GPU model name via `nvidia-smi`. + + Returns: + str: The first GPU’s name, or 'Unknown_GPU' if detection fails. + """ + try: + out = subprocess.check_output( + ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], + stderr=subprocess.DEVNULL + ).decode().strip().splitlines() + return out[0] + except Exception: + return "Unknown_GPU" + +def get_cpu_name(): + """ + Query the CPU model name via `lscpu` on Linux. + + Returns: + str: The CPU model string, or 'Unknown_CPU' if detection fails. 
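+
+    Example (output is illustrative only):
+        >>> get_cpu_name()
+        'AMD EPYC 9V84 96-Core Processor'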
+ """ + try: + out = subprocess.check_output(["lscpu"], stderr=subprocess.DEVNULL).decode().splitlines() + for line in out: + if line.startswith("Model name:"): + return line.split(":", 1)[1].strip() + except Exception: + pass + return "Unknown_CPU" + +def plot_timings(csv_filename): + """ + Read benchmark CSV and generate an error-bar plot of runtime vs grid size. + + The CSV must include columns: + gpu, cpu, method, grid_size, timesteps, mean_time_sec, std_dev_sec + + Args: + csv_filename (Path): Path to the CSV file of timing results. + """ + data = {} + timesteps = None + with open(csv_filename, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + method = row["method"] + size = int(row["grid_size"]) + mean_t = float(row["mean_time_sec"]) + std_t = float(row["std_dev_sec"]) + timesteps = row["timesteps"] + data.setdefault(method, {"sizes": [], "means": [], "stds": []}) + data[method]["sizes"].append(size) + data[method]["means"].append(mean_t) + data[method]["stds"].append(std_t) + + plt.figure(figsize=(10, 6)) + for method, vals in data.items(): + plt.errorbar( + vals["sizes"], + vals["means"], + yerr=vals["stds"], + label=method, + marker="o", + capsize=5 + ) + plt.title(f"Game of Life - Time vs Grid Size (Timesteps = {timesteps})") + plt.xlabel("Grid Size (N x N)") + plt.ylabel("Execution Time (seconds)") + plt.legend() + plt.grid(True) + plt.tight_layout() + + # Save PNG next to CSV + base = os.path.splitext(os.path.basename(csv_filename))[0] + out_png = os.path.join(os.path.dirname(csv_filename), f"{base}.png") + plt.savefig(out_png, dpi=300) + plt.close() + print(f"Saved plot: {out_png}") + + + +# ───────────────────────────────────────────────────────────────────────────── +# Main benchmarking loop +# ───────────────────────────────────────────────────────────────────────────── + +def run_experiment(): + """ + Main benchmarking routine. + + Detects hardware names, defines profiled entry-points, loops over grid sizes + and timesteps, records timing statistics to CSV, and generates plots. 
+ """ + + gpu_name = get_gpu_name().replace(" ", "_") + cpu_name = get_cpu_name().replace(" ", "_") + + # Updated to use the profiled entry-points defined in pyproject.toml: + methods = { + "NumPy (CPU)": "game_of_life_cpu_profiled", + "CuPy (GPU)": "game_of_life_gpu_profiled", + "Naive (CPU)": "game_of_life_naive_profiled", + } + + # Strip off both the "game_of_life_" prefix and the "_profiled" suffix + clean_names = [ + v.replace("game_of_life_", "").replace("_profiled", "") + for v in methods.values() + ] + method_ids = "_".join(sorted(clean_names)) + + #grid_sizes = [50, 100, 250, 500, 1000] + grid_sizes = [10, 25, 50, 100] + timesteps_list = [100] + repeats = 3 + + for timesteps in timesteps_list: + # Filename now includes GPU, CPU, all methods, and timesteps + csv_filename = os.path.join( + out_dir, + f"gol_timings_{gpu_name}_{cpu_name}_{method_ids}_ts{timesteps}.csv" + ) + + with open(csv_filename, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow([ + "gpu", "cpu", "method", + "grid_size", "timesteps", + "mean_time_sec", "std_dev_sec" + ]) + + print(f"\n==== Running benchmarks for {timesteps} timesteps ====") + for size in grid_sizes: + for method_name, entry_point in methods.items(): + run_times = [] + for i in range(repeats): + cmd = [ + "poetry", "run", entry_point, + "--size", str(size), + "--timesteps", str(timesteps) + ] + print(f" {method_name:<12} | {size:6}×{size:<6} | run {i+1}/{repeats}") + t0 = time.perf_counter() + subprocess.run(cmd, check=True) + t1 = time.perf_counter() + run_times.append(t1 - t0) + + mean_t = np.mean(run_times) + std_t = np.std(run_times) + writer.writerow([ + gpu_name, cpu_name, + method_name, + size, timesteps, + f"{mean_t:.6f}", + f"{std_t:.6f}" + ]) + + print(f"Saved CSV: {csv_filename}") + + # Generate the plot from that CSV + plot_timings(csv_filename) diff --git a/individual_modules/intro_to_GPUs/scripts/game_of_life_mem_opt.py b/individual_modules/intro_to_GPUs/scripts/game_of_life_mem_opt.py new file mode 100644 index 00000000..d167819f --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/game_of_life_mem_opt.py @@ -0,0 +1,197 @@ +# Game of Life simulation with detailed comments explaining each part of the code. + +import argparse # For parsing command-line arguments +import time # For measuring wall-clock time +import resource # For measuring CPU and memory usage +from pathlib import Path # For convenient file path handling + +import numpy as np # Numerical operations on arrays +import matplotlib.pyplot as plt # Plotting figures and images +import matplotlib.animation as animation # Creating animated GIFs +from tqdm import tqdm # Progress bar for loops + + +def life_step_int(grid: np.ndarray, neighbours: np.ndarray) -> np.ndarray: + """ + Perform a single step (generation) update for Conway's Game of Life. + + Parameters: + - grid (np.ndarray): 2D uint8 array of shape (N, N) with values 0 (dead) or 1 (alive). + - neighbours (np.ndarray): 2D uint8 array of same shape used for counting neighbours. + + Returns: + - np.ndarray: New 2D uint8 array representing next generation. 
+ """ + # Reset neighbour counts to zero for current iteration + neighbours.fill(0) + + # Iterate through the eight possible neighbour offsets + for dx, dy in ( + (-1, -1), (-1, 0), (-1, 1), + ( 0, -1), ( 0, 1), + ( 1, -1), ( 1, 0), ( 1, 1), + ): + # Roll the grid in x (rows) and y (columns), then accumulate counts + neighbours += np.roll(np.roll(grid, dx, axis=0), dy, axis=1) + + # Apply rules: + # - A dead cell with exactly 3 neighbours becomes alive (birth). + # - A live cell with 2 or 3 neighbours stays alive (survival). + # Convert boolean mask back to uint8. + return ((neighbours == 3) | ((grid == 1) & (neighbours == 2))).astype(np.uint8) + + +def simulate_and_animate( + N: int, + timesteps: int, + p_alive: float, + output_file: Path, + interval_ms: int = 200, + max_display: int = 1080, + dpi: int = 180 +) -> np.ndarray: + """ + Initialize the Game of Life grid randomly, run simulation, create a GIF, + and count alive occurrences per cell over time. + + Parameters: + - N: Grid width/height + - timesteps: Number of generations + - p_alive: Initial probability of a cell being alive + - output_file: Path to save the GIF + - interval_ms: Frame interval in milliseconds + - max_display: Maximum pixel dimension for display (downsample if larger) + - dpi: Output resolution + + Returns: + - counts: 2D uint32 array of shape (N, N) with number of times each cell was alive + """ + rng = np.random.default_rng() # Random number generator + + # Create initial grid: randomly set cells alive based on threshold + grid = np.empty((N, N), dtype=np.uint8) + threshold = int(p_alive * 1_000_000) + for i in range(N): + # Generate random integers and compare to threshold for alive/dead + random_row = rng.integers(0, 1_000_000, size=N, dtype=np.int32) + grid[i, :] = (random_row < threshold).astype(np.uint8) + + neighbours = np.zeros((N, N), dtype=np.uint8) # Buffer for neighbour counts + counts = np.zeros((N, N), dtype=np.uint32) # Alive counts accumulator + + # Determine downsampling step to fit within max_display + step = 1 if max_display is None or max_display >= N else max(1, N // max_display) + + # Set up Matplotlib figure without axes for clean frames + width_in = max_display / dpi + fig = plt.figure(figsize=(width_in, width_in), frameon=False) + fig.patch.set_visible(False) + ax = fig.add_axes([0, 0, 1, 1], frameon=False) + ax.set_axis_off() + + # Display initial frame (possibly downsampled) + small = grid[::step, ::step] + im = ax.imshow( + small, + cmap='binary', # black-white colormap + vmin=0, vmax=1, + interpolation='nearest' + ) + + # Configure GIF writer: frames per second = 1000 / interval_ms + writer = animation.PillowWriter(fps=1000 / interval_ms) + writer.setup(fig, str(output_file), dpi=dpi) + + # Main simulation loop with progress bar + for _ in tqdm(range(timesteps), desc="Simulating & writing GIF"): + counts += grid # Update alive counts + small = grid[::step, ::step] + im.set_data(small) # Update image data for frame + writer.grab_frame() # Write frame to GIF + grid = life_step_int(grid, neighbours) # Compute next generation + + writer.finish() # Finalize GIF file + plt.close(fig) # Close figure to free memory + + return counts + + +def plot_heatmap(counts: np.ndarray, output_file: Path = None): + """ + Generate a heatmap of cell alive counts and save or display it. + + Parameters: + - counts: 2D array of alive counts per cell + - output_file: Optional Path to save the heatmap image (PNG). Show interactively if None. 
+ """ + plt.figure(figsize=(4, 4)) + plt.imshow(counts, cmap='hot', interpolation='nearest') # Use 'hot' colormap + plt.colorbar(label='Alive Count') # Show scale of counts + plt.title('Cell Alive Counts Heatmap') + plt.axis('off') # Hide axes for clarity + + if output_file: + plt.savefig(output_file, dpi=150, bbox_inches='tight', pad_inches=0) + print(f"Heatmap saved to {output_file}") # Log save location + else: + plt.show() # Display on screen if no output path provided + + +def main(): + """ + Entry point for command-line execution. Parses arguments, runs simulation, + generates GIF and heatmap, and reports resource usage. + """ + # Set up argument parser with descriptions and defaults + parser = argparse.ArgumentParser(description="Game of Life with Heatmap (all-int, streaming HD GIF)") + parser.add_argument("--size", type=int, default=100, help="Grid dimension (N×N)") + parser.add_argument("--timesteps", type=int, default=50, help="Number of generations to simulate") + parser.add_argument("--p-alive", type=float, default=0.2, help="Initial alive probability (0–1)") + parser.add_argument("--output", type=Path, default=Path("game_of_life_hd.gif"), help="Output GIF filename") + parser.add_argument("--heatmap", type=Path, default=Path("alive_heatmap.png"), help="Output heatmap filename (PNG)") + parser.add_argument("--interval", type=int, default=200, help="Frame duration in ms") + parser.add_argument("--max-display", type=int, default=1080, help="Max side length for display (pixels)") + parser.add_argument("--dpi", type=int, default=180, help="Resolution (dots per inch) for outputs") + args = parser.parse_args() + + # Log parameters for user reference + print(f"[All-int Matplotlib HD + Heatmap] size={args.size}, timesteps={args.timesteps}, p_alive={args.p_alive}") + + # Record start times for benchmarking + start_wall = time.perf_counter() + rstart = resource.getrusage(resource.RUSAGE_SELF) + + # Run simulation and animation + counts = simulate_and_animate( + N=args.size, + timesteps=args.timesteps, + p_alive=args.p_alive, + output_file=args.output, + interval_ms=args.interval, + max_display=args.max_display, + dpi=args.dpi + ) + + # Plot and save the heatmap of alive counts + plot_heatmap(counts, args.heatmap) + + # Record end times for benchmarking + end_wall = time.perf_counter() + rend = resource.getrusage(resource.RUSAGE_SELF) + + # Compute and display resource usage + elapsed = end_wall - start_wall + cpu_user = rend.ru_utime - rstart.ru_utime + cpu_system = rend.ru_stime - rstart.ru_stime + peak_rss = rend.ru_maxrss / (1024 ** 2) # Convert KB to GB + + print(f"Saved HD GIF to {args.output}") + print(f"Saved heatmap to {args.heatmap}\n") + print("=== Resource usage ===") + print(f"Wall-clock time : {elapsed:.2f} s") + print(f"CPU time : user {cpu_user:.2f} s, system {cpu_system:.2f} s") + print(f"Peak memory (RSS): {peak_rss:.2f} GB") + + +if __name__ == "__main__": + main() diff --git a/individual_modules/intro_to_GPUs/scripts/game_of_life_profiled.py b/individual_modules/intro_to_GPUs/scripts/game_of_life_profiled.py new file mode 100644 index 00000000..71136374 --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/game_of_life_profiled.py @@ -0,0 +1,323 @@ +""" +Game of Life Profiling Suite + +This script augments the Game of Life implementations (NumPy, CuPy, Naive) +with profiling instrumentation, allowing you to collect CPU profiles via +cProfile/pstats and GPU NVTX ranges via CuPy’s profiler and time_range +decorator. 
+ +Entry points (via CLI): +- run_life_numpy(): NumPy backend, optional --profile-cpu +- run_life_cupy(): CuPy backend, optional --profile-cpu, --profile-gpu +- run_life_naive(): Naive Python backend, optional --profile-cpu +""" +# ------------------------------------------------------------------- +# Library imports +# ------------------------------------------------------------------- +import argparse +import cProfile +import pstats +from pathlib import Path +import numpy as np +import cupy as cp +from cupyx.profiler import time_range +from cupy.cuda import profiler +import matplotlib.pyplot as plt +import matplotlib.animation as animation + +# ───────────────────────────────────────────────────────────────────────────── +# 1) Core update functions (decorated with NVTX ranges via cupyx.profiler) +# ───────────────────────────────────────────────────────────────────────────── +@time_range() +def life_step_numpy(grid: np.ndarray) -> np.ndarray: + """ + Next-generation update using NumPy with NVTX profiling. + + Applies the 8-neighbor Conway’s Game of Life rules on a toroidal grid. + + Args: + grid (np.ndarray): 2D array of 0s and 1s. + + Returns: + np.ndarray: Updated 2D array for the next generation. + """ + neighbours = ( + np.roll(np.roll(grid, 1, axis=0), 1, axis=1) + + np.roll(np.roll(grid, 1, axis=0), -1, axis=1) + + np.roll(np.roll(grid, -1, axis=0), 1, axis=1) + + np.roll(np.roll(grid, -1, axis=0), -1, axis=1) + + np.roll(grid, 1, axis=0) + + np.roll(grid, -1, axis=0) + + np.roll(grid, 1, axis=1) + + np.roll(grid, -1, axis=1) + ) + return np.where((neighbours == 3) | ((grid == 1) & (neighbours == 2)), 1, 0) + +@time_range() +def life_step_gpu(grid: cp.ndarray) -> cp.ndarray: + """ + Next-generation update using CuPy with NVTX profiling. + + Identical logic to life_step_numpy, but leverages GPU arrays. + + Args: + grid (cp.ndarray): 2D CuPy array of 0s and 1s. + + Returns: + cp.ndarray: Updated 2D CuPy array. + """ + neighbours = ( + cp.roll(cp.roll(grid, 1, axis=0), 1, axis=1) + + cp.roll(cp.roll(grid, 1, axis=0), -1, axis=1) + + cp.roll(cp.roll(grid, -1, axis=0), 1, axis=1) + + cp.roll(cp.roll(grid, -1, axis=0), -1, axis=1) + + cp.roll(grid, 1, axis=0) + + cp.roll(grid, -1, axis=0) + + cp.roll(grid, 1, axis=1) + + cp.roll(grid, -1, axis=1) + ) + return cp.where((neighbours == 3) | ((grid == 1) & (neighbours == 2)), 1, 0) + +@time_range() +def life_step_naive(grid: np.ndarray) -> np.ndarray: + """ + Next-generation update with a naive Python loop and NVTX profiling. + + Iterates over each cell and its 8 neighbors on a toroidal grid. + + Args: + grid (np.ndarray): 2D array of 0s and 1s. + + Returns: + np.ndarray: Updated 2D array. + """ + N, M = grid.shape + new = np.zeros((N, M), dtype=int) + for i in range(N): + for j in range(M): + cnt = 0 + for di in (-1, 0, 1): + for dj in (-1, 0, 1): + if di == 0 and dj == 0: + continue + ni, nj = (i + di) % N, (j + dj) % M + cnt += grid[ni, nj] + if grid[i, j] == 1: + new[i, j] = 1 if cnt in (2, 3) else 0 + else: + new[i, j] = 1 if cnt == 3 else 0 + return new + +# ------------------------------------------------------------------- +# 2) Simulation loops (also bracketed with NVTX ranges) +# ------------------------------------------------------------------- +@time_range() +def simulate_life_numpy(N: int, timesteps: int, p_alive: float = 0.2, record_history: bool = False): + """ + Run Game of Life for NumPy with NVTX profiling of the loop. + + Args: + N (int): Grid dimension (N × N). + timesteps (int): Number of generations. 
+ p_alive (float): Initial probability of being alive. + record_history (bool): If True, collect each generation in a list. + + Returns: + list[np.ndarray] or None: Generation history if recorded, else None. + """ + grid = np.random.choice([0, 1], size=(N, N), p=[1 - p_alive, p_alive]) + history = [] if record_history else None + for _ in range(timesteps): + if record_history: + history.append(grid.copy()) + grid = life_step_numpy(grid) + return history + +@time_range() +def simulate_life_cupy(N: int, timesteps: int, p_alive: float = 0.2, record_history: bool = False): + """ + Run Game of Life on GPU with NVTX profiling of the loop. + + Args: + N (int): Grid size. + timesteps (int): Number of generations. + p_alive (float): Initial alive probability. + record_history (bool): If True, store each generation as NumPy array. + + Returns: + list[np.ndarray] or None: Recorded history if requested. + """ + grid_gpu = (cp.random.random((N, N)) < p_alive).astype(cp.int32) + history = [] if record_history else None + for _ in range(timesteps): + if record_history: + history.append(cp.asnumpy(grid_gpu)) + grid_gpu = life_step_gpu(grid_gpu) + return history + +@time_range() +def simulate_life_naive(N: int, timesteps: int, p_alive: float = 0.2, record_history: bool = False): + """ + Run naive Python Game of Life with NVTX profiling of the loop. + + Args: + N (int): Grid size. + timesteps (int): Number of generations. + p_alive (float): Initial alive probability. + record_history (bool): If True, collect each generation. + + Returns: + list[np.ndarray] or None: History of grids if recorded. + """ + grid = np.random.choice([0, 1], size=(N, N), p=[1 - p_alive, p_alive]) + history = [] if record_history else None + for _ in range(timesteps): + if record_history: + history.append(grid.copy()) + grid = life_step_naive(grid) + return history + +# ───────────────────────────────────────────────────────────────────────────── +# 3) Animation/export (unchanged) +# ───────────────────────────────────────────────────────────────────────────── +def animate_life(history, output_file: Path, interval: int = 200, dpi: int = 80): + """ + Create and save a GIF of the simulation history. + + Args: + history (list[np.ndarray]): Recorded generations to animate. + output_file (Path): Filepath for the output GIF. + interval (int): Delay between frames in milliseconds. + dpi (int): Dots per inch resolution for the saved GIF. + """ + + + fig, ax = plt.subplots(figsize=(6, 6)) + im = ax.imshow(history[0], cmap='binary') + ax.set_axis_off() + + def _update(idx): + im.set_data(history[idx]) + return [im] + + anim = animation.FuncAnimation( + fig, _update, + frames=len(history), + interval=interval, + blit=True + ) + anim.save(str(output_file), writer='pillow', dpi=dpi) + plt.close(fig) + +# ───────────────────────────────────────────────────────────────────────────── +# 4) CLI entry-points +# ───────────────────────────────────────────────────────────────────────────── +def run_life_numpy(): + """ + CLI for NumPy Game of Life with optional CPU profiling. 
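+
+    Example (assuming the matching console-script from pyproject.toml):
+        poetry run game_of_life_cpu_profiled --size 100 --timesteps 50 --profile-cpu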
+ + Flags: + --size: Grid dimension (N×N) + --timesteps: Number of generations + --save-gif: Save a GIF if size ≤ 100 + --profile-cpu: Enable cProfile CPU profiling + """ + p = argparse.ArgumentParser("Game of Life (NumPy)") + p.add_argument("--size", type=int, default=100) + p.add_argument("--timesteps", type=int, default=50) + p.add_argument("--save-gif", action="store_true") + p.add_argument("--profile-cpu", action="store_true") + args = p.parse_args() + + # CPU profiling + if args.profile_cpu: + pr = cProfile.Profile() + pr.enable() + + history = simulate_life_numpy(args.size, args.timesteps, record_history=args.save_gif) + + if args.save_gif and args.size <= 100: + out = Path("game_of_life_cpu.gif") + animate_life(history, out) + print(f"Saved CPU GIF to {out}") + + if args.profile_cpu: + pr.disable() + stats = pstats.Stats(pr).strip_dirs().sort_stats("cumtime") + stats.print_stats(20) + +def run_life_cupy(): + """ + CLI for CuPy Game of Life with optional CPU/GPU profiling. + + Flags: + --size: Grid dimension (N×N) + --timesteps: Number of generations + --save-gif: Save a GIF if size ≤ 100 + --profile-cpu: Enable cProfile CPU profiling + --profile-gpu: Enable NVIDIA CUPTI GPU profiling + """ + p = argparse.ArgumentParser("Game of Life (CuPy)") + p.add_argument("--size", type=int, default=100) + p.add_argument("--timesteps", type=int, default=50) + p.add_argument("--save-gif", action="store_true") + p.add_argument("--profile-cpu", action="store_true") + p.add_argument("--profile-gpu", action="store_true") + args = p.parse_args() + + # CPU profiling + if args.profile_cpu: + pr = cProfile.Profile() + pr.enable() + + # CUPTI profiling + if args.profile_gpu: + profiler.start() + + history = simulate_life_cupy(args.size, args.timesteps, record_history=args.save_gif) + + if args.save_gif and args.size <= 100: + out = Path("game_of_life_gpu.gif") + animate_life(history, out) + print(f"Saved GPU GIF to {out}") + + if args.profile_gpu: + profiler.stop() + + if args.profile_cpu: + pr.disable() + stats = pstats.Stats(pr).strip_dirs().sort_stats("cumtime") + stats.print_stats(20) + +def run_life_naive(): + """ + CLI for Naive Python Game of Life with optional CPU profiling. 
+ + Flags: + --size: Grid dimension (N×N) + --timesteps: Number of generations + --save-gif: Save a GIF if size ≤ 100 + --profile-cpu: Enable cProfile CPU profiling + """ + p = argparse.ArgumentParser("Game of Life (Naive)") + p.add_argument("--size", type=int, default=100) + p.add_argument("--timesteps", type=int, default=50) + p.add_argument("--save-gif", action="store_true") + p.add_argument("--profile-cpu", action="store_true") + args = p.parse_args() + + if args.profile_cpu: + pr = cProfile.Profile() + pr.enable() + + history = simulate_life_naive(args.size, args.timesteps, record_history=args.save_gif) + + if args.save_gif and args.size <= 100: + out = Path("game_of_life_naive.gif") + animate_life(history, out) + print(f"Saved Naive GIF to {out}") + + if args.profile_cpu: + pr.disable() + stats = pstats.Stats(pr).strip_dirs().sort_stats("cumtime") + stats.print_stats(20) diff --git a/individual_modules/intro_to_GPUs/scripts/support_scripts.py b/individual_modules/intro_to_GPUs/scripts/support_scripts.py new file mode 100644 index 00000000..ccb7b2a4 --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/support_scripts.py @@ -0,0 +1,452 @@ +""" +Temperature Diffusion Visualization and Data Utilities + +This module provides: +- GPU/CPU hardware queries +- Downloading ocean temperature subsets from Copernicus +- Summary statistics for NetCDF temperature files +- Static 2D surface plots +- Interactive 2D slice animations +- Interactive 3D cube animations + +Entry points (via CLI): +- summary() +- visualisation_static() +- visualisation_slice() +- visualisation_cube() +""" + +# ------------------------------------------------------------------- +# Library imports +# ------------------------------------------------------------------- + +from ast import Str +from matplotlib import animation +import xarray as xr +import numpy as np +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +from pathlib import Path +import plotly.graph_objects as go +import argparse +import subprocess +import os +import cupy + +# ------------------------------------------------------------------- +# Constants +# ------------------------------------------------------------------- +# Define the root directory +ROOT_DIR = Path(__file__).resolve().parent.parent +DATA_DIR = ROOT_DIR / "model_data" +OUTPUT_DIR = ROOT_DIR / "output" +ORIGINAL_DATA_FILE = "cmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i_thetao_13.83W-6.17E_46.83N-65.25N_0.49-5727.92m_2024-01-01-2024-01-02.nc" + +def cuda_check(): + """ + Print the number of available CUDA-capable GPU devices. + + Uses CuPy's runtime API to query device count. + """ + num_devices = cupy.cuda.runtime.getDeviceCount() + print(f"Number of CUDA devices: {num_devices}") + +def download_ocean_data(): + """ + Download a subset of ocean temperature data using the CopernicusMarine CLI. + + Changes into DATA_DIR, runs 'poetry run copernicusmarine subset' with + the specified bounding box and time range, then returns to DATA_DIR. + + On success, prints a success message; on failure, prints the error. 
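+
+    The equivalent shell invocation (bounds abbreviated; the full values are
+    in `command` below):
+        poetry run copernicusmarine subset --dataset-id cmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i --variable thetao ...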
+ """ + command = [ + "poetry", "run", "copernicusmarine", "subset", + "--dataset-id", "cmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i", + "--variable", "thetao", + "--start-datetime", "2024-01-01T00:00:00", + "--end-datetime", "2024-01-02T00:00:00", + "--minimum-longitude", "-13.903248235212025", + "--maximum-longitude", "6.186015157645116", + "--minimum-latitude", "46.82995633719309", + "--maximum-latitude", "65.31207865862164", + "--minimum-depth", "0.49402499198913574", + "--maximum-depth", "5727.9169921875" + ] + + os.chdir(DATA_DIR) + + try: + # Run the command and enable interactive mode by attaching directly to the terminal's stdin/stdout + result = subprocess.run(command, check=True) + print("Data downloaded successfully.") + except subprocess.CalledProcessError as e: + print("An error occurred:", e) + finally: + # Change back to the original directory + os.chdir(DATA_DIR) + +def calculate_summary(data_file): + """ + Load a NetCDF file and print summary statistics for 'thetao'. + + Args: + data_file (str): Filename (within DATA_DIR) of the .nc dataset. + + Prints: + - Array shape (time, depth, lat, lon) + - Mean, max, min, std of the temperature + - Full dataset dimension and coordinate details + """ + # Load the NetCDF file from the data directory + file_path = DATA_DIR / data_file + data = xr.open_dataset(file_path) + + # Assume the variable of interest is named 'temperature' (check the variable name in your file) + temperature = data['thetao'] + print("The dimensions of the data is: " + str(temperature.shape)) + # Print some summary statistics + print("Temperature Summary Statistics:") + print("Mean temperature:", temperature.mean().item()) + print("Max temperature:", temperature.max().item()) + print("Min temperature:", temperature.min().item()) + print("Standard deviation:", temperature.std().item()) + + # Optional: Print information about dimensions and coordinates + print("\nDataset Dimensions and Coordinates:") + print(data) + +def summary(): + """ + CLI entry point for calculate_summary. + + Usage: + python this_script.py --data-file filename.nc + """ + parser = argparse.ArgumentParser(description="Calculate Summary Statistics for a datafile") + parser.add_argument("--data-file", type=str, default=ORIGINAL_DATA_FILE, help="Data file for visualisation.") + + args = parser.parse_args() + + # Pass parsed arguments to visualisation_slice + calculate_summary(data_file=args.data_file) + +def visualisation_static(): + """ + CLI entry point for generating a static 2D surface plot of temperature. + + Produces a PNG of the surface (time=0, depth=0) saved in OUTPUT_DIR. 
+ """ + parser = argparse.ArgumentParser( + description="Generate a static 2D temperature slice" + ) + parser.add_argument( + "--data-file", "-f", + default=ORIGINAL_DATA_FILE, + help="Which .nc file to visualize" + ) + args = parser.parse_args() + + # Now call your existing logic, passing args.data_file + file_path = DATA_DIR / args.data_file + data = xr.open_dataset(file_path) + theta = data["thetao"].isel(time=0, depth=0) + lats = theta["latitude"].values + lons = theta["longitude"].values + + fig, ax = plt.subplots(figsize=(10, 6)) + c = ax.pcolormesh(lons, lats, theta.values, shading="auto", cmap="viridis") + ax.set_xlabel("Longitude") + ax.set_ylabel("Latitude") + ax.set_title(f"Surface Temperature: {args.data_file}") + fig.colorbar(c, ax=ax, label="°C") + + save_path = OUTPUT_DIR / f"{Path(args.data_file).stem}_static.png" + plt.savefig(save_path, dpi=300, bbox_inches="tight") + plt.close(fig) + print(f"Saved to {save_path}") + + +def plot_slice(target_depth=0, animation_speed=100, data_file=ORIGINAL_DATA_FILE): + """ + Create an interactive 2D heatmap animation over time at a given depth. + + Args: + target_depth (float): Desired depth in metres. + animation_speed (int): Frame duration in milliseconds. + data_file (str): NetCDF filename in DATA_DIR. + + Saves: + An HTML file with a Plotly animated heatmap in OUTPUT_DIR. + """ + # Load the NetCDF file + file_path = DATA_DIR / data_file + data = xr.open_dataset(file_path) + + # Assume the variable of interest is named 'temperature' (check the variable name in your file) + temperature = data['thetao'] + + # Find the closest depth level to the target depth + depths = temperature['depth'].values + closest_depth_idx = (abs(depths - target_depth)).argmin() + selected_depth = depths[closest_depth_idx] + temperature_subset = temperature.isel(depth=closest_depth_idx) + # Calculate the max temperature value across the subset for setting the color scale + # Calculate the max and min temperature values across the subset, ignoring NaNs + max_temp = np.nanmax(temperature_subset.values) + min_temp = np.nanmin(temperature_subset.values) + + print("MAX TEMP: " + str(max_temp)) + print("MIN TEMP: " + str(min_temp)) + + print("The depth being visualised is: " + str(selected_depth) + " as the target depth inputted was: " + str(target_depth)) + + # Prepare latitudes, longitudes, and time steps + latitudes = temperature['latitude'].values + longitudes = temperature['longitude'].values + time_steps = temperature['time'].values + + # Create the figure and initial heatmap for the first time step + fig = go.Figure(data=go.Heatmap( + z=temperature_subset.isel(time=0).values, + x=longitudes, + y=latitudes, + colorscale='Viridis', + colorbar=dict(title='Temperature (°C)'), + zmin=min_temp, # Set color range minimum + zmax=max_temp # Set color range maximum + + )) + + # Define frames for each time step + frames = [] + for t in range(len(time_steps)): + frames.append(go.Frame( + data=[go.Heatmap( + z=temperature_subset.isel(time=t).values, + x=longitudes, + y=latitudes, + colorscale='Viridis', + zmin=min_temp, # Set color range minimum + zmax=max_temp # Set color range maximum + )], + name=f"Time {t}" + )) + + fig.frames = frames + + # Add slider and play button to animate over all time steps + fig.update_layout( + title=f"Ocean Temperature Slice at Depth Level {selected_depth}m (Closest to {target_depth}m)", + xaxis_title="Longitude", + yaxis_title="Latitude", + updatemenus=[{ + "type": "buttons", + "showactive": False, + "buttons": [{ + "label": "Play", + 
"method": "animate", + "args": [None, {"frame": {"duration": animation_speed, "redraw": True}, "fromcurrent": True}] # Adjust duration here + }, { + "label": "Pause", + "method": "animate", + "args": [[None], {"frame": {"duration": 0, "redraw": False}, "mode": "immediate", "transition": {"duration": 0}}] + }] + }], + sliders=[{ + "active": 0, + "yanchor": "top", + "xanchor": "left", + "currentvalue": { + "font": {"size": 20}, + "prefix": "Time Step: ", + "visible": True, + "xanchor": "right" + }, + "steps": [{ + "label": f"{t}", + "method": "animate", + "args": [[f"Time {t}"], {"frame": {"duration": 300, "redraw": True}, "mode": "immediate", "transition": {"duration": 0}}] # Adjust duration here + } for t in range(len(time_steps))] + }] + ) + + # Save the interactive plot as an HTML file + if data_file==ORIGINAL_DATA_FILE: + save_path = OUTPUT_DIR / "original_temperature_2d_interactive.html" + else: + save_path = OUTPUT_DIR / "predicted_temperature_2d_interactive.html" + + fig.write_html(save_path) + +def visualisation_slice(): + """ + CLI entry point for the 2D interactive slice animation. + + Usage: + python this_script.py --target_depth 50 --animation_speed 300 --data-file file.nc + """ + parser = argparse.ArgumentParser( + description="Animate a 2D temperature slice over time" + ) + parser.add_argument("--target_depth", type=int, default=50, help="Depth in metres") + parser.add_argument("--animation_speed",type=int, default=300, help="ms per frame") + parser.add_argument( + "--data-file", "-f", + default=ORIGINAL_DATA_FILE, + help="Which .nc file to visualize" + ) + args = parser.parse_args() + + plot_slice( + target_depth=args.target_depth, + animation_speed=args.animation_speed, + data_file=args.data_file + ) + + + + +def plot_cube(num_depths=3, num_time_steps=3, data_file=ORIGINAL_DATA_FILE): + """ + Create an interactive 3D surface animation over time and depth. + + Args: + num_depths (int): Number of depth levels to include. + num_time_steps (int): Number of time frames to animate. + data_file (str): NetCDF filename in DATA_DIR. + + Saves: + An HTML file with a Plotly 3D animated cube in OUTPUT_DIR. 
+ """ + # Load the NetCDF file + file_path = DATA_DIR / data_file + data = xr.open_dataset(file_path) + + # Assume the variable of interest is named 'temperature' (check the variable name in your file) + temperature = data['thetao'] + + # Prepare data for 3D plotting (take the specified number of depth and time levels) + latitudes = temperature['latitude'].values + longitudes = temperature['longitude'].values + depths = temperature['depth'].values[:num_depths] + time_steps = temperature['time'].values[:num_time_steps] + temperature_values = temperature.isel(depth=slice(0, num_depths), time=slice(0, num_time_steps)).values + + # Calculate the max temperature value across the subset for setting the color scale + # Calculate the max and min temperature values across the subset, ignoring NaNs + max_temp = np.nanmax(temperature_values) + min_temp = np.nanmin(temperature_values) + + print("MAX TEMP: " + str(max_temp)) + print("MIN TEMP: " + str(min_temp)) + + # Create the figure and define frames for each time step + fig = go.Figure() + + # Create initial frame (first time step) + for i, depth in enumerate(depths): + fig.add_trace(go.Surface( + z=[[depth] * len(longitudes) for _ in range(len(latitudes))], + x=longitudes, + y=latitudes, + surfacecolor=temperature_values[0, i], + colorscale='Viridis', + colorbar=dict(title='Temperature (°C)'), + zmin=min_temp, # Set color range minimum + zmax=max_temp # Set color range maximum + )) + + # Define frames for each subsequent time step + frames = [] + for t, time in enumerate(time_steps): + frame_data = [] + for i, depth in enumerate(depths): + frame_data.append(go.Surface( + z=[[depth] * len(longitudes) for _ in range(len(latitudes))], + x=longitudes, + y=latitudes, + surfacecolor=temperature_values[t, i], + colorscale='Viridis', + zmin=min_temp, # Set color range minimum + zmax=max_temp # Set color range maximum + )) + frames.append(go.Frame(data=frame_data, name=f"Time {t}")) + + fig.frames = frames + + # Set up play button and slider + fig.update_layout( + title="3D Ocean Temperature Profile Over Time", + scene=dict( + xaxis_title="Longitude", + yaxis_title="Latitude", + zaxis_title="Depth (m)", + zaxis=dict(autorange="reversed") + ), + updatemenus=[{ + "type": "buttons", + "showactive": False, + "buttons": [{ + "label": "Play", + "method": "animate", + "args": [None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True}] + }, { + "label": "Pause", + "method": "animate", + "args": [[None], {"frame": {"duration": 0, "redraw": False}, "mode": "immediate", "transition": {"duration": 0}}] + }] + }], + sliders=[{ + "active": 0, + "yanchor": "top", + "xanchor": "left", + "currentvalue": { + "font": {"size": 20}, + "prefix": "Time Step: ", + "visible": True, + "xanchor": "right" + }, + "steps": [{ + "label": f"{t}", + "method": "animate", + "args": [[f"Time {t}"], {"frame": {"duration": 500, "redraw": True}, "mode": "immediate", "transition": {"duration": 0}}] + } for t in range(len(time_steps))] + }] + ) + + # Save the interactive plot as an HTML file + # Save the interactive plot as an HTML file + if data_file==ORIGINAL_DATA_FILE: + save_path = OUTPUT_DIR / "original_temperature_3d_interactive.html" + else: + save_path = OUTPUT_DIR / "predicted_temperature_3d_interactive.html" + fig.write_html(save_path) + +def visualisation_cube(): + """ + CLI entry point for the 3D cube animation. 
+ + Usage: + python this_script.py --num_depths 5 --num_time_steps 3 --data-file file.nc + """ + parser = argparse.ArgumentParser( + description="Animate a 3D temperature cube over time" + ) + parser.add_argument("--num_depths", type=int, default=5, help="Number of depth levels") + parser.add_argument("--num_time_steps", type=int, default=3, help="Number of time steps") + parser.add_argument( + "--data-file", "-f", + default=ORIGINAL_DATA_FILE, + help="Which .nc file to visualize" + ) + args = parser.parse_args() + + plot_cube( + num_depths=args.num_depths, + num_time_steps=args.num_time_steps, + data_file=args.data_file + ) + + + diff --git a/individual_modules/intro_to_GPUs/scripts/temperature_diffusion.py b/individual_modules/intro_to_GPUs/scripts/temperature_diffusion.py new file mode 100644 index 00000000..ad48035b --- /dev/null +++ b/individual_modules/intro_to_GPUs/scripts/temperature_diffusion.py @@ -0,0 +1,409 @@ +""" +3D Temperature Diffusion Simulation + +This module implements a simple 3D temperature diffusion model using three backends: +- NumPy (CPU) +- CuPy (GPU) +- Pure Python nested lists + +Functions: +- load_data: Load ocean temperature data from a NetCDF file. +- save_to_netcdf: Save computed temperature fields back to NetCDF. +- temperature_diffusion_numpy: Run diffusion with NumPy arrays. +- temperature_diffusion_cupy: Run diffusion on GPU via CuPy. +- temperature_diffusion_purepython: Naive pure-Python implementation. +- run_diffusion_*: Entry points for CLI execution. +""" + + +# ------------------------------------------------------------------- +# Library imports +# ------------------------------------------------------------------- + +import xarray as xr +from pathlib import Path +import argparse +import time +import xarray as xr +import numpy as np +import cupy as cp +from tqdm import tqdm +import math +import copy + +# ------------------------------------------------------------------- +# Constants +# ------------------------------------------------------------------- + +# Define the project root directory (two levels up from this file) +ROOT_DIR = Path(__file__).resolve().parent.parent + +# Directory containing NetCDF input data +DATA_DIR = ROOT_DIR / "model_data" + +# NetCDF input filename (globe, 6-hourly, thetao variable) +DATA_FILE = "cmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i_thetao_13.83W-6.17E_46.83N-65.25N_0.49-5727.92m_2024-01-01-2024-01-01.nc" + +# Filenames for each output implementation +OUTPUT_FILE_NUMPY = "predicted_temperatures_numpy.nc" +OUTPUT_FILE_CUPY = "predicted_temperatures_cupy.nc" +OUTPUT_FILE_PUREPYTHON = "predicted_temperatures_purepython.nc" + +# ------------------------------------------------------------------- +# Data I/O functions +# ------------------------------------------------------------------- + + +def load_data(): + """ + Load the ocean temperature dataset from a NetCDF file. + + The file path is constructed from DATA_DIR and DATA_FILE constants. + + Returns: + xr.Dataset: An xarray Dataset containing the loaded data. + """ + file_path = DATA_DIR / DATA_FILE + return xr.open_dataset(file_path) + + +def save_to_netcdf(data, new_temperature, output_file_path, num_timesteps): + """ + Save a modified temperature array to a NetCDF file. + + Adjusts the temperature array to match the specified number of timesteps + and constructs a new xarray Dataset with the original spatial coordinates. + + Args: + data (xr.Dataset): Original dataset containing depth, latitude, longitude. 
+ new_temperature (np.ndarray): Array of shape (time, depth, lat, lon) + containing the updated temperatures. + output_file_path (Path or str): Path to write the new NetCDF file. + num_timesteps (int): Number of timesteps to include in the output. + """ + # Adjust new_temperature to have only num_timesteps or fewer + new_temperature = new_temperature[:num_timesteps] # Only include the desired number of timesteps + + # Generate a new time coordinate as a sequence of numbers (1, 2, ..., num_timesteps) + time_coord = range(1, num_timesteps + 1) + + # Create a new dataset with original depth, latitude, and longitude coordinates + output_data = xr.Dataset( + {'thetao': (('time', 'depth', 'latitude', 'longitude'), new_temperature)}, + coords={ + 'time': time_coord, # Sequential time coordinate + 'depth': data['depth'].values, # Use original depth coordinates + 'latitude': data['latitude'].values, # Use original latitude coordinates + 'longitude': data['longitude'].values # Use original longitude coordinates + }, + ) + output_data.to_netcdf(output_file_path, engine='netcdf4') + +# ------------------------------------------------------------------- +# Diffusion model implementations +# ------------------------------------------------------------------- + +# Temperature diffusion function using NumPy with masking for boundaries +def temperature_diffusion_numpy(data, num_timesteps, diffusion_coeff=0.1): + """ + Simulate temperature diffusion over time using NumPy arrays. + + A simple 3D diffusion stencil is applied across the ocean grid, + with NaN regions (land) masked out. + + Args: + data (xr.Dataset): Input dataset containing the 'thetao' variable. + num_timesteps (int): Number of timesteps to simulate. + diffusion_coeff (float, optional): Diffusion coefficient. Defaults to 0.1. + + Side effects: + - Saves the resulting temperature field to a NetCDF file (OUTPUT_FILE_NUMPY). + - Prints timing statistics to stdout. 
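+
+    The update applied to each interior ocean cell is, in effect,
+
+        T_new = T + D * (sum_of_valid_neighbours - 6 * T) / max(n_valid, 1)
+
+    where D is diffusion_coeff and n_valid counts the non-NaN neighbours, so
+    land cells neither receive nor contribute heat.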
+ """ + temperature = np.asarray(data['thetao'].values) # Convert to a NumPy array + temperature = temperature[0:1, :, :, :] + temperature = np.tile(temperature, (num_timesteps, 1, 1, 1)) + + mask = ~np.isnan(temperature) # Mask: True for ocean points, False for NaN regions (land) + new_temperature = np.copy(temperature) + timestep_durations = [] + + # Extract the first timestamp and create a new time coordinate for the predicted timesteps + original_time = data['time'].values + time_coord = np.array([original_time[0] + np.timedelta64(i, 'D') for i in range(num_timesteps)]) + + # Run the diffusion model + for t in tqdm(range(num_timesteps), desc="NumPy Diffusion Progress"): + start_time = time.time() + + # Apply diffusion calculation with mask-based boundary handling + temp_copy = temperature[1:-1, 1:-1, 1:-1] # Core section without boundaries + neighbor_sum = np.zeros_like(temp_copy) + neighbor_count = np.zeros_like(temp_copy) + + # Sum available neighbors and count them only for valid ocean points + if mask[:-2, 1:-1, 1:-1].any(): # Front + neighbor_sum += np.where(mask[:-2, 1:-1, 1:-1], temperature[:-2, 1:-1, 1:-1], 0) + neighbor_count += mask[:-2, 1:-1, 1:-1] + + if mask[2:, 1:-1, 1:-1].any(): # Back + neighbor_sum += np.where(mask[2:, 1:-1, 1:-1], temperature[2:, 1:-1, 1:-1], 0) + neighbor_count += mask[2:, 1:-1, 1:-1] + + if mask[1:-1, :-2, 1:-1].any(): # Left + neighbor_sum += np.where(mask[1:-1, :-2, 1:-1], temperature[1:-1, :-2, 1:-1], 0) + neighbor_count += mask[1:-1, :-2, 1:-1] + + if mask[1:-1, 2:, 1:-1].any(): # Right + neighbor_sum += np.where(mask[1:-1, 2:, 1:-1], temperature[1:-1, 2:, 1:-1], 0) + neighbor_count += mask[1:-1, 2:, 1:-1] + + if mask[1:-1, 1:-1, :-2].any(): # Bottom + neighbor_sum += np.where(mask[1:-1, 1:-1, :-2], temperature[1:-1, 1:-1, :-2], 0) + neighbor_count += mask[1:-1, 1:-1, :-2] + + if mask[1:-1, 1:-1, 2:].any(): # Top + neighbor_sum += np.where(mask[1:-1, 1:-1, 2:], temperature[1:-1, 1:-1, 2:], 0) + neighbor_count += mask[1:-1, 1:-1, 2:] + + # Apply diffusion to valid points only, avoiding NaN regions + new_temperature[1:-1, 1:-1, 1:-1] = np.where( + mask[1:-1, 1:-1, 1:-1], + temp_copy + diffusion_coeff * (neighbor_sum - 6 * temp_copy) / np.maximum(neighbor_count, 1), + temperature[1:-1, 1:-1, 1:-1] + ) + + timestep_durations.append(time.time() - start_time) + temperature = new_temperature + + # Convert to final temperature and save + final_temperature = new_temperature + + # Save to NetCDF (assuming you have a `save_to_netcdf` function defined) + save_to_netcdf(data, final_temperature, DATA_DIR / OUTPUT_FILE_NUMPY, num_timesteps) + + avg_time_per_timestep = sum(timestep_durations) / num_timesteps + print(f"NumPy model completed in {sum(timestep_durations):.4f} seconds. " + f"Average time per timestep: {avg_time_per_timestep:.4f} seconds.") + + + +def run_diffusion_numpy(): + """ + Entry point for running the NumPy diffusion model via command line. + + Parses --num_timesteps and invokes temperature_diffusion_numpy(). 
+ """ + parser = argparse.ArgumentParser(description="Run 3D Diffusion Model with Numpy") + parser.add_argument("--num_timesteps", type=int, default=300, help="Number of Timesteps to run for") + + args = parser.parse_args() + + # Pass parsed arguments to visualisation_slice + temperature_diffusion_numpy(data=load_data(), num_timesteps=args.num_timesteps) + +# # Temperature diffusion function using CuPy with masking for boundaries +def temperature_diffusion_cupy(data, num_timesteps, diffusion_coeff=0.5): + """ + Simulate temperature diffusion over time using CuPy (GPU acceleration). + + Similar stencil as the NumPy version, but runs on the GPU. + Data is converted back to NumPy before saving. + + Args: + data (xr.Dataset): Input dataset containing 'thetao'. + num_timesteps (int): Number of timesteps to simulate. + diffusion_coeff (float, optional): Diffusion coefficient. Defaults to 0.5. + + Side effects: + - Saves the resulting field to a NetCDF file (OUTPUT_FILE_CUPY). + - Prints timing statistics. + """ + temperature = cp.asarray(data['thetao'].values) # Convert to a CuPy array + temperature = temperature[0:1, :, :, :] + temperature = np.tile(temperature, (num_timesteps, 1, 1, 1)) + + mask = ~cp.isnan(temperature) # Mask: True for ocean points, False for NaN regions (land) + new_temperature = cp.copy(temperature) + timestep_durations = [] + + # Extract the first timestamp and create a new time coordinate for the predicted timesteps + original_time = data['time'].values + time_coord = np.array([original_time[0] + np.timedelta64(i, 'D') for i in range(num_timesteps)]) + + # Run the diffusion model + for t in tqdm(range(num_timesteps), desc="CuPy Diffusion Progress"): + start_time = time.time() + + # Apply diffusion calculation with mask-based boundary handling + temp_copy = temperature[1:-1, 1:-1, 1:-1] # Core section without boundaries + neighbor_sum = cp.zeros_like(temp_copy) + neighbor_count = cp.zeros_like(temp_copy) + + # Sum available neighbors and count them only for valid ocean points + if mask[:-2, 1:-1, 1:-1].any(): # Front + neighbor_sum += cp.where(mask[:-2, 1:-1, 1:-1], temperature[:-2, 1:-1, 1:-1], 0) + neighbor_count += mask[:-2, 1:-1, 1:-1] + + if mask[2:, 1:-1, 1:-1].any(): # Back + neighbor_sum += cp.where(mask[2:, 1:-1, 1:-1], temperature[2:, 1:-1, 1:-1], 0) + neighbor_count += mask[2:, 1:-1, 1:-1] + + if mask[1:-1, :-2, 1:-1].any(): # Left + neighbor_sum += cp.where(mask[1:-1, :-2, 1:-1], temperature[1:-1, :-2, 1:-1], 0) + neighbor_count += mask[1:-1, :-2, 1:-1] + + if mask[1:-1, 2:, 1:-1].any(): # Right + neighbor_sum += cp.where(mask[1:-1, 2:, 1:-1], temperature[1:-1, 2:, 1:-1], 0) + neighbor_count += mask[1:-1, 2:, 1:-1] + + if mask[1:-1, 1:-1, :-2].any(): # Bottom + neighbor_sum += cp.where(mask[1:-1, 1:-1, :-2], temperature[1:-1, 1:-1, :-2], 0) + neighbor_count += mask[1:-1, 1:-1, :-2] + + if mask[1:-1, 1:-1, 2:].any(): # Top + neighbor_sum += cp.where(mask[1:-1, 1:-1, 2:], temperature[1:-1, 1:-1, 2:], 0) + neighbor_count += mask[1:-1, 1:-1, 2:] + + # Apply diffusion to valid points only, avoiding NaN regions + new_temperature[1:-1, 1:-1, 1:-1] = cp.where( + mask[1:-1, 1:-1, 1:-1], + temp_copy + diffusion_coeff * (neighbor_sum - 6 * temp_copy) / cp.maximum(neighbor_count, 1), + temperature[1:-1, 1:-1, 1:-1] + ) + + cp.cuda.Stream.null.synchronize() # Wait for the GPU computation to complete + timestep_durations.append(time.time() - start_time) + temperature = new_temperature + + + # Convert back to NumPy and save + final_temperature = cp.asnumpy(new_temperature) + 
+    save_to_netcdf(data, final_temperature, DATA_DIR / OUTPUT_FILE_CUPY, num_timesteps)
+
+    avg_time_per_timestep = sum(timestep_durations) / num_timesteps
+    print(f"CuPy model completed in {sum(timestep_durations):.4f} seconds. "
+          f"Average time per timestep: {avg_time_per_timestep:.4f} seconds.")
+
+
+def run_diffusion_cupy():
+    """
+    Entry point for running the CuPy diffusion model via command line.
+
+    Parses --num_timesteps and calls temperature_diffusion_cupy().
+    """
+    parser = argparse.ArgumentParser(description="Run 3D Diffusion Model with CuPy")
+    parser.add_argument("--num_timesteps", type=int, default=300, help="Number of Timesteps to run for")
+
+    args = parser.parse_args()
+
+    # Pass parsed arguments to the CuPy diffusion model
+    temperature_diffusion_cupy(data=load_data(), num_timesteps=args.num_timesteps)
+
+
+def temperature_diffusion_purepython(data, num_timesteps, diffusion_coeff=0.1):
+    """
+    Simulate temperature diffusion using pure Python nested loops and lists.
+
+    This implementation is the simplest (and slowest), building Python lists
+    and manually iterating over every grid cell.
+
+    Args:
+        data (xr.Dataset): Dataset containing 'thetao'.
+        num_timesteps (int): Number of timesteps to simulate.
+        diffusion_coeff (float, optional): Diffusion coefficient. Defaults to 0.1.
+
+    Side effects:
+        - Saves the resulting field to a NetCDF file (OUTPUT_FILE_PUREPYTHON).
+        - Prints timing statistics.
+    """
+    # Pull raw array and get dims
+    raw = data['thetao'].values  # shape (time, depth, lat, lon)
+    depth, lat, lon = raw.shape[1], raw.shape[2], raw.shape[3]
+
+    # Initial snapshot at t=0, as Python lists
+    initial = raw[0]  # shape (depth, lat, lon)
+    temperature = []
+    for _ in range(num_timesteps):
+        # deep copy initial for each timestep
+        plane = []
+        for d in range(depth):
+            plane.append([ [ float(initial[d][i][j]) for j in range(lon) ]
+                           for i in range(lat) ])
+        temperature.append(plane)
+
+    # Summary stats (very slow!)
+    # NOTE: flat and flat_sorted are computed but not used below.
+    flat = []
+    for t in range(num_timesteps):
+        for d in range(depth):
+            for i in range(lat):
+                for j in range(lon):
+                    v = temperature[t][d][i][j]
+                    if not math.isnan(v):
+                        flat.append(v)
+    flat_sorted = sorted(flat)
+
+    # Precompute mask of valid ocean points
+    mask = [[[ [ not math.isnan(temperature[t][d][i][j])
+                 for j in range(lon) ]
+               for i in range(lat) ]
+             for d in range(depth) ]
+           for t in range(num_timesteps)]
+
+    # Prepare output buffer
+    new_temperature = copy.deepcopy(temperature)
+    timestep_durations = []
+
+    # Diffusion loop
+    for t in range(num_timesteps):
+        start = time.time()
+        for d in range(1, depth-1):
+            for i in range(1, lat-1):
+                for j in range(1, lon-1):
+                    if mask[t][d][i][j]:
+                        center = temperature[t][d][i][j]
+                        total = 0.0
+                        count = 0
+                        # 6 neighbors
+                        for dd, ii, jj in (
+                            (d-1,i,j), (d+1,i,j),
+                            (d,i-1,j), (d,i+1,j),
+                            (d,i,j-1), (d,i,j+1)
+                        ):
+                            if mask[t][dd][ii][jj]:
+                                total += temperature[t][dd][ii][jj]
+                                count += 1
+                        # apply diffusion
+                        if count > 0:
+                            delta = diffusion_coeff * (total - count*center) / count
+                        else:
+                            delta = 0.0
+                        new_temperature[t][d][i][j] = center + delta
+        timestep_durations.append(time.time() - start)
+        # copy new → temperature for next step
+        temperature[t] = copy.deepcopy(new_temperature[t])
+
+    # Convert to NumPy array for saving
+    final = np.array(new_temperature)
+
+    # Save result
+    save_to_netcdf(data, final, DATA_DIR / OUTPUT_FILE_PUREPYTHON, num_timesteps)
+
+    total = sum(timestep_durations)
+    avg = total / num_timesteps
+
+    print(f"PurePython model completed in {total:.4f} seconds. "
+          f"Average time per timestep: {avg:.4f} seconds.")
+
+def run_diffusion_purepython():
+    """
+    Entry point for running the pure Python diffusion model via command line.
+
+    Parses --num_timesteps and invokes temperature_diffusion_purepython().
+    """
+    parser = argparse.ArgumentParser(description="Run 3D Diffusion Model in pure Python")
+    parser.add_argument("--num_timesteps", type=int, default=300, help="Number of Timesteps")
+    args = parser.parse_args()
+    temperature_diffusion_purepython(data=load_data(), num_timesteps=args.num_timesteps)
diff --git a/individual_modules/intro_to_GPUs/scripts/temperature_diffusion_experiment.py b/individual_modules/intro_to_GPUs/scripts/temperature_diffusion_experiment.py
new file mode 100644
index 00000000..ef3dc016
--- /dev/null
+++ b/individual_modules/intro_to_GPUs/scripts/temperature_diffusion_experiment.py
@@ -0,0 +1,167 @@
+"""
+Temperature Diffusion Experiment Runner
+
+This script benchmarks three implementations of a 3D temperature diffusion model:
+- Pure Python nested loops
+- NumPy (CPU)
+- CuPy (GPU)
+
+It records timing results over multiple timesteps and repeats, saves them to CSV,
+and generates an error-bar plot comparing performance across methods.
+
+Functions:
+- get_gpu_name: Query the GPU model via nvidia-smi.
+- get_cpu_name: Read the CPU model string from /proc/cpuinfo.
+- plot_timings: Read the CSV of results and produce an error-bar plot.
+"""
+
+# -------------------------------------------------------------------
+# Library imports
+# -------------------------------------------------------------------
+
+import subprocess
+import time
+import numpy as np
+import os
+import csv
+import matplotlib.pyplot as plt
+
+# -------------------------------------------------------------------
+# Constants
+# -------------------------------------------------------------------
+
+# ensure directory for outputs
+OUT_DIR = "../output"
+os.makedirs(OUT_DIR, exist_ok=True)
+
+def get_gpu_name():
+    """
+    Query the system GPU name using `nvidia-smi`.
+
+    Returns:
+        str: The name of the first GPU, or "Unknown GPU" if the command fails.
+    """
+    try:
+        out = subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
+            stderr=subprocess.DEVNULL
+        ).decode().strip().splitlines()
+        return out[0]
+    except Exception:
+        return "Unknown GPU"
+
+def get_cpu_name():
+    """
+    Read the CPU model name from /proc/cpuinfo (Linux).
+
+    Returns:
+        str: The CPU model string, or "Unknown CPU" if reading fails.
+    """
+    try:
+        with open("/proc/cpuinfo") as f:
+            for line in f:
+                if line.lower().startswith("model name"):
+                    return line.split(":", 1)[1].strip()
+    except Exception:
+        pass
+    return "Unknown CPU"
+
+def plot_timings(csv_filename):
+    """
+    Read benchmarking CSV and create an error-bar plot of runtimes.
+
+    The CSV must have columns:
+        method, num_timesteps, mean_time_sec, std_dev_sec
+
+    X-axis: number of timesteps
+    Y-axis: mean runtime in seconds, with error bars showing standard deviation
+
+    Args:
+        csv_filename (str): Path to the CSV file of timing results.
+    """
+
+    data = {}
+    timesteps = []
+    with open(csv_filename, newline="") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            method = row["method"]
+            ts = int(row["num_timesteps"])
+            mean_t = float(row["mean_time_sec"])
+            std_t = float(row["std_dev_sec"])
+            data.setdefault(method, []).append((ts, mean_t, std_t))
+            if ts not in timesteps:
+                timesteps.append(ts)
+
+    timesteps = sorted(timesteps)
+    plt.figure()
+    for method, vals in data.items():
+        # sort vals by ts
+        vals = sorted(vals, key=lambda x: x[0])
+        xs = [v[0] for v in vals]
+        ys = [v[1] for v in vals]
+        errs = [v[2] for v in vals]
+        plt.errorbar(xs, ys, yerr=errs, label=method, marker='o')
+
+    plt.xlabel("Number of timesteps")
+    plt.ylabel("Time (s)")
+    plt.title("Temperature Diffusion Performance")
+    plt.legend()
+    png = os.path.join(OUT_DIR, "temperature_diffusion_timings.png")
+    plt.savefig(png)
+    plt.close()
+    print(f"Plot saved to {png}")
+
+if __name__ == "__main__":
+    """
+    Main execution block:
+
+    1. Detect hardware (GPU, CPU).
+    2. Define which diffusion scripts to run.
+    3. Loop over methods, timesteps, and repeats:
+       - Run each script via subprocess.
+       - Measure elapsed time.
+       - Record results to CSV.
+    4. After all runs, generate an error-bar plot.
+    """
+
+    gpu_name = get_gpu_name()
+    cpu_name = get_cpu_name()
+
+    # map display names → console_script names
+    methods = {
+        "Pure Python": "diffusion_pure_python",
+        "NumPy (CPU)": "diffusion_numpy",
+        "CuPy (GPU)": "diffusion_cupy",
+    }
+
+    # timesteps_list = [10, 25, 50, 100]
+    timesteps_list = [3]  # small value for a quick smoke test; restore the list above for a full benchmark
+    repeats = 3
+
+    csv_file = os.path.join(OUT_DIR, "temperature_diffusion_timings.csv")
+    with open(csv_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow([
+            "gpu_name", "cpu_name",
+            "method", "num_timesteps",
+            "mean_time_sec", "std_dev_sec"
+        ])
+
+        for method, script in methods.items():
+            for ts in timesteps_list:
+                times = []
+                for _ in range(repeats):
+                    start = time.time()
+                    subprocess.check_call([script, f"--num_timesteps={ts}"])
+                    times.append(time.time() - start)
+                writer.writerow([
+                    gpu_name, cpu_name,
+                    method, ts,
+                    f"{np.mean(times):.6f}",
+                    f"{np.std(times):.6f}"
+                ])
+                print(f"{method}, ts={ts}: mean {np.mean(times):.4f}s, std {np.std(times):.4f}s")
+
+    print(f"CSV timings saved to {csv_file}")
+    plot_timings(csv_file)
diff --git a/individual_modules/intro_to_GPUs/setup.md b/individual_modules/intro_to_GPUs/setup.md
new file mode 100644
index 00000000..fd7848f6
--- /dev/null
+++ b/individual_modules/intro_to_GPUs/setup.md
@@ -0,0 +1,316 @@
+# Context and Setup Guide
+
+## Course Philosophy
+
+Throughout this course, two guiding principles have been kept in mind:
+
+1) **Complete Pipeline Approach**. Getting to the point where you can simply run:
+
+   ```python
+   import cupy as cp
+   ```
+
+   is often far harder than writing the GPU code itself.
+
+2) **Focus on practical GPU use**. This is a course about **using GPUs**, not about the low-level details of **programming GPUs**.
+
+By the end, you’ll have everything in place to use GPU acceleration immediately: we’ll walk you through installing the tools, configuring your environment, and running your first CUDA-powered code.
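+
+As a taste of what that enables, the snippet below is a minimal sketch (assuming CuPy is installed and a CUDA-capable GPU is visible) of the NumPy-to-CuPy workflow this course builds towards:
+
+```python
+import numpy as np
+import cupy as cp
+
+x_cpu = np.random.rand(4096, 4096).astype(np.float32)
+
+x_gpu = cp.asarray(x_cpu)   # host -> device copy
+y_gpu = x_gpu @ x_gpu       # matrix multiply executes on the GPU
+y_cpu = cp.asnumpy(y_gpu)   # device -> host copy of the result
+```
+
+Nothing about the array syntax changes; the work of this guide is making sure the `import cupy as cp` line succeeds on your platform.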
+
+## University of Exeter ISCA HPC Installation Instructions
+
+If you have not used an HPC platform before, you may benefit from first going through the material in "Helpful Auxiliary Software" on this page, which walks you through connecting to an HPC platform; you can then continue with these set-up instructions.
+
+## Clone the Repo
+
+To engage with all of the content within this GPU Training course, you will need to clone the repo, which can be done with
+
+``` bash
+cd /lustre/projects/Research_Project-RSATeam # The project directory the RSA Team will use for the course.
+mkdir $USER # Create a directory for you within the project space.
+cd $USER
+git clone https://github.com/UniExeterRSE/GPU_Training.git
+cd GPU_Training
+```
+
+### Two Ways to Run the Course Code
+
+#### Method 1: Interactive GPU Session
+
+If you prefer to work interactively, follow these steps:
+
+Request an interactive session:
+
+```bash
+srun \
+  --partition=gpu \
+  -A Research_Project-RSATeam \
+  --time=12:00:00 \
+  --nodes=1 \
+  --ntasks=1 \
+  --gres=gpu:1 \
+  --cpus-per-task=4 \
+  --pty /bin/bash
+```
+
+Load required modules:
+
+```bash
+module load nvidia-cuda/12.1.1
+module load Python/3.11.3
+```
+
+Install the Python requirements:
+
+```bash
+poetry install
+```
+
+Once your environment is ready, you can invoke any of the project’s entry points via Poetry. For example:
+
+```bash
+poetry run cuda_check
+```
+
+#### Method 2: Batch Submission via Slurm
+
+All of the key Slurm submission scripts live in the
+`exeter_isca_slurm_submission_scripts/` directory. You can submit a job with the following, replacing `<script_name>` with the script you want to run:
+
+```bash
+cd exeter_isca_slurm_submission_scripts
+sbatch <script_name>.slurm
+```
+
+## General Installation Instructions
+
+The following provides the steps that are required to install the necessary compilers and packages to engage with the material in this course.
+
+```{important}
+Please keep in mind that nearly all of the commands used in this section will be covered in detail within the course itself. They are included here to make sure you have all of the necessary resources (e.g. a GPU and relevant compilers) to complete the whole course. **The intention is for you to run these commands and confirm the output based on the contents of this page, not to completely understand each step you are taking.** If you do get stuck and are unsure of how to proceed, please reach out to the authors, and we can help you debug.
+
+If you are self-studying, then please read up to the section "Project: Conway's Game of Life - CPU vs GPU Implementation" to understand more about the commands that are being used. If you are taking the workshop, then these commands are here to make sure that you are able to run code on the designated platform, to save time in the workshop and to identify any permission errors when accessing the needed resources.
+```
+
+## Spack - Installing system-level requirements
+
+Within this course, [Spack](https://spack.io/) is being used to manage system-level requirements, such as drivers. The reason for this is that installing system-level requirements normally needs privileged permissions (e.g. access to `sudo`), which users do not have on the HPC platforms where GPUs are typically available; `spack` allows us to install such packages without privileged access. There are also a range of other benefits to the use of `spack` that will be discussed in this course.
+
+First, you will need to clone the `spack` repo in your user home directory at a recent stable version (the extra config and depth flags are suggested in spack's readme):
+
+``` bash
+git clone -c feature.manyFiles=true --depth=2 -b v0.23.1 https://github.com/spack/spack.git
+```
+
+You will then need to activate `spack` with:
+
+```bash
+source spack/share/spack/setup-env.sh
+```
+
+```{note}
+You can check that `spack` has been successfully installed by running `spack --version`, which should return the version of spack that you have available.
+```
+
+You will then need to create a spack environment. The following creates a `spack` environment named "gpu_course":
+
+```bash
+spack env create gpu_course
+```
+
+which can then be activated with
+
+```bash
+spack env activate -p gpu_course
+```
+
+In this course, spack is being used to install system-level requirements, and so the required version of Python and the required CUDA package are added via spack with the following two commands.
+
+```bash
+spack add python@3.12
+spack add cuda
+```
+
+```{note}
+These commands only declare that you intend to install the packages; at this point, `spack` is still waiting for further packages to be added to the environment specification. We can check what the current specification is (e.g. package list, dependencies, compilers to be used etc.) with `spack spec`.
+```
+
+Finally, we are able to install all of the packages into our `spack` environment with
+
+```bash
+spack install
+```
+
+```{note}
+On an HPC platform, we would want to put the above spack commands into a shell script and run it via the scheduler, such as `sbatch` for ISCA/Archer2. The `install` can take on the order of hours for the above specifications.
+```
+
+```{note}
+The `.spack` directory is a hidden folder in your home directory that stores user-level configuration data, caches, and environment settings for Spack. It helps Spack remember things like what packages you have installed, which mirrors you have configured, and any custom settings you have applied. Sometimes, these configuration files or caches can become outdated or inconsistent, especially if you have been experimenting with different environments, modifying package recipes, or changing `spack` versions. When a "weird" or hard-to-troubleshoot error occurs, one way to rule out bad configuration or cache data is to remove the `.spack` directory. By doing so, you essentially give Spack a clean slate: it will recreate the directory and its necessary files the next time it runs, which often resolves mysterious issues stemming from old or corrupted data. Note that removing only the non-hidden `spack` directory will likely not give you a clean slate: the configuration and cache data from your previous experiments will still be present in `.spack`.
+```
+
+## Poetry - Installing user-level requirements
+
+Within this course, [Poetry](https://python-poetry.org/) is used to manage the user-level requirements.
+
+The following command will install poetry:
+
+```bash
+curl -sSL https://install.python-poetry.org | python3 -
+```
+
+```{note}
+Poetry can be uninstalled with `curl -sSL https://install.python-poetry.org | python3 - --uninstall`.
+```
+
+```{note}
+`poetry install` needs to be run from within the training course repo.
If you haven't yet, clone the repo with `git clone https://github.com/UniExeterRSE/GPU_Training.git` and then navigate to its root with `cd GPU_Training`.
+```
+
+All of the user-level requirements can be installed via Poetry with the command:
+
+```bash
+poetry install
+```
+
+`````{admonition} IMPORTANT: If running locally...
+:class: important
+You can check that the installation has been successful by running `poetry run cuda_check`, which should return the number of CUDA devices that are currently available, such as `Number of CUDA devices: 1`. If you want to find out more information about the device that is connected, you can run a command such as `nvidia-smi` for an NVIDIA GPU.
+`````
+
+`````{admonition} IMPORTANT: If running on an HPC...
+:class: important
+If you are working on an HPC cluster via SLURM, submit the `cuda_check.slurm` script instead of running the commands directly. The script contains the same commands as above (e.g. `poetry run cuda_check` and `nvidia-smi`) and stores their output and errors in the files `out.log` and `err.log` respectively. This can be done with the command `sbatch slurm_submission_scripts/cuda_check.slurm`.
+`````
+
+## Data
+
+### Data Download
+
+```{note}
+For the RSA Team Day, the data files are available on the shared ISCA file-system.
+```
+
+To download the dataset, follow these steps:
+
+- **Create a Copernicus Marine Account**:
+  - You will need an account to access the data. Register here: [Register for Account](https://data.marine.copernicus.eu/register?redirect=%2Fproduct%2FGLOBAL_ANALYSISFORECAST_PHY_001_024%2Fdownload%3Fdataset%3Dcmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i_202406).
+
+- **Run the CLI Command to Download the Dataset**:
+  - Use the following command to download the subset of data:
+
+    ```bash
+    poetry run download_data
+    ```
+
+  - This command will prompt you to enter your username and password. Once authenticated, the data file will download to the data directory. Please note that the download may take some time, as the file is approximately 270 MB.
+
+### Data Description
+
+The dataset used during the course is a 3-dimensional ocean temperature dataset. It is described in detail on the [Copernicus Marine Data Service](https://data.marine.copernicus.eu/product/GLOBAL_ANALYSISFORECAST_PHY_001_024/description).
+
+**Filename**: `cmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i_1730799065517.nc`
+
+**Description**:
+This dataset was downloaded from the **Global Ocean Physics Analysis and Forecast** service. It provides data for global ocean physics, focusing on seawater potential temperature.
+
+- **Product Identifier**: `GLOBAL_ANALYSISFORECAST_PHY_001_024`
+- **Product Name**: Global Ocean Physics Analysis and Forecast
+- **Dataset Identifier**: `cmems_mod_glo_phy-thetao_anfc_0.083deg_PT6H-i`
+
+**Variable Visualized**:
+
+- **Sea Water Potential Temperature (thetao)**: Measured in degrees Celsius [°C].
+
+**Geographical Area of Interest**:
+
+- **Region**: Around the United Kingdom
+- **Coordinates**:
+  - **Northern Latitude**: 65.312
+  - **Eastern Longitude**: 6.1860
+  - **Southern Latitude**: 46.829
+  - **Western Longitude**: -13.90
+
+**Depth Range**:
+
+- **Minimum Depth**: 0.49 meters
+- **Maximum Depth**: 5727.9 meters
+
+**File Size**:
+
+- **267.5 MB**
+
+## Helpful Auxiliary Software
+
+This section details a number of useful pieces of software that make the development of GPU code easier.
Notably, most of these sit within Visual Studio Code, chosen because they are what the author used when first starting in GPU development.
+
+### Using Visual Studio Code (VSCode)
+
+Visual Studio Code (VSCode) can be installed from [its website](https://code.visualstudio.com/).
+
+#### Remote-SSH
+
+This guide walks you through setting up and using **Remote-SSH** in Visual Studio Code (VSCode) to connect to a remote machine.
+
+##### Install the Remote - SSH Extension
+
+Install from [Remote-SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) or via the following steps:
+
+1. Open **VSCode**.
+2. Go to the **Extensions** view by clicking on the square icon in the sidebar or pressing `Ctrl+Shift+X` (Windows/Linux) or `Cmd+Shift+X` (Mac).
+3. Search for "**Remote - SSH**" and install the extension from Microsoft.
+
+##### Configure SSH on Your Local Machine
+
+Ensure you can SSH into the remote machine from your terminal. If SSH is not already configured:
+
+1. **Generate SSH Keys** (if not already done):
+   - Open a terminal on your local machine.
+   - Run the command `ssh-keygen` and follow the prompts to generate a key pair. This will create keys in `~/.ssh/` by default.
+
+2. **Copy Your Public Key to the Remote Machine**:
+   - Run the command `ssh-copy-id user@hostname`, replacing `user` and `hostname` with your remote machine’s username and IP address or hostname.
+   - Enter your password when prompted. This step ensures you can log in without repeatedly typing your password.
+
+##### Add SSH Configuration in VSCode
+
+1. Open **VSCode**.
+2. Press `Ctrl+Shift+P` (Windows/Linux) or `Cmd+Shift+P` (Mac) to open the command palette.
+3. Type and select **Remote-SSH: Open Configuration File**.
+4. Choose the SSH configuration file (usually located at `~/.ssh/config`).
+
+5. Add a new SSH configuration to the file, specifying the remote machine’s details. Here’s an example configuration:
+
+   ```ssh-config
+   Host my-remote-machine
+       HostName <remote_ip_or_hostname>
+       User <your_username>
+       IdentityFile ~/.ssh/id_rsa  # Path to your SSH private key
+       Port 22  # Default SSH port; change if needed
+   ```
+
+##### Connecting to remote from within VSCode
+
+You should now be able to connect to the remote machine from within VSCode by pressing `Ctrl+Shift+P` (Windows/Linux) or `Cmd+Shift+P` (Mac) and then selecting `Remote-SSH: Connect to host...`, which should present a list containing the name of the machine you gave in the config file, in the above case `my-remote-machine`. You will then be asked for a passphrase if you protected your SSH key. Once connected, a new VSCode window will be created, and you should have a fully functioning IDE on the remote machine.
+
+#### Live Server
+
+As this course produces 3D outputs, some supporting code will generate interactive HTML dashboards to make exploring the output data easier. The VSCode Live Server extension makes it easy to view these dashboards in your local web browser.
+
+##### Install the Live Server Extension
+
+Install from [Live Server](https://marketplace.visualstudio.com/items?itemName=ritwickdey.LiveServer) or via the following steps:
+
+1. Open **VSCode**.
+2. Go to the **Extensions** view by clicking on the square icon in the sidebar or pressing `Ctrl+Shift+X` (Windows/Linux) or `Cmd+Shift+X` (Mac).
+3. Search for "**Live Server**" and install the extension by **Ritwick Dey**.
+
+---
+
+##### Start the Live Server
+
+1. **Right-click** on the HTML file in the editor and select **Open with Live Server**.
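+
+If you are working outside VSCode, a minimal alternative for simply viewing the dashboards (without Live Server's auto-reload) is Python's built-in web server, assuming Python 3 is available; the `output` directory here is a hypothetical location for the generated files, so adjust the path to your setup:
+
+```bash
+cd output                      # directory containing the generated .html files
+python3 -m http.server 8000    # then open http://localhost:8000 in your browser
+```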
+
+##### View Changes in Real-Time
+
+- As you edit and save your HTML, CSS, or JavaScript files, the browser will automatically refresh to display your changes.
+- This eliminates the need to manually refresh the browser, speeding up development.
\ No newline at end of file
diff --git a/individual_modules/intro_to_GPUs/slides/GPU_Training_Day_Slides.pptx b/individual_modules/intro_to_GPUs/slides/GPU_Training_Day_Slides.pptx
new file mode 100644
index 00000000..04ac1c85
Binary files /dev/null and b/individual_modules/intro_to_GPUs/slides/GPU_Training_Day_Slides.pptx differ
diff --git a/individual_modules/intro_to_GPUs/slurm.ipynb b/individual_modules/intro_to_GPUs/slurm.ipynb
new file mode 100644
index 00000000..065e4b78
--- /dev/null
+++ b/individual_modules/intro_to_GPUs/slurm.ipynb
@@ -0,0 +1,1698 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "61cf56c7-b6e3-4824-a0c9-bedca391557f",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "source": [
+    "# Slurm Job Scheduling \n",
+    "\n",
+    "## Learning Objectives\n",
+    "\n",
+    "By the end of this section, learners will be able to:\n",
+    "\n",
+    "- Explain the purpose of job scheduling in HPC systems and why Slurm is used. \n",
+    "- Create, edit, and submit Slurm job scripts that specify resources and commands for execution. \n",
+    "- Use key Slurm commands to manage jobs, including `sbatch`, `squeue`, `scancel`, and `srun`. \n",
+    "- Interpret job status outputs from `squeue` and understand job states such as running, pending, and cancelled. \n",
+    "- Configure resource requests in job scripts, including CPUs, GPUs, memory, partitions, and time limits. \n",
+    "- Cancel jobs individually or in bulk, and verify job cancellations using Slurm commands. \n",
+    "- Implement job dependencies with `--dependency` to control execution order in multi-step workflows. \n",
+    "- Differentiate between batch jobs (`sbatch`) and interactive jobs (`srun`) and identify when each is appropriate. \n",
+    "- Apply practical strategies for working efficiently with Slurm, including setting output files, realistic resource requests, and monitoring running jobs. \n",
+    "- Collect and interpret system information from Slurm job outputs to better understand available compute and GPU resources. \n",
+    "\n",
+    "\n",
+    "## Overview\n",
+    "\n",
+    "When working on a multi-user HPC system (like a cluster or supercomputer), you typically **don't run big GPU jobs directly on the login node**. Instead, you use a scheduler to queue up your work. **Slurm** (Simple Linux Utility for Resource Management) is a widely used job scheduler for HPC environments. It allocates compute resources (CPUs, GPUs, memory) to user jobs and queues them until those resources become available, taking other users' jobs into account. \n",
+    "\n",
+    "This section will introduce **Slurm** for beginners, covering how to submit and manage jobs using commands like `sbatch`, `squeue`, and `scancel`, and how to set up job dependencies so jobs run in a certain order. \n",
+    "\n",
+    "## What is a Slurm job?\n",
+    "A *job* in Slurm is a unit of work, usually defined by a **job script**. The job script is a bash script (or another shell) that specifies resources needed (via special `#SBATCH` directives) and the commands to execute. When you submit this script, Slurm will find an available compute node (or nodes) that meets your requirements (CPU cores, GPUs, time, etc.) and run the script there, not on the login machine.\n",
+    "\n",
+    "## Creating a Simple Job Script \n",
+    "Here's a basic example of a Slurm job script, which we could call `myjob.slurm`: \n",
+    "\n",
+    "```bash \n",
+    "#!/bin/bash\n",
+    "#SBATCH --job-name=testjob       # Name of the job\n",
+    "#SBATCH --output=job_%j.out      # Output file (%j will be replaced with job ID)\n",
+    "#SBATCH --error=job_%j.err       # Error file (a separate file for stderr, optional)\n",
+    "#SBATCH --time=0-00:05           # Wall time (DD-HH:MM) here 5 minutes\n",
+    "#SBATCH --partition=gpu          # Partition/queue name, e.g., 'gpu' or as configured\n",
+    "#SBATCH --gres=gpu:1             # Request 1 GPU (generic resource)\n",
+    "#SBATCH --cpus-per-task=4        # Request 4 CPU cores\n",
+    "#SBATCH --mem=16G                # Request 16 GB of RAM\n",
+    "\n",
+    "echo \"Hello from job $SLURM_JOB_ID running on $SLURM_NODELIST\"\n",
+    "sleep 60   # Simulate work by sleeping for 60 seconds\n",
+    "```\n",
+    "\n",
+    "The script uses `#SBATCH` lines to request resources: \n",
+    "\n",
+    "- A job name for easy identification\n",
+    "- Output/error file names\n",
+    "- A time limit of 5 minutes\n",
+    "- The partition (queue) to run on (often clusters have a special partition for GPU jobs)\n",
+    "- 1 GPU (`--gres=gpu:1` means one GPU)\n",
+    "- 4 CPU cores and 16 GB of memory\n",
+    "\n",
+    "The body of the script prints a message and sleeps 60 seconds (as a placeholder for real work). `$SLURM_JOB_ID` and `$SLURM_NODELIST` are environment variables Slurm sets for your job. \n",
+    "\n",
+    "## Submitting Jobs with `sbatch` \n",
+    "To submit the above job script to Slurm, use the `sbatch` command: \n",
+    "\n",
+    "```bash \n",
+    "$ sbatch myjob.slurm\n",
+    "Submitted batch job 123456\n",
+    "```\n",
+    "\n",
+    "Slurm will respond with a job ID (in this example, `123456`). At this point, your job is in the queue. It might start immediately if resources are free or wait in line if the cluster is busy. \n",
+    "\n",
+    "Key points about `sbatch`: \n",
+    "\n",
+    "- It queues the job, and then you **return to your shell prompt**. The job runs asynchronously in the background on the cluster. \n",
+    "- The `sbatch` command is non-interactive; it just submits the job. You won't see the job output on your screen live; it will go to the files defined by `--output`/`--error` in the script. \n",
+    "\n",
+    "## Checking Job Status with `squeue` \n",
+    "\n",
+    "Once a job is submitted, you'll want to check its status (queued, running, finished). Use `squeue` to view the job queue: \n",
+    "\n",
+    "```bash \n",
+    "$ squeue -u your_username\n",
+    "```\n",
+    "\n",
+    "This shows all your jobs - using `squeue` alone shows everyone's jobs, but that can be long on busy systems. Typical `squeue` output columns include: \n",
+    "- **JOBID**: the job ID (e.g. 123456). \n",
+    "- **PARTITION**: which partition/queue it is in. \n",
+    "- **NAME**: the job name. \n",
+    "- **USER**: who submitted it. \n",
+    "- **ST**: state (R = running, PD = pending/waiting, CG = completing, CD = completed, CA = cancelled, F = failed, etc.)\n",
+    "- **TIME**: how long it's been running (or pending). \n",
+    "- **NODES**: number of nodes allocated. \n",
+    "- **NODELIST(REASON)**: which node(s) it's on, or the reason it's pending (e.g. resource, priority, etc.) \n",
+    "\n",
+    "For example, if your job is waiting, you might see `PD` and `REASON` might be \"Resources\", meaning it is waiting for resources to free up. \n",
+    "\n",
+    "```{note}\n",
+    "You can also filter by job ID (`squeue -j 123456`) or other criteria. Slurm has many options, but checking by username is simplest to see all your jobs. \n",
+    "```\n",
+    "\n",
+    "## Cancelling a Job with `scancel`\n",
+    "If you need to stop a job (maybe you realised there's a bug or it's taking too long), you can cancel it: \n",
+    "\n",
+    "```bash \n",
+    "$ scancel 123456\n",
+    "```\n",
+    "\n",
+    "Replace `123456` with the job ID you want to cancel. This will terminate the job if it's running or remove it from the queue if it hasn't started yet. After cancelling, use `squeue` to verify it's gone or see it marked as CA (cancelled). \n",
+    "\n",
+    "You can cancel all your jobs with `scancel -u your_username`. You can also cancel an entire job array or a range of jobs if needed. \n",
+    "\n",
+    "## Job Dependencies: Ordering Jobs \n",
+    "Slurm allows you to chain jobs so that one doesn't start until the other is complete (and, optionally, only if it succeeds). This is done with the `--dependency` option of `sbatch`. \n",
+    "\n",
+    "**Use case**: Suppose you have two jobs, and *Job2* should run only after *Job1* finishes successfully. It could be the case that Job1 generates data that Job2 will process. You can submit Job1 normally, then submit Job2 with a dependency on Job1. \n",
+    "\n",
+    "Submit the first job and note its job ID: \n",
+    "\n",
+    "```bash \n",
+    "$ sbatch job1.slurm\n",
+    "Submitted batch job 111111\n",
+    "```\n",
+    "\n",
+    "Submit the second job with dependency: \n",
+    "\n",
+    "```bash \n",
+    "$ sbatch --dependency=afterok:111111 job2.slurm\n",
+    "Submitted batch job 111112\n",
+    "```\n",
+    "\n",
+    "The `--dependency=afterok:111111` means \"run this job only *after* job 111111 has finished OK (exit code 0)\". In other words, job2 will wait until job1 is done *successfully*. If job1 fails (non-zero exit), job2 will not run (it will be cancelled due to dependency failure). \n",
+    "\n",
+    "There are a number of other dependency types, including: \n",
+    "- `afterany:`: run after job finishes regardless of success or failure.\n",
+    "- `after:`: run after job starts (not commonly used; `afterok` is more typical).\n",
+    "- `singleton`: ensure only one job with the same name/user runs at a time (to avoid duplicates).\n",
+    "\n",
+    "You can chain multiple job IDs like `--dependency=afterok:ID1:ID2` (job runs after both ID1 and ID2 succeed).\n",
+    "\n",
+    "Dependencies are powerful for building task pipelines. For instance, you could have a preprocessing job, then a training job, then a post-processing job, each submitted with appropriate `--dependency` so they execute in sequence without manual intervention. \n",
+    "\n",
+    "### Interactive Jobs (srun)\n",
+    "While `sbatch` is for batch submission, Slurm also has `srun` for running tasks interactively (especially useful for debugging or running short tests on compute nodes). For example, `srun --pty bash` will give you an interactive shell on a compute node. This course focuses on batch jobs for GPU tasks, but keep in mind that `srun` exists for interactive use. \n",
+    "\n",
+    "## Practical Tips for Slurm \n",
+    "- **Default Behaviour**: If you don't specify an output file, Slurm, by default, writes output to a file like `slurm-<jobid>.out`. It's better to set `--output` to something meaningful. \n",
+    "- **Resource Requests**: Always request resources (time, memory, GPUs) realistically. If you ask for too little, your job might be killed for exceeding memory or time. If you ask for too much, you could wait longer in the queue. \n",
+    "- **Partition/Queues**: Clusters often have multiple partitions (e.g. 
`GPU`, `CPU`, `long`, `debug`). Make sure to use an appropriate one, as each has limits (a debug queue might only allow short 30-minute jobs, for example). \n", + "- **Monitoring**: You can monitor usage on a running job with commands like `sstat` (for stats) or by logging into the node (if interactive) and using tools like `nvidia-smi` to see GPU usage. \n", + "\n", + "## Summary: Key Slurm Commands\n", + "- `sbatch