diff --git a/.gitignore b/.gitignore index bd874548..c64b17c2 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ docs/ # translation temp files po/*~ +# Python cache directory +__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md index ce3be6ed..f4149440 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,6 @@ options(repos = c( # Setup install from github install.packages("devtools") library(devtools) -# Install Uni of Shef Varnish theme -install_github("RSE-Sheffield/uos-varnish") # Install remaining official carpentries packages install.packages(c("sandpaper", "tinkr", "pegboard")) ``` diff --git a/config.yaml b/config.yaml index f96026a1..fbcae659 100644 --- a/config.yaml +++ b/config.yaml @@ -65,6 +65,7 @@ episodes: - profiling-lines.md - profiling-conclusion.md - optimisation-introduction.md +- optimisation-using-python.md - optimisation-data-structures-algorithms.md - long-break1.md - optimisation-minimise-python.md @@ -75,7 +76,7 @@ episodes: # Information for Learners learners: - setup.md -- registration.md +# - registration.md - acknowledgements.md - ppp.md - reference.md @@ -91,8 +92,5 @@ profiles: # This space below is where custom yaml items (e.g. pinning # sandpaper and varnish versions) should live -varnish: RSE-Sheffield/uos-varnish@main -url: 'https://rse.shef.ac.uk/pando-python' - -analytics: | - \ No newline at end of file +# varnish: RSE-Sheffield/uos-varnish@main +# url: 'https://rse.shef.ac.uk/pando-python' diff --git a/episodes/fig/cint_vs_pyint.png b/episodes/fig/cint_vs_pyint.png new file mode 100644 index 00000000..0b976a94 Binary files /dev/null and b/episodes/fig/cint_vs_pyint.png differ diff --git a/episodes/fig/numpyarray_vs_pylist.png b/episodes/fig/numpyarray_vs_pylist.png new file mode 100644 index 00000000..192bfe78 Binary files /dev/null and b/episodes/fig/numpyarray_vs_pylist.png differ diff --git a/episodes/files/snippets/builtin_str_split.py b/episodes/files/snippets/builtin_str_split.py new file mode 100644 index 00000000..04e176a3 --- /dev/null +++ b/episodes/files/snippets/builtin_str_split.py @@ -0,0 +1,42 @@ +import random +from timeit import timeit + +N = 10_000 # Number of elements in the list + +# Ensure every list is the same +random.seed(12) +f = [f" {i:0>6d} {random.random():8.4f} " for i in range(N)] + +def manualSplit(): # bad habits + data = {} + for line in f: + first_char = line.find("0") + end_time = line.find(" ", first_char, -1) + + energy_found = line.find(".", end_time, -1) + begin_energy = line.rfind(" ", end_time, energy_found) + end_energy = line.find(" ", energy_found, -1) + if end_energy == -1: + end_energy = len(line) + + time = line[first_char:end_time] + energy = line[begin_energy:end_energy] + + data[time] = energy + return data + +def builtinSplit(): + data = {} + for line in f: + time, energy = line.split() + data[time] = energy + return data + +def dictComprehension(): + return {time: energy for time, energy in (line.split() for line in f)} + + +repeats = 1000 +print(f"manualSplit: {timeit(manualSplit, globals=globals(), number=repeats):.3f}ms") +print(f"builtinSplit: {timeit(builtinSplit, globals=globals(), number=repeats):.3f}ms") +print(f"dictComprehension: {timeit(dictComprehension, globals=globals(), number=repeats):.3f}ms") diff --git a/episodes/files/snippets/builtin_sum.py b/episodes/files/snippets/builtin_sum.py new file mode 100644 index 00000000..c4d4b6d5 --- /dev/null +++ b/episodes/files/snippets/builtin_sum.py @@ -0,0 +1,30 @@ +import random +from timeit import timeit + +N = 
100_000 # Number of elements in the list + +# Ensure every list is the same +random.seed(12) +my_data = [random.random() for i in range(N)] + + +def manualSumC(): # bad habits + n = 0 + for i in range(len(my_data)): + n += my_data[i] + return n + +def manualSumPy(): # slightly improved + n = 0 + for evt_count in my_data: + n += evt_count + return n + +def builtinSum(): # fastest and most readable + return sum(my_data) + + +repeats = 1000 +print(f"manualSumC: {timeit(manualSumC, globals=globals(), number=repeats):.3f}ms") +print(f"manualSumPy: {timeit(manualSumPy, globals=globals(), number=repeats):.3f}ms") +print(f"builtinSum: {timeit(builtinSum, globals=globals(), number=repeats):.3f}ms") diff --git a/episodes/files/snippets/numpy_array_resize.py b/episodes/files/snippets/numpy_array_resize.py new file mode 100644 index 00000000..99b75ce9 --- /dev/null +++ b/episodes/files/snippets/numpy_array_resize.py @@ -0,0 +1,25 @@ +from timeit import timeit +import numpy + +N = 100_000 # Number of elements in list/array + +def list_append(): + ls = [] + for i in range(N): + ls.append(i) + +def array_resize(): + ar = numpy.zeros(1) + for i in range(1, N): + ar.resize(i+1) + ar[i] = i + +def array_preallocate(): + ar = numpy.zeros(N) + for i in range(1, N): + ar[i] = i + +repeats = 1000 +print(f"list_append: {timeit(list_append, number=repeats):.2f}ms") +print(f"array_resize: {timeit(array_resize, number=repeats):.2f}ms") +print(f"array_preallocate: {timeit(array_preallocate, number=repeats):.2f}ms") diff --git a/episodes/files/snippets/parallel-download.py b/episodes/files/snippets/parallel-download.py new file mode 100644 index 00000000..a4788e7d --- /dev/null +++ b/episodes/files/snippets/parallel-download.py @@ -0,0 +1,44 @@ +from concurrent.futures import ThreadPoolExecutor, as_completed +from timeit import timeit +import requests # install with `pip install requests` + + +def download_file(url, filename): + response = requests.get(url) + with open(filename, 'wb') as f: + f.write(response.content) + return filename + +downloaded_files = [] + +def sequentialDownload(): + for mass in range(10, 20): + url = f"https://github.com/SNEWS2/snewpy-models-ccsn/raw/refs/heads/main/models/Warren_2020/stir_a1.23/stir_multimessenger_a1.23_m{mass}.0.h5" + f = download_file(url, f"seq_{mass}.h5") + downloaded_files.append(f) + +def parallelDownload(): + pool = ThreadPoolExecutor(max_workers=6) + jobs = [] + for mass in range(10, 20): + url = f"https://github.com/SNEWS2/snewpy-models-ccsn/raw/refs/heads/main/models/Warren_2020/stir_a1.23/stir_multimessenger_a1.23_m{mass}.0.h5" + local_filename = f"par_{mass}.h5" + jobs.append(pool.submit(download_file, url, local_filename)) + + for result in as_completed(jobs): + if result.exception() is None: + # handle return values of the parallelised function + f = result.result() + downloaded_files.append(f) + else: + # handle errors + print(result.exception()) + + pool.shutdown(wait=False) + + +print(f"sequentialDownload: {timeit(sequentialDownload, globals=globals(), number=1):.3f} s") +print(downloaded_files) +downloaded_files = [] +print(f"parallelDownload: {timeit(parallelDownload, globals=globals(), number=1):.3f} s") +print(downloaded_files) \ No newline at end of file diff --git a/episodes/files/snippets/read-write.py b/episodes/files/snippets/read-write.py new file mode 100644 index 00000000..20fefafc --- /dev/null +++ b/episodes/files/snippets/read-write.py @@ -0,0 +1,50 @@ +import os, time + +# Generate 10MB +data_len = 10000000 +data = os.urandom(data_len) 
+file_ct = 1000 +file_len = int(data_len/file_ct) + +# Write one large file +start = time.perf_counter() +large_file = open("large.bin", "wb") +large_file.write(data) +large_file.close () +large_write_s = time.perf_counter() - start + +# Write multiple small files +start = time.perf_counter() +for i in range(file_ct): + small_file = open(f"small_{i}.bin", "wb") + small_file.write(data[file_len*i:file_len*(i+1)]) + small_file.close() +small_write_s = time.perf_counter() - start + +# Read back the large file +start = time.perf_counter() +large_file = open("large.bin", "rb") +t = large_file.read(data_len) +large_file.close () +large_read_s = time.perf_counter() - start + +# Read back the small files +start = time.perf_counter() +for i in range(file_ct): + small_file = open(f"small_{i}.bin", "rb") + t = small_file.read(file_len) + small_file.close() +small_read_s = time.perf_counter() - start + +# Print Summary +print(f"{1:5d}x{data_len/1000000}MB Write: {large_write_s:.5f} seconds") +print(f"{file_ct:5d}x{file_len/1000}KB Write: {small_write_s:.5f} seconds") +print(f"{1:5d}x{data_len/1000000}MB Read: {large_read_s:.5f} seconds") +print(f"{file_ct:5d}x{file_len/1000}KB Read: {small_read_s:.5f} seconds") +print(f"{file_ct:5d}x{file_len/1000}KB Write was {small_write_s/large_write_s:.1f} slower than 1x{data_len/1000000}MB Write") +print(f"{file_ct:5d}x{file_len/1000}KB Read was {small_read_s/large_read_s:.1f} slower than 1x{data_len/1000000}MB Read") + +# Cleanup +os.remove("large.bin") +for i in range(file_ct): + os.remove(f"small_{i}.bin") \ No newline at end of file diff --git a/episodes/files/snippets/test_demonstration.py b/episodes/files/snippets/test_demonstration.py new file mode 100644 index 00000000..12f57c54 --- /dev/null +++ b/episodes/files/snippets/test_demonstration.py @@ -0,0 +1,8 @@ +# A simple function to be tested, this could instead be an imported package +def squared(x): + return x**2 + +# A simple test case +def test_example(): + assert squared(5) == 24 + diff --git a/episodes/optimisation-conclusion.md b/episodes/optimisation-conclusion.md index d9ed9f4e..8251f2c2 100644 --- a/episodes/optimisation-conclusion.md +++ b/episodes/optimisation-conclusion.md @@ -25,6 +25,7 @@ Hopefully with the information from this course you will be in a better position This course's website can be used as a reference manual when profiling your own code. + ::::::::::::::::::::::::::::::::::::: keypoints diff --git a/episodes/optimisation-data-structures-algorithms.md b/episodes/optimisation-data-structures-algorithms.md index 2d85c9b1..4c55e7dc 100644 --- a/episodes/optimisation-data-structures-algorithms.md +++ b/episodes/optimisation-data-structures-algorithms.md @@ -65,7 +65,8 @@ CPython for example uses [`newsize + (newsize >> 3) + 6`](https://github.com/pyt This has two implications: -* If you are creating large static lists, they will use upto 12.5% excess memory. +* If you are creating large static lists, they may use up to 12.5% excess memory. + * If you are growing a list with `append()`, there will be large amounts of redundant allocations and copies as the list grows. ### List Comprehension @@ -74,7 +75,7 @@ If creating a list via `append()` is undesirable, the natural alternative is to List comprehension can be twice as fast at building lists than using `append()`. This is primarily because list-comprehension allows Python to offload much of the computation into faster C code. 
-General python loops in contrast can be used for much more, so they remain in Python bytecode during computation which has additional overheads. +General Python loops in contrast can be used for much more, so they remain in Python bytecode during computation which has additional overheads. This can be demonstrated with the below benchmark: @@ -112,7 +113,7 @@ Results will vary between Python versions, hardware and list lengths. But in thi ## Tuples -In contrast, Python's tuples are immutable static arrays (similar to strings), their elements cannot be modified and they cannot be resized. +In contrast to lists, Python's tuples are immutable static arrays (similar to strings): Their elements cannot be modified and they cannot be resized. Their potential use-cases are greatly reduced due to these two limitations, they are only suitable for groups of immutable properties. @@ -152,6 +153,22 @@ Since Python 3.6, the items within a dictionary will iterate in the order that t Python's dictionaries are implemented as hashing data structures. +Explaining how these work will get a bit technical, so let’s start with an analogy: + +A Python list is like having a single long bookshelf. When you buy a new book (append a new element to the list), you place it at the far end of the shelf, right after all the previous books. + +A hashing data structure is more like a bookcase, with several shelves, one for each genre: There’s a shelf for detective fiction, a shelf for romance novels, a shelf for sci-fi stories, and so on. When you buy a new romance novel, you place it on the appropriate shelf, next to the previous books in that genre. +And as you get more books, at some point you’ll move to a larger bookcase with more shelves (and thus more fine-grained genre categories), to make sure you don’t have too many books on a single shelf. + +Now, let’s say a friend asks me whether I have the book “Dune”. +If I had my books arranged on a single bookshelf (in a list), I would have to look through every book I own in order to find “Dune”. +However, if I had a bookcase with several shelves (a hashing data structure), I know immediately that I need to check the sci-fi shelf, so I’d be able to find it much more quickly! + + +::::::::::::::::::::::::::::::::::::: callout + +### Technical explanation + Within a hashing data structure each inserted key is hashed to produce a (hopefully unique) integer key. The dictionary is pre-allocated to a default size, and the key is assigned the index within the dictionary equivalent to the hash modulo the length of the dictionary. If that index doesn't already contain another key, the key (and any associated values) can be inserted. @@ -160,7 +177,7 @@ When the hashing data structure exceeds a given load factor (e.g. 2/3 of indices ![An visual explanation of linear probing, CPython uses an advanced form of this.](episodes/fig/hash_linear_probing.png){alt="A diagram demonstrating how the keys (hashes) 37, 64, 14, 94, 67 are inserted into a hash table with 11 indices. This is followed by the insertion of 59, 80 and 39 which require linear probing to be inserted due to collisions."} -To retrieve or check for the existence of a key within a hashing data structure, the key is hashed again and a process equivalent to insertion is repeated. However, now the key at each index is checked for equality with the one provided. If any empty index is found before an equivalent key, then the key must not be present in the ata structure. 
+To retrieve or check for the existence of a key within a hashing data structure, the key is hashed again and a process equivalent to insertion is repeated. However, now the key at each index is checked for equality with the one provided. If any empty index is found before an equivalent key, then the key must not be present in the data structure.
 
 ### Keys
 
@@ -189,6 +206,8 @@ dict[MyKey("one", 2, 3.0)] = 12
 ```
 
 The only limitation is that where two objects are equal they must have the same hash, hence all member variables which contribute to `__eq__()` should also contribute to `__hash__()` and vice versa (it's fine to have irrelevant or redundant internal members contribute to neither).
 
+:::::::::::::::::::::::::::::::::::::
+
 ## Sets
 
 Sets are dictionaries without the values (both are declared using `{}`), a collection of unique keys equivalent to the mathematical set. *Modern CPython now uses a set implementation distinct from that of its dictionary; however, the two still behave much the same in terms of performance characteristics.*
 
@@ -325,7 +344,7 @@ def binary_search_list():
         if k != len(ls) and ls[k] == i:
             j += 1
 
-    
+
 repeats = 1000
 gen_time = timeit(generateInputs, number=repeats)
 print(f"search_set: {timeit(search_set, number=repeats)-gen_time:.2f}ms")
 print(f"binary_search_list: {timeit(binary_search_list, number=repeats)-gen_time:.2f}ms")
 ```
 
 Searching the set is fastest, performing 25,000 searches in 0.04ms.
-This is followed by the binary search of the (sorted) list which is 145x slower, although the list has been filtered for duplicates. A list still containing duplicates would be longer, leading to a more expensive search. 
+This is followed by the binary search of the (sorted) list, which is 145x slower, although the list has been filtered for duplicates. A list still containing duplicates would be longer, leading to a more expensive search.
 The linear search of the list is more than 56,600x slower than the fastest; it really shouldn't be used!
 
 ```output
diff --git a/episodes/optimisation-introduction.md b/episodes/optimisation-introduction.md
index 1650962a..aa3a26bb 100644
--- a/episodes/optimisation-introduction.md
+++ b/episodes/optimisation-introduction.md
@@ -19,13 +19,15 @@ exercises: 0
 
 ## Introduction
 
-Now that you're able to find the most expensive components of your code with profiling, it becomes time to learn how to identify whether that expense is reasonable.
+Now that you're able to find the most expensive components of your code with profiling, we can think about ways to improve it.
+However, the best way to do this will depend a lot on your specific code! For example, if your code is spending 60 seconds waiting to download data files and then 1 second to analyse that data, then optimising your data analysis code won't make much of a difference.
+We'll talk briefly about some of these external bottlenecks at the end. For now, we'll assume that you're not waiting for anything else, and we'll look at the performance of your code itself.
 
 In order to optimise code for performance, it is necessary to have an understanding of what a computer is doing to execute it.
 
-Even a high-level understanding of how you code executes, such as how Python and the most common data-structures and algorithms are implemented, can help you to identify suboptimal approaches when programming. If you have learned to write code informally out of necessity, to get something to work, it's not uncommon to have collected some bad habits along the way.
+Even a high-level understanding of how your code executes, such as how Python and the most common data-structures and algorithms are implemented, can help you to identify suboptimal approaches when programming. If you have learned to write code informally out of necessity, to get something to work, it's not uncommon to have collected some "unpythonic" habits along the way.
 
 The remaining content is often abstract knowledge that is transferable to the vast majority of programming languages. This is because the hardware architecture, data-structures and algorithms used are common to many languages and they hold some of the greatest influence over performance bottlenecks.
 
@@ -44,6 +46,68 @@ Therefore, the balance between the impact to both performance and maintainability should be considered when optimising code.
 
 This is not to say don't consider performance when first writing code. The selection of appropriate algorithms and data-structures covered in this course form good practice, simply don't fret over a need to micro-optimise every small component of the code that you write.
 
+### Performance of Python
+
+If you've read about different programming languages, you may have heard that there's a difference between "interpreted" languages (like Python) and "compiled" languages (like C). You may have heard that Python is slow *because* it is an interpreted language.
+To understand where this comes from (and how to get around it), let's talk a little bit about how Python works.
+
+![Illustration of integers in C and Python.](episodes/fig/cint_vs_pyint.png){alt="A diagram illustrating the difference between integers in C and Python. In C, the integer is a raw number in memory. In Python, it additionally contains a header with meta data."}
+
+In C, integers (or other basic types) are raw objects in memory. It is up to the programmer to keep track of the data type.
+The compiler can then turn the source code directly into machine code. This allows the compiler to perform low-level optimisations that better exploit hardware nuance to achieve fast performance. This however comes at the cost of compiled software not being cross-platform.
+
+```C
+/* C code */
+int a = 1;
+int b = 2;
+int c = a + b;
+```
+
+In Python, everything is a complex object. The interpreter uses extra fields in the header to keep track of data types at runtime or take care of memory management.
+This adds a lot more flexibility and makes life easier for programmers. However, it comes at the cost of some overhead in both time and memory usage.
+
+```python
+# Python code
+a = 1
+b = 2
+c = a + b
+```
+
+::::::::::::::::::::::::::::::::::::: callout
+
+Objects store both their raw data (like an integer or string) and some internal information used by the interpreter.
+We can see that additional storage space with `sys.getsizeof()`, which shows how many bytes an object takes up:
+
+```Python
+import sys
+
+sys.getsizeof("")    # 41
+sys.getsizeof("a")   # 42
+sys.getsizeof("ab")  # 43
+
+sys.getsizeof([])    # 56
+sys.getsizeof(["a"]) # 64
+
+sys.getsizeof(1)     # 28
+```
+
+(Note: For container objects (like lists and dictionaries) or custom classes, the values returned by `getsizeof()` are implementation-dependent and may not reflect the actual memory usage.)
+
+:::::::::::::::::::::::::::::::::::::::::::::
+
+We effectively gain programmer performance by sacrificing some code performance. As Donald Knuth famously put it, "premature optimization is the root of all evil": computers are "fast enough" a lot of the time, so this is usually the right trade-off.
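+To put a rough number on that overhead, here is a small, illustrative benchmark; the exact figure is an assumption that will vary with your machine and Python version:
+
+```python
+from timeit import timeit
+
+# Time a single Python-level integer addition; interpreter and
+# object overhead are included in the measurement.
+n = 10_000_000
+t = timeit("a + b", setup="a = 1; b = 2", number=n)
+print(f"{t / n * 1e9:.0f} ns per addition")
+```
+
+On a typical machine this lands in the tens of nanoseconds per addition, whereas the equivalent compiled C addition takes roughly a nanosecond: most of the difference is interpreter and object overhead, not arithmetic.
+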
+However, there are a few other cases where code performance really matters. To handle these cases, Python has the capability to integrate with code written in lower-level programming languages (like C, Fortran or Rust) under the hood.
+Some performance-sensitive libraries therefore perform a lot of the work in such low-level code, before returning a nice Python object back to you.
+(We'll discuss NumPy in a later section; but many parts of the Python standard library also use this pattern.)
+
+Therefore, **it is often best to tell the interpreter/library at a high level *what you want*, and let it figure out *how to do it*.**
+
+That way, the interpreter/library is free to do all its work in the low-level code, and adds overhead only once, when it creates and returns a Python object in the end.
+This usually makes your code more readable, too: When I read your code, I can see exactly *what you want to do*, without getting overwhelmed by overly detailed step-by-step instructions.
+We'll return to this point a few times throughout the section.
 
 ## Ensuring Reproducible Results
 
diff --git a/episodes/optimisation-memory.md b/episodes/optimisation-memory.md
index 27500316..fecd0a70 100644
--- a/episodes/optimisation-memory.md
+++ b/episodes/optimisation-memory.md
@@ -8,7 +8,7 @@ exercises: 0
 
 - How does a CPU look for a variable it requires?
 - What impact do cache lines have on memory accesses?
-- Why is it faster to read/write a single 100mb file, than 100 1mb files?
+- Why is it faster to read/write a single 100 MB file than 100 files of 1 MB each?
 
 ::::::::::::::::::::::::::::::::::::::::::::::::
 
@@ -32,7 +32,7 @@
 
 But the CPU has much smaller caches on-board, to make accessing the most recent variables faster.
 
 ![An annotated photo of a computer's hardware.](episodes/fig/annotated-motherboard.jpg){alt="An annotated photo of inside a desktop computer's case. The CPU, RAM, power supply, graphics cards (GPUs) and harddrive are labelled."}
 
-When reading a variable, to perform an operation with it, the CPU will first look in it's registers. These exist per core, they are the location that computation is actually performed. Accessing them is incredibly fast, but there only exists enough storage for around 32 variables (typical number, e.g. 4 bytes).
+When reading a variable, to perform an operation with it, the CPU will first look in its registers. These exist per core; they are the location where computation is actually performed. Accessing them is incredibly fast, but there is only enough storage for around 32 variables (a typical number, e.g. 4 bytes each).
 
 As the register file is so small, most variables won't be found and the CPU's caches will be searched. It will first check the current processing core's L1 (Level 1) cache; this small cache (typically 64 KB per physical core) is the smallest and fastest to access cache on a CPU. If the variable is not found in the L1 cache, the L2 cache that is shared between multiple cores will be checked. This shared cache is slower to access but larger than L1 (typically 1-3MB per core).
 
@@ -42,7 +42,7 @@ If the variable has not been found in any of the CPU's cache, the CPU will look
 
 Correspondingly, the earlier the CPU finds the variable the faster it will be to access. However, to fully understand the caches, it's necessary to explain what happens once a variable has been found.
 
-If a variable is not found in the caches, so must be fetched from RAM.
The full 64 byte cache line containing the variable, will be copied first into the CPU's L3, then L2 and then L1. Most variables are only 4 or 8 bytes, so many neighbouring variables are also pulled into the caches. Similarly, adding new data to a cache evicts old data. @@ -80,9 +80,9 @@ The latency to access files on disk is another order of magnitude higher than ac As such, disk accesses similarly benefit from sequential accesses and reading larger blocks together rather than single variables. Python's `io` package is already buffered, so automatically handles this for you in the background. -However before a file can be read, the file system on the disk must be polled to transform the file path to it's address on disk to initiate the transfer (or throw an exception). +However before a file can be read, the file system on the disk must be polled to transform the file path to its address on disk to initiate the transfer (or throw an exception). -Following the common theme of this episode, the cost of accessing randomly scattered files can be significantly slower than accessing a single larger file of the same size. +Following the common theme of this episode, accessing randomly scattered files can be significantly slower than accessing a single larger file of the same size. This is because for each file accessed, the file system must be polled to transform the file path to an address on disk. Traditional hard disk drives particularly suffer, as the read head must physically move to locate data. @@ -91,7 +91,7 @@ Hence, it can be wise to avoid storing outputs in many individual files and to i This is even visible outside of your own code. If you try to upload/download 1 GB to HPC. The transfer will be significantly faster, assuming good internet bandwidth, if that's a single file rather than thousands. -The below example code runs a small benchmark, whereby 10MB is written to disk and read back whilst being timed. In one case this is as a single file, and the other, 1000 file segments. +The below example code runs a small benchmark, whereby 10MB is written to disk and read back whilst being timed. In one case this is as a single file, and in the other, 1000 file segments. ```python import os, time @@ -156,13 +156,75 @@ Repeated runs show some noise to the timing, however the slowdown is consistentl You might not even be reading 1000 different files. You could be reading the same file multiple times, rather than reading it once and retaining it in memory during execution. An even greater overhead would apply. +## Accessing the network + +When transfering files over a network, similar effects apply. There is a fixed overhead for every file transfer (no matter how big the file), so downloading many small files will be slower than downloading a single large file of the same total size. + +Because of this overhead, downloading many small files often does not use all the available bandwidth. It may be possible to speed things up by parallelising downloads. 
+ +```Python +from concurrent.futures import ThreadPoolExecutor, as_completed +from timeit import timeit +import requests # install with `pip install requests` + + +def download_file(url, filename): + response = requests.get(url) + with open(filename, 'wb') as f: + f.write(response.content) + return filename + +downloaded_files = [] + +def sequentialDownload(): + for mass in range(10, 20): + url = f"https://github.com/SNEWS2/snewpy-models-ccsn/raw/refs/heads/main/models/Warren_2020/stir_a1.23/stir_multimessenger_a1.23_m{mass}.0.h5" + f = download_file(url, f"seq_{mass}.h5") + downloaded_files.append(f) + +def parallelDownload(): + pool = ThreadPoolExecutor(max_workers=6) + jobs = [] + for mass in range(10, 20): + url = f"https://github.com/SNEWS2/snewpy-models-ccsn/raw/refs/heads/main/models/Warren_2020/stir_a1.23/stir_multimessenger_a1.23_m{mass}.0.h5" + local_filename = f"par_{mass}.h5" + jobs.append(pool.submit(download_file, url, local_filename)) + + for result in as_completed(jobs): + if result.exception() is None: + # handle return values of the parallelised function + f = result.result() + downloaded_files.append(f) + else: + # handle errors + print(result.exception()) + + pool.shutdown(wait=False) + + +print(f"sequentialDownload: {timeit(sequentialDownload, globals=globals(), number=1):.3f} s") +print(downloaded_files) +downloaded_files = [] +print(f"parallelDownload: {timeit(parallelDownload, globals=globals(), number=1):.3f} s") +print(downloaded_files) +``` + +Depending on your internet connection, results may vary significantly, but the parallel download will usually be quite a bit faster. Note also that the order in which the parallel downloads finish will vary. + +```output +sequentialDownload: 3.225 s +['seq_10.h5', 'seq_11.h5', 'seq_12.h5', 'seq_13.h5', 'seq_14.h5', 'seq_15.h5', 'seq_16.h5', 'seq_17.h5', 'seq_18.h5', 'seq_19.h5'] +parallelDownload: 0.285 s +['par_11.h5', 'par_12.h5', 'par_15.h5', 'par_13.h5', 'par_10.h5', 'par_14.h5', 'par_16.h5', 'par_19.h5', 'par_17.h5', 'par_18.h5'] +``` + ## Latency Overview Latency can have a big impact on the speed that a program executes, the below graph demonstrates this. Note the log scale! -![A graph demonstrating the wide variety of latencies a programmer may experience when accessing data.](episodes/fig/latency.png){alt="A horizontal bar chart displaying the relative latencies for L1/L2/L3 cache, RAM, SSD, HDD and a packet being sent from London to California and back. These latencies range from 1 nanosecond to 140 milliseconds and are displayed with a log scale."} +![A graph demonstrating the wide variety of latencies a programmer may experience when accessing data.](episodes/fig/latency.png){alt="A horizontal bar chart displaying the relative latencies for L1/L2/L3 cache, RAM, SSD, HDD and a packet being sent from London to California and back. These latencies range from 1 nanosecond to 140 milliseconds and are displayed with a log scale."} -The lower the latency typically the higher the effective bandwidth. L1 and L2 cache have 1TB/s, RAM 100GB/s, SSDs upto 32 GB/s, HDDs upto 150MB/s. Making large memory transactions even slower. +The lower the latency typically the higher the effective bandwidth (L1 and L2 cache have 1 TB/s, RAM 100 GB/s, SSDs up to 32 GB/s, HDDs up to 150 MB/s), making large memory transactions even slower. 
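+The combined effect of latency, bandwidth and cache lines can be seen directly from Python. The following minimal sketch is illustrative only (the array size and the exact ratio are assumptions; results vary with CPU and cache sizes):
+
+```python
+from timeit import timeit
+import numpy
+
+# ~128 MB of int64, far larger than a typical L3 cache
+ar = numpy.arange(16_000_000)
+
+repeats = 100
+# Summing every 8th element does 8x less arithmetic, but a stride of
+# 8 * 8 bytes = 64 bytes still touches every 64-byte cache line, so it
+# is nowhere near 8x faster: memory traffic dominates, not arithmetic.
+print(f"sum all elements: {timeit(lambda: ar.sum(), number=repeats):.3f}s")
+print(f"sum every 8th:    {timeit(lambda: ar[::8].sum(), number=repeats):.3f}s")
+```
+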
 ## Memory Allocation is not Free
 
diff --git a/episodes/optimisation-minimise-python.md b/episodes/optimisation-minimise-python.md
index 0e2666f6..8e574396 100644
--- a/episodes/optimisation-minimise-python.md
+++ b/episodes/optimisation-minimise-python.md
@@ -1,12 +1,11 @@
 ---
-title: "Understanding Python (NumPy/Pandas)"
+title: "Using Scientific Python Libraries (NumPy, Pandas and more)"
 teaching: 30
 exercises: 0
 ---
 
 :::::::::::::::::::::::::::::::::::::: questions
 
-- Why are Python loops slow?
 - Why is NumPy often faster than raw Python?
 - How can processing rows of a Pandas data table be made faster?
 
@@ -14,278 +13,15 @@ exercises: 0
 
 ::::::::::::::::::::::::::::::::::::: objectives
 
-- Able to identify when Python code can be rewritten to perform execution in the back-end.
 - Able to utilise NumPy's vectorisation when operating on arrays of data.
 - Able to efficiently process rows when working with data tables.
 
 ::::::::::::::::::::::::::::::::::::::::::::::::
 
-Python is an interpreted programming language. When you execute your `.py` file, the (default) CPython back-end compiles your Python source code to an intermediate bytecode. This bytecode is then interpreted in software at runtime generating instructions for the processor as necessary. This interpretation stage, and other features of the language, harm the performance of Python (whilst improving it's usability).
+Earlier, we saw that built-in functions in Python, like `sum()`, are often faster than manually looping over a list. This is because those high-level functions are able to do most of the work in the C back-end.
 
-In comparison, many languages such as C/C++ compile directly to machine code. This allows the compiler to perform low-level optimisations that better exploit hardware nuance to achieve fast performance. This however comes at the cost of compiled software not being cross-platform.
+Packages like NumPy and Pandas work similarly: They have been written in compiled languages to expose this performance across a wide range of scientific workloads.
 
-Whilst Python will rarely be as fast as compiled languages like C/C++, it is possible to take advantage of the CPython back-end and packages such as NumPy and Pandas that have been written in compiled languages to expose this performance.
-
-A simple example of this would be to perform a linear search of a list (in the previous episode we did say this is not recommended).
-The below example creates a list of 2500 integers in the inclusive-exclusive range `[0, 5000)`.
-It then searches for all of the even numbers in that range.
-`searchlistPython()` is implemented manually, iterating `ls` checking each individual item in Python code.
-`searchListC()` in contrast uses the `in` operator to perform each search, which allows CPython to implement the inner loop in it's C back-end.
- -```python -import random - -N = 2500 # Number of elements in list -M = 2 # N*M == Range over which the elements span - -def generateInputs(): - random.seed(12) # Ensure every list is the same - return [random.randint(0, int(N*M)) for i in range(N)] - -def searchListPython(): - ls = generateInputs() - ct = 0 - for i in range(0, int(N*M), M): - for j in range(0, len(ls)): - if ls[j] == i: - ct += 1 - break - -def searchListC(): - ls = generateInputs() - ct = 0 - for i in range(0, int(N*M), M): - if i in ls: - ct += 1 - -repeats = 1000 -gen_time = timeit(generateInputs, number=repeats) -print(f"searchListPython: {timeit(searchListPython, number=repeats)-gen_time:.2f}ms") -print(f"searchListC: {timeit(searchListC, number=repeats)-gen_time:.2f}ms") -``` - -This results in the manual Python implementation being 5x slower, doing the exact same operation! - -```output -searchListPython: 152.15ms -searchListC: 28.43ms -``` - -An easy approach to follow is that if two blocks of code do the same operation, the one that contains less Python is probably faster. This won't apply if you're using 3rd party packages written purely in Python though. - -::::::::::::::::::::::::::::::::::::: callout - -## Python bytecode - -You can use `dis` to view the bytecode generated by Python, the amount of byte-code more strongly correlates with how much code is being executed by the Python interpreter. However, this still does not account for whether functions called are implemented using Python or C. - -The pure Python search compiles to 82 lines of byte-code. - -```python -import dis - -def searchListPython(): - ls = generateInputs() - ct = 0 - for i in range(0, int(N*M), M): - for j in range(0, len(ls)): - if ls[j] == i: - ct += 1 - break - -dis.dis(searchListPython) -``` -```output - 11 0 LOAD_GLOBAL 0 (generateInputs) - 2 CALL_FUNCTION 0 - 4 STORE_FAST 0 (ls) - - 12 6 LOAD_CONST 1 (0) - 8 STORE_FAST 1 (ct) - - 13 10 LOAD_GLOBAL 1 (range) - 12 LOAD_CONST 1 (0) - 14 LOAD_GLOBAL 2 (int) - 16 LOAD_GLOBAL 3 (N) - 18 LOAD_GLOBAL 4 (M) - 20 BINARY_MULTIPLY - 22 CALL_FUNCTION 1 - 24 LOAD_GLOBAL 4 (M) - 26 CALL_FUNCTION 3 - 28 GET_ITER - >> 30 FOR_ITER 24 (to 80) - 32 STORE_FAST 2 (i) - - 14 34 LOAD_GLOBAL 1 (range) - 36 LOAD_CONST 1 (0) - 38 LOAD_GLOBAL 5 (len) - 40 LOAD_FAST 0 (ls) - 42 CALL_FUNCTION 1 - 44 CALL_FUNCTION 2 - 46 GET_ITER - >> 48 FOR_ITER 14 (to 78) - 50 STORE_FAST 3 (j) - - 15 52 LOAD_FAST 0 (ls) - 54 LOAD_FAST 3 (j) - 56 BINARY_SUBSCR - 58 LOAD_FAST 2 (i) - 60 COMPARE_OP 2 (==) - 62 POP_JUMP_IF_FALSE 38 (to 76) - - 16 64 LOAD_FAST 1 (ct) - 66 LOAD_CONST 2 (1) - 68 INPLACE_ADD - 70 STORE_FAST 1 (ct) - - 17 72 POP_TOP - 74 JUMP_FORWARD 1 (to 78) - - 15 >> 76 JUMP_ABSOLUTE 24 (to 48) - >> 78 JUMP_ABSOLUTE 15 (to 30) - - 13 >> 80 LOAD_CONST 0 (None) - 82 RETURN_VALUE -``` - -Whereas the `in` variant only compiles to 54. 
- -```python -import dis - -def searchListC(): - ls = generateInputs() - ct = 0 - for i in range(0, int(N*M), M): - if i in ls: - ct += 1 - -dis.dis(searchListC) -``` -```output - 4 0 LOAD_GLOBAL 0 (generateInputs) - 2 CALL_FUNCTION 0 - 4 STORE_FAST 0 (ls) - - 5 6 LOAD_CONST 1 (0) - 8 STORE_FAST 1 (ct) - - 6 10 LOAD_GLOBAL 1 (range) - 12 LOAD_CONST 1 (0) - 14 LOAD_GLOBAL 2 (int) - 16 LOAD_GLOBAL 3 (N) - 18 LOAD_GLOBAL 4 (M) - 20 BINARY_MULTIPLY - 22 CALL_FUNCTION 1 - 24 LOAD_GLOBAL 4 (M) - 26 CALL_FUNCTION 3 - 28 GET_ITER - >> 30 FOR_ITER 10 (to 52) - 32 STORE_FAST 2 (i) - - 7 34 LOAD_FAST 2 (i) - 36 LOAD_FAST 0 (ls) - 38 CONTAINS_OP 0 - 40 POP_JUMP_IF_FALSE 25 (to 50) - - 8 42 LOAD_FAST 1 (ct) - 44 LOAD_CONST 2 (1) - 46 INPLACE_ADD - 48 STORE_FAST 1 (ct) - >> 50 JUMP_ABSOLUTE 15 (to 30) - - 6 >> 52 LOAD_CONST 0 (None) - 54 RETURN_VALUE -``` - -::::::::::::::::::::::::::::::::::::::::::::: - -## Scope - -When Python executes your code, it has to find the variables and functions that you're using. - -This adds an additional cost to accessing variables and calling functions in Python, which isn't typically seen in compiled languages. - -In particular, it will first check whether the variable or functions has been declared within the current function (local scope), if it can't find it there it will check whether it has been declared in the file (global scope) after which it may even check whether it's from an imported package. - -These are not implicitly cached, therefore repeated accesses to variables and functions, will repeat these checks. - -The implication, is that as local scope variables and functions are checked first, they will be faster to use. - -If you're only accessing a variable once or twice that's nothing to worry about, this is a relatively small cost. -But if a variable or functions is being accessed regularly, such as within a loop, the impact may become visible. - -The below example provides a small demonstration of this in practice. - -```py -from timeit import timeit - -N = 1000000 -repeats = 1000 - -def test_list_global(): - t = 0 - for i in range(N): - if t > N: - break - t += i - -def test_list_local(): - t = 0 - N_local = N - for i in range(N_local): - if t > N_local: - break - t += i - -print(f"Global Scope: {timeit(test_list_global, number=repeats):.5f}ms") -print(f"Local Scope: {timeit(test_list_local, number=repeats):.5f}ms") -``` - -This is only a trivial example, whereby `N` has been copied to the local scope `N_local`, but local scope is about 20% faster than global scope! - -```output -Global Scope: 0.06416ms -Local Scope: 0.05391ms -``` - -Consider copying highly accessed variables into local scope, you can always copy them back to global scope before you return from a function. - -Copying functions to local scope works much the same as variables, e.g. - -```py -import numpy as np - -def my_function(): - uniform_local = np.random.uniform - - for i in range(10000): - t = uniform_local() -``` - -## Built-in Functions Operators - -In order to take advantage of offloading computation to the CPython back-end it's necessary to be aware of what functionality is present. Those available without importing packages are considered [built-in](https://docs.python.org/3/library/functions.html) functions. - -Built-in functions are typically implemented in the CPython back-end, so their performance benefits from bypassing the Python interpreter. - -In particular, those which are passed an `iterable` (e.g. lists) are likely to provide the greatest benefits to performance. 
The Python documentation provides equivalent Python code for many of these cases - -* [`all()`](https://docs.python.org/3/library/functions.html#all): boolean and of all items -* [`any()`](https://docs.python.org/3/library/functions.html#all): boolean or of all items -* [`max()`](https://docs.python.org/3/library/functions.html#max): Return the maximum item -* [`min()`](https://docs.python.org/3/library/functions.html#min): Return the minimum item -* [`sum()`](https://docs.python.org/3/library/functions.html#sum): Return the sum of all items - - - -::::::::::::::::::::::::::::::::::::: callout - -The built-in functions [`filter()`](https://docs.python.org/3/library/functions.html#filter) and [`map()`](https://docs.python.org/3/library/functions.html#map) can be used for processing iterables However list-comprehension is likely to be more performant. - - - -::::::::::::::::::::::::::::::::::::::::::::: ## Using NumPy (Effectively) @@ -294,7 +30,9 @@ The built-in functions [`filter()`](https://docs.python.org/3/library/functions. It adds restriction via it's own [basic numeric types](https://numpy.org/doc/stable/user/basics.types.html), and static arrays to enable even greater performance than that of core Python. However if these restrictions are ignored, the performance can become significantly worse. -### Arrays +![Illustration of a NumPy array and a Python list.](episodes/fig/numpyarray_vs_pylist.png){alt="A diagram illustrating the difference between a NumPy array and a Python list. The NumPy array is a raw block of memory containing numerical values. A Python list contains a header with metadata and multiple items, each of which is a reference to another Python object with its own header and value."} + +### NumPy arrays and Python lists live in two separate worlds NumPy's arrays (not to be confused with the core Python `array` package) are static arrays. Unlike core Python's lists, they do not dynamically resize. Therefore if you wish to append to a NumPy array, you must call `resize()` first. If you treat this like `append()` for a Python list, resizing for each individual append you will be performing significantly more copies and memory allocations than a Python list. @@ -316,12 +54,20 @@ def array_resize(): for i in range(1, N): ar.resize(i+1) ar[i] = i - + +def array_preallocate(): + ar = numpy.zeros(N) + for i in range(1, N): + ar[i] = i + repeats = 1000 print(f"list_append: {timeit(list_append, number=repeats):.2f}ms") print(f"array_resize: {timeit(array_resize, number=repeats):.2f}ms") +print(f"array_preallocate: {timeit(array_preallocate, number=repeats):.2f}ms") ``` +For Python lists, we’ve seen earlier that list comprehensions are more efficient, so we prefer to avoid using a large number of `append` operations if possible. Similarly, we should try to avoid resizing NumPy array, where the overhead is even higher. + Resizing a NumPy array is 5.2x slower than a list, probably 10x slower than list comprehension. ```output @@ -368,7 +114,10 @@ The below example demonstrates the overhead of mixing Python lists and NumPy fun Passing a Python list to `numpy.random.choice()` is 65.6x slower than passing a NumPy array. This is the additional overhead of converting the list to an array. If this function were called multiple times, it would make sense to transform the list to an array before calling the function so that overhead is only paid once. 
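+As a concrete sketch of such a benchmark (the names and sizes here are illustrative assumptions, and exact speedups will vary):
+
+```python
+from timeit import timeit
+
+N = 10_000  # Number of elements (illustrative)
+
+setup_list = f"import numpy; data = list(range({N}))"
+setup_array = f"import numpy; data = numpy.arange({N})"
+
+repeats = 1000
+# The call is identical either way; only the type of `data` differs
+print(f"choice(list):  {timeit('numpy.random.choice(data)', setup=setup_list, number=repeats):.5f}s")
+print(f"choice(array): {timeit('numpy.random.choice(data)', setup=setup_array, number=repeats):.5f}s")
+```
+
+The list variant pays the list-to-array conversion cost on every single call, which quickly comes to dominate the runtime.
+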
-::::::::::::::::::::::::::::::::::::: callout
+
+### Array broadcasting
+
+NumPy arrays support "broadcasting" of many mathematical operations and functions.
+This is a shorthand notation, where the operation/function is applied element-wise without having to loop over the array explicitly:
+
+```Python
+>>> import numpy as np
+>>> ar = np.arange(6)
+>>> ar
+array([0, 1, 2, 3, 4, 5])
+>>> ar + 10
+array([10, 11, 12, 13, 14, 15])
+>>> ar * 2
+array([ 0,  2,  4,  6,  8, 10])
+>>> ar**2
+array([ 0,  1,  4,  9, 16, 25])
+>>> np.exp(ar)
+array([  1.        ,   2.71828183,   7.3890561 ,  20.08553692,
+        54.59815003, 148.4131591 ])
+```
+
+Note that this does not work with Python lists in many cases:
+
+```Python
+>>> ls = list(range(6))
+>>> ls + 10
+Traceback (most recent call last):
+  File "<python-input-1>", line 1, in <module>
+    ls + 10
+    ~~~^~~~
+TypeError: can only concatenate list (not "int") to list
+>>> ls * 2
+[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5]
+>>> ls ** 2
+Traceback (most recent call last):
+  File "<python-input-2>", line 1, in <module>
+    ls ** 2
+    ~~~^^~~
+TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'
+>>> np.exp(ls)
+array([  1.        ,   2.71828183,   7.3890561 ,  20.08553692,
+        54.59815003, 148.4131591 ])
+```
 
 ### Vectorisation
 
-The manner by which NumPy stores data in arrays enables it's functions to utilise vectorisation, whereby the processor executes one instruction across multiple variables simultaneously, for every mathematical operation between arrays.
+However, broadcasting is not just a nicer way to write mathematical expressions: it can also give a significant performance boost.
+
+The manner by which NumPy stores data in arrays enables its functions to utilise vectorisation, where the processor executes one instruction across multiple variables simultaneously, for every mathematical operation between arrays.
+
+```sh
+> python -m timeit -s "import numpy; ar = numpy.arange(1)" "ar + 10"
+1000000 loops, best of 5: 359 nsec per loop
+> python -m timeit -s "import numpy; ar = numpy.arange(10)" "ar + 10"
+1000000 loops, best of 5: 362 nsec per loop
+> python -m timeit -s "import numpy; ar = numpy.arange(100)" "ar + 10"
+1000000 loops, best of 5: 364 nsec per loop
+```
+Whether we apply the addition to 1, 10 or 100 elements, it takes the same amount of time!
+
 Earlier in this episode it was demonstrated that using core Python methods over a list will outperform a loop performing the same calculation.
 The below example takes this a step further by demonstrating the calculation of dot product.
+<!-- Added Python sum array, skipped a couple of others -->
 
 ```python
 from timeit import timeit
 
-N = 1000000 # Number of elements in list
+N = 1_000_000 # Number of elements in list
 gen_list = f"ls = list(range({N}))"
 gen_array = f"import numpy;ar = numpy.arange({N}, dtype=numpy.int64)"
 
@@ -437,7 +255,7 @@ NumPy can sometimes take advantage of auto parallelisation, particularly on HPC systems.
 
 A small number of functions are backed by BLAS and LAPACK, enabling even greater speedup.
 
-The [supported functions](https://numpy.org/doc/stable/reference/routines.linalg.html) mostly correspond to linear algebra operations.
+The [supported functions](https://numpy.org/doc/stable/reference/routines.linalg.html) mostly correspond to linear algebra operations like `numpy.dot()`.
 
 The auto-parallelisation of these functions is hardware dependent, so you won't always automatically get the additional benefit of parallelisation.
However, HPC systems should be primed to take advantage, so try increasing the number of cores you request when submitting your jobs and see if it improves the performance. @@ -446,7 +264,7 @@ However, HPC systems should be primed to take advantage, so try increasing the n ::::::::::::::::::::::::::::::::::::: -### `vectorize()` + + +## Other libraries that use NumPy + +Across the scientific Python software ecosystem, [many domain-specific packages](https://numpy.org/#:~:text=ECOSYSTEM) are built on top of NumPy arrays. +Similar to the demos above, we can often gain significant performance boosts by using these libraries well. + +::::::::::::::::::::::::::::::::::::: challenge + +Take a look at the [list of libraries on the NumPy website](https://numpy.org/#:~:text=ECOSYSTEM). Are you using any of them already? + +If you’ve brought a project you want to work on: Are there areas of the project where you might benefit from adapting one of these libraries instead of writing your own code from scratch? + +:::::::::::::::::::::::: hint + +These libraries could be specific to your area of research; but they could also include packages from other fields that provide tools you need (e.g. statistics or machine learning)! + +::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::::::::: + + +Which libraries you may use will depend on your research domain; here, we’ll show two examples from our own experience. + +### Example: Image analysis with Shapely + +A colleague had a large data set of images of cells. She had already reconstructed the locations of cell walls and various points of interest and needed to identify which points were located in each cell. +To do this, she used the [Shapely](https://github.com/shapely/shapely) geometry library. + +```Python +points_per_polygon = {} +for polygon_idx in range(n_polygons): + current_polygon = polygons.iloc[polygon_idx,:]["geometry"] + + # manually loop over all points, check if polygon contains that point + out_points = [] + for i in range(n_points): + current_point = points.iloc[i, :] + if current_polygon.contains(current_point["geometry"]): + out_points.append(current_point.name) + + points_per_polygon[polygon_idx] = out_points +``` + +For about 500k points and 1000 polygons, the initial version of the code took about 20 hours to run. + +Luckily, Shapely is built on top of NumPy, so she was able to apply functions to an array of points instead and wrote an improved version, which took just 20 minutes: + +```Python +points_per_polygon = {} +for polygon_idx in range(n_polygons): + current_polygon = polygons.iloc[polygon_idx,:]["geometry"] + + # vectorized: apply `contains` to an array of points at once + points_in_polygon_idx = current_polygon.contains(points_list) + points_in_polygon = point_names_list[points_in_polygon_idx] + + points_per_polygon[polygon_idx] = points_in_polygon.tolist() ``` + +### Example: Interpolating astrophysical spectra with AstroPy + +This is from an open-source package I’m working on, so we can look at the actual pull request where I made this change: https://github.com/SNEWS2/snewpy/pull/310 + +→ See the first table of benchmark results. Note that using a Python `for` loop to calculate the spectrum in 100 different time bins takes 100 times as long as for a single time bin. In the vectorized version, the computing time increases much more slowly. + +(Note that energies were already vectorized—that’s another factor of 100 we got “for free”!) 
+ +Code diff: https://github.com/SNEWS2/snewpy/pull/310/commits/0320b384ff22233818d07913c55c30f5968ae330 + + ## Using Pandas (Effectively) [Pandas](https://pandas.pydata.org/) is the most common Python package used for scientific computing when working with tabular data akin to spreadsheets (DataFrames). @@ -631,6 +519,7 @@ If you can filter your rows before processing, rather than after, you may signif - Python is an interpreted language, this adds an additional overhead at runtime to the execution of Python code. Many core Python and NumPy functions are implemented in faster C/C++, free from this overhead. - NumPy can take advantage of vectorisation to process arrays, which can greatly improve performance. -- Pandas' data tables store columns as arrays, therefore operations applied to columns can take advantage of NumPys vectorisation. +- Many domain-specific packages are built on top of NumPy and can offer similar performance boosts. +- Pandas' data tables store columns as arrays, therefore operations applied to columns can take advantage of NumPy’s vectorisation. :::::::::::::::::::::::::::::::::::::::::::::::: diff --git a/episodes/optimisation-use-latest.md b/episodes/optimisation-use-latest.md index cabe6fae..8c6e7015 100644 --- a/episodes/optimisation-use-latest.md +++ b/episodes/optimisation-use-latest.md @@ -36,6 +36,16 @@ These improvements are often free, requiring minimal changes to any code (unlike Performance regressions within major packages should be considered rare, they often track performance alongside their test suites. +::::::::::::::::::::::::::::::::::::: callout + +## Support for older Python versions in the Scientific Python ecosystem + +In the last few years, many important packages in the Scientific Python ecosystem have agreed [a common policy](https://scientific-python.org/specs/spec-0000/) on how long to support previous versions of Python. +Since October 2024, these packages stopped supporting Python 3.10; so if you are still using Python 3.10 (or even older versions), you’re now losing access to new features and performance improvements in NumPy, SciPy, Matplotlib and many other libraries. Time to update! + +::::::::::::::::::::::::::::::::::::: + + However, the more packages and language features your code touches, and the older the Python it currently uses, the greater chance of incompatibilities making it difficult to upgrade. @@ -52,6 +62,12 @@ This could cause your code to crash, or worse subtly change your results. +If you have been working with an existing Python installation, the upgrade process for Python itself depends on how you installed your current version. (E.g. via conda, official installer from python.org, package manager like Homebrew/apt/yum/…) + +For packages you’re using, you can update those in the same way you installed them: + +* via `pip`, e.g. `pip install --upgrade numpy` +* via `conda`, e.g. `conda update ` diff --git a/episodes/optimisation-using-python.md b/episodes/optimisation-using-python.md new file mode 100644 index 00000000..e8efd0f4 --- /dev/null +++ b/episodes/optimisation-using-python.md @@ -0,0 +1,389 @@ +--- +title: "Using Python Language Features and the Standard Library" +teaching: 10 +exercises: 5 +--- + +:::::::::::::::::::::::::::::::::::::: questions + +- Why are Python loops slower than specialised functions? +- How can I make my code more readable and faster? 
+ +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: objectives + +- Able to utilise Python language features effectively +- Able to search Python documentation for functionality available in built-in types and in the standard library +- Able to identify when Python code can be rewritten to perform execution in the back-end. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::: instructor + +This episode discusses relatively fundamental features of Python. + +For students experienced with writing Python, many of these points may be unnecessary. However, self-taught students—especially if they have previously studied lower-level languages with a less powerful standard library—may have adopted “unpythonic” habits and will particularly benefit from this section. + +:::::::::::::::::::::::::::::::::::::::::::::::: + +Before we look at data structures, algorithms and third-party libraries, we should take a few minutes to make sure we’re familiar with the fundamentals of Python. +If you’ve learned to program in another language, chances are you’ve picked up some habits from that language that don’t work well in Python. + +Most of the bad habits that took me a while to unlearn—and that I’ve observed in others, too—come from learning to program in a lower-level language (like C or Delphi), with a less powerful standard library. + + +## Built-in Functions + +For example, back when I was in undergrad, if you’d asked me to sum up a bunch of data points, I would have written something like the first function in this code sample: + +```Python +import random +from timeit import timeit + +N = 100_000 # Number of elements in the list + +# Ensure every list is the same +random.seed(12) +my_data = [random.random() for i in range(N)] + + +def manualSumC(): # my first attempt + n = 0 + for i in range(len(my_data)): + n += my_data[i] + return n + +def manualSumPy(): # slightly improved + n = 0 + for evt_count in my_data: + n += evt_count + return n + +def builtinSum(): # fastest and most readable + return sum(my_data) + + +repeats = 1000 +print(f"manualSumC: {timeit(manualSumC, globals=globals(), number=repeats):.3f}ms") +print(f"manualSumPy: {timeit(manualSumPy, globals=globals(), number=repeats):.3f}ms") +print(f"builtinSum: {timeit(builtinSum, globals=globals(), number=repeats):.3f}ms") +``` + +Even just replacing the iteration over indices (which may be a habit you’ve picked up if you first learned to program in C) with a more pythonic iteration over the elements themselves speeds up the code by about 2×. +But even better, by switching to the builtin `sum` function our code becomes about 8× faster, doing the exact same operation! + +```output +manualSumC: 1.624ms +manualSumPy: 0.740ms +builtinSum: 0.218ms +``` + +This is because [built-in functions](https://docs.python.org/3/library/functions.html) (i.e. those that are available without importing packages) are typically implemented in the CPython back-end, so their performance benefits from bypassing the Python interpreter. + +In particular, those which are passed an `iterable` (e.g. lists) are likely to provide the greatest benefits to performance. The Python documentation provides equivalent Python code for many of these cases. 
+
+* [`all()`](https://docs.python.org/3/library/functions.html#all): return whether all items are truthy (boolean AND)
+* [`any()`](https://docs.python.org/3/library/functions.html#any): return whether any item is truthy (boolean OR)
+* [`max()`](https://docs.python.org/3/library/functions.html#max): return the maximum item
+* [`min()`](https://docs.python.org/3/library/functions.html#min): return the minimum item
+* [`sum()`](https://docs.python.org/3/library/functions.html#sum): return the sum of all items
+
+This is a nice illustration of the principle we discussed earlier:
+It is often best to tell the interpreter/library at a high level *what you want*, and let it figure out *how to do it*.
+
+
+## Example: Searching for an element in a list
+
+A simple example of this would be to perform a linear search of a list. (Though as we'll see in the next section, there's a much more effective way to do this!)
+The below example creates a list of 2500 integers in the inclusive range `[0, 5000]`.
+It then searches for all of the even numbers in that range.
+`manualSearch()` is implemented manually, iterating over `ls` and checking each individual item in Python code.
+`operatorSearch()` in contrast uses the `in` operator to perform each search, which allows CPython to run the inner loop in its C back-end.
+
+```python
+import random
+from timeit import timeit
+
+N = 2500  # Number of elements in list
+M = 2  # N*M == Range over which the elements span
+
+def generateInputs():
+    random.seed(12)  # Ensure every list is the same
+    return [random.randint(0, int(N*M)) for i in range(N)]
+
+def manualSearch():
+    ls = generateInputs()
+    ct = 0
+    for i in range(0, int(N*M), M):
+        for j in range(0, len(ls)):
+            if ls[j] == i:
+                ct += 1
+                break
+
+def operatorSearch():
+    ls = generateInputs()
+    ct = 0
+    for i in range(0, int(N*M), M):
+        if i in ls:
+            ct += 1
+
+repeats = 1000
+# Subtract the cost of generating the inputs from each timing
+gen_time = timeit(generateInputs, number=repeats)
+print(f"manualSearch: {timeit(manualSearch, number=repeats)-gen_time:.2f}ms")
+print(f"operatorSearch: {timeit(operatorSearch, number=repeats)-gen_time:.2f}ms")
+```
+
+This results in the manual Python implementation being over 5× slower, doing the exact same operation!
+
+```output
+manualSearch: 152.15ms
+operatorSearch: 28.43ms
+```
+
+A useful rule of thumb: if two blocks of code perform the same operation, the one that contains less Python is probably faster. (This does not apply to third-party packages that are themselves written in pure Python.)
+
+::::::::::::::::::::::::::::::::::::: callout
+
+### Python bytecode
+
+You can use the `dis` module to view the bytecode generated by Python; the amount of bytecode correlates more strongly with how much work the Python interpreter has to do than the number of source lines does. However, this still does not account for whether the functions called are implemented in Python or C.
+
+The pure Python search compiles to 42 bytecode instructions (the listing below ends at byte offset 82).
+
+```python
+import dis
+
+def manualSearch():
+    ls = generateInputs()
+    ct = 0
+    for i in range(0, int(N*M), M):
+        for j in range(0, len(ls)):
+            if ls[j] == i:
+                ct += 1
+                break
+
+dis.dis(manualSearch)
+```
+```output
+ 11           0 LOAD_GLOBAL              0 (generateInputs)
+              2 CALL_FUNCTION            0
+              4 STORE_FAST               0 (ls)
+
+ 12           6 LOAD_CONST               1 (0)
+              8 STORE_FAST               1 (ct)
+
+ 13          10 LOAD_GLOBAL              1 (range)
+             12 LOAD_CONST               1 (0)
+             14 LOAD_GLOBAL              2 (int)
+             16 LOAD_GLOBAL              3 (N)
+             18 LOAD_GLOBAL              4 (M)
+             20 BINARY_MULTIPLY
+             22 CALL_FUNCTION            1
+             24 LOAD_GLOBAL              4 (M)
+             26 CALL_FUNCTION            3
+             28 GET_ITER
+        >>   30 FOR_ITER                24 (to 80)
+             32 STORE_FAST               2 (i)
+
+ 14          34 LOAD_GLOBAL              1 (range)
+             36 LOAD_CONST               1 (0)
+             38 LOAD_GLOBAL              5 (len)
+             40 LOAD_FAST                0 (ls)
+             42 CALL_FUNCTION            1
+             44 CALL_FUNCTION            2
+             46 GET_ITER
+        >>   48 FOR_ITER                14 (to 78)
+             50 STORE_FAST               3 (j)
+
+ 15          52 LOAD_FAST                0 (ls)
+             54 LOAD_FAST                3 (j)
+             56 BINARY_SUBSCR
+             58 LOAD_FAST                2 (i)
+             60 COMPARE_OP               2 (==)
+             62 POP_JUMP_IF_FALSE       38 (to 76)
+
+ 16          64 LOAD_FAST                1 (ct)
+             66 LOAD_CONST               2 (1)
+             68 INPLACE_ADD
+             70 STORE_FAST               1 (ct)
+
+ 17          72 POP_TOP
+             74 JUMP_FORWARD             1 (to 78)
+
+ 15     >>   76 JUMP_ABSOLUTE           24 (to 48)
+        >>   78 JUMP_ABSOLUTE           15 (to 30)
+
+ 13     >>   80 LOAD_CONST               0 (None)
+             82 RETURN_VALUE
+```
+
+Whereas the `in` variant compiles to only 28 instructions (ending at byte offset 54).
+
+```python
+import dis
+
+def operatorSearch():
+    ls = generateInputs()
+    ct = 0
+    for i in range(0, int(N*M), M):
+        if i in ls:
+            ct += 1
+
+dis.dis(operatorSearch)
+```
+```output
+  4           0 LOAD_GLOBAL              0 (generateInputs)
+              2 CALL_FUNCTION            0
+              4 STORE_FAST               0 (ls)
+
+  5           6 LOAD_CONST               1 (0)
+              8 STORE_FAST               1 (ct)
+
+  6          10 LOAD_GLOBAL              1 (range)
+             12 LOAD_CONST               1 (0)
+             14 LOAD_GLOBAL              2 (int)
+             16 LOAD_GLOBAL              3 (N)
+             18 LOAD_GLOBAL              4 (M)
+             20 BINARY_MULTIPLY
+             22 CALL_FUNCTION            1
+             24 LOAD_GLOBAL              4 (M)
+             26 CALL_FUNCTION            3
+             28 GET_ITER
+        >>   30 FOR_ITER                10 (to 52)
+             32 STORE_FAST               2 (i)
+
+  7          34 LOAD_FAST                2 (i)
+             36 LOAD_FAST                0 (ls)
+             38 CONTAINS_OP              0
+             40 POP_JUMP_IF_FALSE       25 (to 50)
+
+  8          42 LOAD_FAST                1 (ct)
+             44 LOAD_CONST               2 (1)
+             46 INPLACE_ADD
+             48 STORE_FAST               1 (ct)
+        >>   50 JUMP_ABSOLUTE           15 (to 30)
+
+  6     >>   52 LOAD_CONST               0 (None)
+             54 RETURN_VALUE
+```
+
+:::::::::::::::::::::::::::::::::::::::::::::
+
+
+## Example: Parsing data from a text file
+
+In C, since there is no high-level `string` datatype, parsing strings can be fairly arduous: you repeatedly look for the index of a separator character and use that index to split the string up.
+
+
+::::::::::::::::::::::::::::::::::::: challenge
+
+Let's say we have read in some data from a text file, each line containing a time bin and a mean energy:
+
+```python
+f = [
+    ' 0000 0.9819 ',
+    ' 0001 0.3435 ',
+    # ...
+    ' 0099 0.2275 ',
+    ' 0100 0.7067 ',
+    # ...
+]
+```
+
+A colleague (who learned to program in C before he started using Python) wrote the following code to parse the data into a dictionary:
+
+```python
+def manualSplit():
+    data = {}
+    for line in f:
+        first_char = line.find("0")
+        end_time = line.find(" ", first_char, -1)
+
+        energy_found = line.find(".", end_time, -1)
+        begin_energy = line.rfind(" ", end_time, energy_found)
+        end_energy = line.find(" ", energy_found, -1)
+        if end_energy == -1:
+            end_energy = len(line)
+
+        time = line[first_char:end_time]
+        energy = line[begin_energy + 1:end_energy]
+
+        data[time] = energy
+    return data
+```
+
+Can you find a shorter, more readable way to write this in Python?
+
+
+:::::::::::::::::::::::: hint
+
+Python strings have many methods for common operations, such as removing suffixes, replacing substrings, joining, splitting, stripping whitespace, and much more. See [the documentation](https://docs.python.org/3/library/stdtypes.html#string-methods) for a full list.
+
+:::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::: solution
+
+```python
+def builtinSplit():
+    data = {}
+    for line in f:
+        time, energy = line.split()
+        data[time] = energy
+    return data
+```
+
+This is much more readable.
+The code that CPython executes may use indexing steps similar to those in `manualSplit`; however, since this all happens "under the hood" in C code, it is once again faster.
+
+```python
+import random
+from timeit import timeit
+
+N = 10_000  # Number of elements in the list
+
+# Ensure every list is the same
+random.seed(12)
+f = [f" {i:0>6d} {random.random():8.4f} " for i in range(N)]
+
+# Assumes manualSplit and builtinSplit are defined as above
+repeats = 1000
+print(f"manualSplit: {timeit(manualSplit, globals=globals(), number=repeats):.3f}ms")
+print(f"builtinSplit: {timeit(builtinSplit, globals=globals(), number=repeats):.3f}ms")
+```
+
+```output
+manualSplit: 1.797ms
+builtinSplit: 0.796ms
+```
+
+:::::::::::::::::::::::::::::::::
+:::::::::::::::::::::::::::::::::::::::::::::::
+
+
+::::::::::::::::::::::::::::::::::::: challenge
+
+If you've brought a project you want to work on: do you have any similar code in there that is hard to understand because it contains a lot of low-level, step-by-step instructions?
+
+:::::::::::::::::::::::: hint
+
+Typical cases might include reading data from a file, parsing strings by hand, searching for items in lists, or accumulating values in explicit loops.
+
+(Before you try to rewrite those parts of your code, use a profiler to check whether they have a noticeable impact on the overall performance of your project. Remember the Donald Knuth quote!)
+
+:::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::::::::::::::::
\ No newline at end of file
diff --git a/index.md b/index.md
index af9940b6..df6f5eda 100644
--- a/index.md
+++ b/index.md
@@ -17,7 +17,7 @@ If you are now comfortable using Python, this course may be of interest to suppl
 
 This is an all-day course, however it normally finishes by early afternoon.
 
-If you would like to register to take the course, check the [registration information](learners/registration.md).
+
 
@@ -35,7 +35,7 @@ If you would like to register to take the course, check the [registration inform
 After attending this training, participants will be able to:
 
 - identify the most expensive functions and lines of code using `cprofile` and `line_profiler`.
-- evaluate code to determine the limiting factors of it's performance.
+- evaluate code to determine the limiting factors of its performance.
 - recognise and implement optimisations for common limiting factors of performance.
 
 :::::::::::::::::::::::::::::::::::::::::: prereq
@@ -47,7 +47,7 @@ Before joining Performance Profiling & Optimisation (Python) Training, participa
 - implement basic algorithms in Python.
 - follow the control flow of Python code, and dry run the execution in their head or on paper.
 
-See the [Research Computing Training Hub](https://sites.google.com/sheffield.ac.uk/research-training/research-training) for other courses to help with learning these skills.
+
 
 ::::::::::::::::::::::::::::::::::::::::::::::::::