|
6 | 6 | import os |
7 | 7 | import datetime |
8 | 8 | import calendar |
| 9 | +import psutil |
9 | 10 | import json |
10 | 11 | from collections import defaultdict |
11 | 12 | import logging |
@@ -606,3 +607,41 @@ def get_clusters(ledger): |
606 | 607 | cluster_mapping[item[0]] = cluster_name |
607 | 608 |  |
608 | 609 | return cluster_mapping |
| 610 | + |
| 611 | + |
| 612 | +def get_concurrency_per_ledger(): |
| 613 | + """ |
| 614 | + Computes the maximum number of parallel processes that can run per ledger, |
| 615 | +    based on the system's total memory (minus 1GB of headroom left for other processes). |
| 616 | +    :returns: a dictionary mapping each ledger name to the max number of processes that can run in parallel for it |
| 617 | + """ |
| 618 | + system_memory_total = psutil.virtual_memory().total # Get the system's total memory |
| 619 | + system_memory_total -= 10**9 # Leave 1GB of memory to be used by other processes |
| 620 | + |
| 621 | + concurrency = {} |
| 622 | + too_large_ledgers = set() |
| 623 | + input_dirs = get_input_directories() |
| 624 | + for ledger in get_ledgers(): |
| 625 | + # Find the size of the largest input file per ledger |
| 626 | + max_file_size = 0 |
| 627 | + for input_dir in input_dirs: |
| 628 | + for folder, _, files in os.walk(input_dir): |
| 629 | + for file in files: |
| 630 | + if file.startswith(ledger): |
| 631 | + max_file_size = max(max_file_size, os.stat(os.path.join(folder, file)).st_size) |
| 632 | + # Compute the max number of processes that can open the largest ledger file |
| 633 | + # and run in parallel without exhausting the system's memory. |
| 634 | + if max_file_size > 0: |
| 635 | + # When loaded in (a dict in) memory, each file consumes approx. 2.5x space compared to storage. |
| 636 | + concurrency[ledger] = int(system_memory_total / (2.5 * max_file_size)) |
| 637 | +            # Flag ledgers whose largest input file is too large to fit in the memory budget. |
| 638 | + if concurrency[ledger] == 0: |
| 639 | + too_large_ledgers.add(ledger) |
| 640 | + else: |
| 641 | +            concurrency[ledger] = 1  # No input files found for this ledger; default to a single process. |
| 642 | + |
| 643 | + if too_large_ledgers: |
| 644 | +        raise ValueError('The largest input files of the following ledgers are too ' |
| 645 | +                         'large to load in memory: ' + ', '.join(too_large_ledgers)) |
| 646 | + |
| 647 | + return concurrency |
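
As a worked example of the sizing heuristic above: on a machine with roughly 16GB of RAM (16e9 bytes), 1GB is reserved for other processes, leaving 15e9 bytes; a ledger whose largest input file is 2GB (2e9 bytes) then gets int(15e9 / (2.5 * 2e9)) = 3 parallel processes. The sketch below is a hypothetical illustration, not part of this commit, of how the returned mapping could cap a per-ledger worker pool: analyse_ledger_file is a placeholder worker, while get_concurrency_per_ledger, get_ledgers and get_input_directories are the module's own helpers and are assumed to be importable alongside it.

import multiprocessing
import os


def analyse_ledger_file(path):
    """Placeholder for the module's real per-file processing."""
    ...


def run_all():
    # Memory-based cap on parallelism, computed by the function added in this commit.
    concurrency = get_concurrency_per_ledger()
    for ledger in get_ledgers():
        # Collect this ledger's input files, mirroring the directory walk done above.
        files = [
            os.path.join(folder, name)
            for input_dir in get_input_directories()
            for folder, _, names in os.walk(input_dir)
            for name in names
            if name.startswith(ledger)
        ]
        if not files:
            continue
        # Never spawn more workers than the memory budget allows for this ledger.
        with multiprocessing.Pool(processes=concurrency[ledger]) as pool:
            pool.map(analyse_ledger_file, files)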