Commit 4816892
Performance updates (#152)

* Pass sorted block distribution to metrics functions
* Release parsed/mapped data variables to reduce memory
* Fix tests
* Fix typo in max entropy computation

1 parent 4484086 commit 4816892

File tree

12 files changed: +109 −114 lines

consensus_decentralization/aggregate.py

Lines changed: 6 additions & 11 deletions

@@ -13,16 +13,15 @@ class Aggregator:
     blocks they produced
     """

-    def __init__(self, project, io_dir, data_to_aggregate):
+    def __init__(self, project, io_dir):
         """
         :param project: str. Name of the project
         :param io_dir: Path. Path to the project's output directory
-        :param data_to_aggregate: list of dictionaries, sorted by 'timestamp'; the data that will be aggregated
         """
         self.project = project
-        self.data_to_aggregate = data_to_aggregate
-        self.data_start_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(data_to_aggregate[0]))
-        self.data_end_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(data_to_aggregate[-1]))
+        self.data_to_aggregate = hlp.read_mapped_project_data(io_dir)
+        self.data_start_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(self.data_to_aggregate[0]))
+        self.data_end_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(self.data_to_aggregate[-1]))
         self.aggregated_data_dir = io_dir / 'blocks_per_entity'
         self.aggregated_data_dir.mkdir(parents=True, exist_ok=True)

@@ -91,7 +90,7 @@ def divide_timeframe(timeframe, granularity):
     return list(zip(start_dates, end_dates))


-def aggregate(project, output_dir, timeframe, aggregate_by, force_aggregate, mapped_data=None):
+def aggregate(project, output_dir, timeframe, aggregate_by, force_aggregate):
     """
     Aggregates the results of the mapping process for the given project and timeframe. The results are saved in a csv
     file in the project's output directory. Note that the output file is created (just with the headers) even if there
@@ -103,15 +102,11 @@ def aggregate(project, output_dir, timeframe, aggregate_by, force_aggregate, mapped_data=None):
        year, all
    :param force_aggregate: bool. If True, then the aggregation will be performed, regardless of whether aggregated
        data for the project and specified granularity already exist
-   :param mapped_data: list of dictionaries (the data that will be aggregated). If None, then the data will be read
-       from the project's output directory
    :returns: a list of strings that correspond to the time chunks of the aggregation or None if no aggregation took
        place (the corresponding output file already existed and force_aggregate was set to False)
    """
    project_io_dir = output_dir / project
-   if mapped_data is None:
-       mapped_data = hlp.read_mapped_project_data(project_io_dir)
-   aggregator = Aggregator(project, project_io_dir, mapped_data)
+   aggregator = Aggregator(project, project_io_dir)

    filename = hlp.get_blocks_per_entity_filename(aggregate_by=aggregate_by, timeframe=timeframe)
    output_file = aggregator.aggregated_data_dir / filename
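With this change the Aggregator loads the mapped data itself via hlp.read_mapped_project_data, so callers no longer need to keep the full mapped dataset alive. A minimal sketch of the new call site (argument values are illustrative, not taken from the repo's CLI):

    from pathlib import Path

    from consensus_decentralization.aggregate import aggregate

    # Illustrative values; the real pipeline supplies these from its CLI and helpers
    time_chunks = aggregate(
        project='bitcoin',
        output_dir=Path('output'),
        timeframe='2023',
        aggregate_by='month',
        force_aggregate=False,  # reuse existing aggregated data if present
    )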

consensus_decentralization/analyze.py

Lines changed: 16 additions & 13 deletions

@@ -39,6 +39,7 @@ def analyze(projects, aggregated_data_filename, output_dir):
        csv_contents[metric] = [['timeframe'] + projects]

    for column_index, project in enumerate(projects):
+       logging.info(f'Calculating {project} metrics')
        aggregate_output[project] = {}
        aggregated_data_dir = output_dir / project / 'blocks_per_entity'
        time_chunks, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir / aggregated_data_filename)
@@ -50,24 +51,26 @@ def analyze(projects, aggregated_data_filename, output_dir):
            for tchunk, nblocks in block_values.items():
                if nblocks > 0:
                    chunks_with_blocks.add(tchunk)
-       for metric_name, metric, param in metric_params:
-           logging.info(f'Calculating {metric_name}')

-           for row_index, time_chunk in enumerate(time_chunks):
-               time_chunk_blocks_per_entity = {}
-               if column_index == 0:
+       for row_index, time_chunk in enumerate(time_chunks):
+           time_chunk_blocks_per_entity = {}
+           if column_index == 0:
+               for metric_name, _, _ in metric_params:
                    csv_contents[metric_name].append([time_chunk])
-               if time_chunk in chunks_with_blocks:
-                   for entity, block_values in blocks_per_entity.items():
-                       try:
-                           time_chunk_blocks_per_entity[entity] = block_values[time_chunk]
-                       except KeyError:
-                           time_chunk_blocks_per_entity[entity] = 0
+           if time_chunk in chunks_with_blocks:
+               for entity, block_values in blocks_per_entity.items():
+                   try:
+                       time_chunk_blocks_per_entity[entity] = block_values[time_chunk]
+                   except KeyError:
+                       time_chunk_blocks_per_entity[entity] = 0
+           sorted_time_chunk_blocks = sorted(time_chunk_blocks_per_entity.values(), reverse=True)
+
+           for metric_name, metric, param in metric_params:
                func = eval(f'compute_{metric}')
                if param:
-                   result = func(time_chunk_blocks_per_entity, param)
+                   result = func(sorted_time_chunk_blocks, param)
                else:
-                   result = func(time_chunk_blocks_per_entity)
+                   result = func(sorted_time_chunk_blocks)
                csv_contents[metric_name][row_index + 1].append(result)
                aggregate_output[project][time_chunk][metric_name] = result
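The restructured loop sorts each time chunk's distribution once, in descending order, and passes the same list to every metric, which is what lets the metric modules below drop their own dict handling. A usage sketch with toy data (the two import paths appear in this commit; the numbers are made up):

    from consensus_decentralization.metrics.tau_index import compute_tau_index
    from consensus_decentralization.metrics.theil_index import compute_theil_index

    # Toy per-entity block counts for one time chunk
    time_chunk_blocks_per_entity = {'pool_a': 40, 'pool_b': 30, 'pool_c': 20, 'pool_d': 10}

    # Sort once, descending, then hand the same list to every metric
    sorted_blocks = sorted(time_chunk_blocks_per_entity.values(), reverse=True)

    print(compute_tau_index(sorted_blocks, 0.5))  # 2: the top two entities cover 70% >= 50%
    print(compute_theil_index(sorted_blocks))     # inequality of the same distribution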

consensus_decentralization/metrics/entropy.py

Lines changed: 6 additions & 7 deletions

@@ -1,18 +1,17 @@
 from math import log


-def compute_entropy(blocks_per_entity, alpha):
+def compute_entropy(block_distribution, alpha):
     """
     Calculates the entropy of a distribution of blocks to entities
     Pi is the relative frequency of each entity.
     Renyi entropy: 1/(1-alpha) * log2 (sum (Pi**alpha))
     Shannon entropy (alpha=1): −sum P(Si) log2 (Pi)
     Min entropy (alpha=-1): -log max Pi
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :param alpha: the entropy parameter (depending on its value the corresponding entropy measure is used)
     :returns: a float that represents the entropy of the data or None if the data is empty
     """
-    block_distribution = blocks_per_entity.values()
     all_blocks = sum(block_distribution)
     if all_blocks == 0:
         return None
@@ -35,13 +34,13 @@ def compute_entropy(blocks_per_entity, alpha):


 def compute_max_entropy(num_entities, alpha):
-    return compute_entropy({i: 1 for i in range(num_entities)}, alpha)
+    return compute_entropy([1 for i in range(num_entities)], alpha)


-def compute_entropy_percentage(blocks_per_entity, alpha):
-    if sum(blocks_per_entity.values()) == 0:
+def compute_entropy_percentage(block_distribution, alpha):
+    if sum(block_distribution) == 0:
        return None
    try:
-       return compute_entropy(blocks_per_entity, alpha) / compute_max_entropy(len(blocks_per_entity), alpha)
+       return compute_entropy(block_distribution, alpha) / compute_max_entropy(len(block_distribution), alpha)
    except ZeroDivisionError:
        return 0
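A quick sanity check of the new list-based interface (a sketch, assuming the package is importable; expected values computed by hand from the docstring formulas):

    from consensus_decentralization.metrics.entropy import compute_entropy, compute_entropy_percentage

    # Descending-sorted toy distribution: relative frequencies 0.75 and 0.25
    block_distribution = [3, 1]

    # Shannon entropy (alpha=1): -(0.75*log2(0.75) + 0.25*log2(0.25)) ~ 0.811
    print(compute_entropy(block_distribution, 1))

    # Ratio to the max entropy of 2 entities (log2(2) = 1), so also ~ 0.811
    print(compute_entropy_percentage(block_distribution, 1))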

consensus_decentralization/metrics/gini.py

Lines changed: 4 additions & 4 deletions

@@ -1,15 +1,15 @@
 import numpy as np


-def compute_gini(blocks_per_entity):
+def compute_gini(block_distribution):
     """
     Calculates the Gini coefficient of a distribution of blocks to entities
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :returns: a float that represents the Gini coefficient of the given distribution or None if the data is empty
     """
-    if sum(blocks_per_entity.values()) == 0:
+    if sum(block_distribution) == 0:
        return None
-    array = np.array(list(blocks_per_entity.values()))
+    array = np.array(block_distribution)
    return gini(array)
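Usage is unchanged apart from the argument type. A sketch (exact values depend on the repo's internal gini() helper, so only the qualitative behavior is asserted):

    from consensus_decentralization.metrics.gini import compute_gini

    print(compute_gini([5, 5, 5, 5]))    # equal split: coefficient at or near 0
    print(compute_gini([100, 0, 0, 0]))  # one producer has everything: close to 1
    print(compute_gini([0, 0]))          # no blocks at all: None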
consensus_decentralization/metrics/hhi.py

Lines changed: 5 additions & 5 deletions

@@ -1,20 +1,20 @@
-def compute_hhi(blocks_per_entity):
+def compute_hhi(block_distribution):
     """
     Calculates the Herfindahl-Hirschman index of a distribution of blocks to entities
     From investopedia: The HHI is calculated by squaring the market share of each firm competing in a market and then
     summing the resulting numbers. It can range from close to 0 to 10,000, with lower values indicating a less
     concentrated market. The U.S. Department of Justice considers a market with an HHI of less than 1,500 to be a
     competitive marketplace, an HHI of 1,500 to 2,500 to be a moderately concentrated marketplace,
     and an HHI of 2,500 or greater to be a highly concentrated marketplace.
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :return: float between 0 and 10,000 that represents the HHI of the given distribution or None if the data is empty
     """
-    total_blocks = sum(blocks_per_entity.values())
+    total_blocks = sum(block_distribution)
    if total_blocks == 0:
        return None

    hhi = 0
-   for num_blocks in blocks_per_entity.values():
-       hhi += pow(num_blocks / total_blocks * 100, 2)
+   for num_blocks in block_distribution:
+       hhi += pow(100 * num_blocks / total_blocks, 2)

    return hhi
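The reordered expression 100 * num_blocks / total_blocks computes the same percentage market share as before. A worked check, restated standalone so it runs without the package (illustration only):

    def hhi(block_distribution):
        # Standalone restatement of compute_hhi, for illustration only
        total_blocks = sum(block_distribution)
        if total_blocks == 0:
            return None
        return sum(pow(100 * n / total_blocks, 2) for n in block_distribution)

    print(hhi([50, 50]))          # 5000.0: an equal-share duopoly
    print(hhi([25, 25, 25, 25]))  # 2500.0: four equal producers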
consensus_decentralization/metrics/max_power_ratio.py

Lines changed: 4 additions & 7 deletions

@@ -1,11 +1,8 @@
-def compute_max_power_ratio(blocks_per_entity):
+def compute_max_power_ratio(block_distribution):
     """
     Calculates the maximum power ratio of a distribution of balances
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :returns: float that represents the maximum power ratio among all block producers (0 if there weren't any)
     """
-    if len(blocks_per_entity) == 0:
-        return 0
-    max_nblocks = max(blocks_per_entity.values())
-    total_blocks = sum(blocks_per_entity.values())
-    return max_nblocks / total_blocks if total_blocks > 0 else 0
+    total_blocks = sum(block_distribution)
+    return block_distribution[0] / total_blocks if total_blocks else 0
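The simplified version leans on the new contract that block_distribution is sorted in descending order, so the largest producer is simply index 0. The conditional expression also still covers the empty case, because block_distribution[0] is only evaluated when total_blocks is non-zero. A standalone check (restated for illustration):

    def max_power_ratio(block_distribution):
        # Restatement of compute_max_power_ratio; assumes descending order
        total_blocks = sum(block_distribution)
        return block_distribution[0] / total_blocks if total_blocks else 0

    print(max_power_ratio([40, 30, 20, 10]))  # 0.4
    print(max_power_ratio([]))                # 0, without an IndexError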
consensus_decentralization/metrics/nakamoto_coefficient.py

Lines changed: 3 additions & 3 deletions

@@ -1,10 +1,10 @@
 from consensus_decentralization.metrics.tau_index import compute_tau_index


-def compute_nakamoto_coefficient(blocks_per_entity):
+def compute_nakamoto_coefficient(block_distribution):
     """
     Calculates the Nakamoto coefficient of a distribution of blocks to entities
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :returns: int that represents the Nakamoto coefficient of the given distribution, or None if the data is empty
     """
-    return compute_tau_index(blocks_per_entity=blocks_per_entity, threshold=0.5)
+    return compute_tau_index(block_distribution, 0.5)
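Since the Nakamoto coefficient is just the tau index at a 50% threshold, a quick check can go straight through compute_tau_index (the import path is the one shown above; the distribution is a toy):

    from consensus_decentralization.metrics.tau_index import compute_tau_index

    # Top entity holds 40% (< 50%); top two hold 70% (>= 50%) -> coefficient 2
    print(compute_tau_index([40, 30, 20, 10], 0.5))  # 2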
consensus_decentralization/metrics/tau_index.py

Lines changed: 7 additions & 8 deletions

@@ -1,19 +1,18 @@
-def compute_tau_index(blocks_per_entity, threshold):
+def compute_tau_index(block_distribution, threshold):
     """
     Calculates the tau-decentralization index of a distribution of blocks
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power
        ratio that is captured by the index (e.g. 0.66 for 66%)
     :returns: int that corresponds to the tau index of the given distribution, or None if there were no blocks
     """
-    total_blocks = sum(blocks_per_entity.values())
+    total_blocks = sum(block_distribution)
    if total_blocks == 0:
        return None
    tau_index, power_ratio_covered = 0, 0
-   blocks_per_entity_copy = blocks_per_entity.copy()
-   while power_ratio_covered < threshold:
-       current_max_entity = max(blocks_per_entity_copy, key=blocks_per_entity_copy.get)
+   for block_amount in block_distribution:
+       if power_ratio_covered >= threshold:
+           break
        tau_index += 1
-       power_ratio_covered += blocks_per_entity_copy[current_max_entity] / total_blocks
-       del blocks_per_entity_copy[current_max_entity]
+       power_ratio_covered += block_amount / total_blocks
    return tau_index
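The rewrite replaces repeated max()-and-delete over a dict copy (quadratic, plus an extra copy per call) with a single pass over the already-sorted list. A behavior check on a toy distribution (values verified by hand against the loop above):

    from consensus_decentralization.metrics.tau_index import compute_tau_index

    # 40% < 90%, 70% < 90%, 90% >= 90% after three entities -> tau = 3
    print(compute_tau_index([40, 30, 20, 10], 0.9))  # 3
    print(compute_tau_index([0, 0], 0.5))            # None: no blocks produced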

consensus_decentralization/metrics/theil_index.py

Lines changed: 5 additions & 5 deletions

@@ -1,19 +1,19 @@
 from math import log


-def compute_theil_index(blocks_per_entity):
+def compute_theil_index(block_distribution):
     """
     Calculates the Thiel index of a distribution of blocks to entities
-    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
     :returns: float that represents the Thiel index of the given distribution
     """
-    n = len(blocks_per_entity)
+    n = len(block_distribution)
    if n == 0:
        return 0
-   total_blocks = sum(blocks_per_entity.values())
+   total_blocks = sum(block_distribution)
    mu = total_blocks / n
    theil = 0
-   for nblocks in blocks_per_entity.values():
+   for nblocks in block_distribution:
        x = nblocks / mu
        if x > 0:
            theil += x * log(x)
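A minimal check of the edge cases that the new list interface keeps intact (a sketch, assuming the package is importable):

    from consensus_decentralization.metrics.theil_index import compute_theil_index

    # Equal distribution: every x = nblocks/mu is 1 and log(1) = 0, so the index is 0
    print(compute_theil_index([10, 10, 10, 10]))  # 0.0
    print(compute_theil_index([]))                # 0, via the n == 0 early return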

run.py

Lines changed: 11 additions & 8 deletions

@@ -10,6 +10,13 @@
 logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)


+def process_data(force_map, project_dir, project, output_dir):
+    mapped_data_file = project_dir / 'mapped_data.json'
+    if force_map or not mapped_data_file.is_file():
+        parsed_data = parse(project=project, input_dir=hlp.RAW_DATA_DIR)
+        apply_mapping(project=project, parsed_data=parsed_data, output_dir=output_dir)
+
+
 def main(projects, timeframe, aggregate_by, force_map, make_plots, make_animated_plots, output_dir=hlp.OUTPUT_DIR):
     """
     Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
@@ -28,19 +35,15 @@ def main(projects, timeframe, aggregate_by, force_map, make_plots, make_animated_plots, output_dir=hlp.OUTPUT_DIR):
    for project in projects:
        project_dir = output_dir / project
        project_dir.mkdir(parents=True, exist_ok=True)  # create project output directory if it doesn't already exist
-       mapped_data_file = project_dir / 'mapped_data.json'
-       if force_map or not mapped_data_file.is_file():
-           parsed_data = parse(project=project, input_dir=hlp.RAW_DATA_DIR)
-           mapped_data = apply_mapping(project=project, parsed_data=parsed_data, output_dir=output_dir)
-       else:
-           mapped_data = None
+
+       process_data(force_map, project_dir, project, output_dir)
+
        aggregate(
            project=project,
            output_dir=output_dir,
            timeframe=timeframe,
            aggregate_by=aggregate_by,
-           force_aggregate=force_map,
-           mapped_data=mapped_data
+           force_aggregate=force_map
        )

        used_metrics = analyze(
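Extracting process_data means the parsed and mapped structures are only referenced inside that function's frame, so CPython can reclaim them as soon as it returns, before aggregation allocates its own data; previously mapped_data stayed alive in main() for the rest of the loop iteration. A toy illustration of the scoping effect (not the project's code):

    import gc

    def build_and_write():
        big = list(range(10_000_000))  # stand-in for parsed/mapped data
        # ... persist `big` to disk here ...
        # `big` becomes unreachable once this frame is popped

    def main():
        build_and_write()
        gc.collect()  # the big list is collectable here; it would not be
                      # if main() still held a reference to it

    main()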
