
Commit 200af0d

Add configurable population window

Parent: 420b537

File tree: 5 files changed (+63, -29 lines)

- config.yaml
- consensus_decentralization/analyze.py
- consensus_decentralization/helper.py
- run.py
- tests/test_helper.py

config.yaml (7 additions, 13 deletions)

```diff
@@ -38,8 +38,8 @@ analyze_flags:
 # The timeframe for which an analysis should be performed.
 # Each date is a string of the form YYYY-MM-DD.
 timeframe:
-  start_date: 2011-01-01
-  end_date: 2023-12-31
+  start_date: 2018-01-01
+  end_date: 2025-03-01
 
 # The number of days to use for the estimation window, i.e.how many days of blocks to use for each data point.
 # If left empty, then the entire time frame will be used (only valid when combined with empty frequency).
@@ -48,19 +48,13 @@ estimation_window: 30
 # How frequently to sample the data, in days
 # If left empty, then only one data point will be analyzed (snapshot instead of longitudinal analysis), but this is
 # only valid when combined with an empty estimation_window.
-frequency: 30 # todo maybe add hadrcoded values for day, week, month, year (in the code that parses this) + for the estimation window
-
-
-input_directories: # Paths to directories that contain raw input data
-  - ./input
-
-# Paths to directories of snapshot db files; either absolute or relative from run.py.
-# The first path will be used to write newly created dbs and the output of runs
-output_directories:
-  - ./output
+frequency: 30
 
+# A number that specifies how many windows to look back and forward when deciding whether an entity is active on a
+# given time period, or 'all' to count all entities that have produced blocks in the entire observation period.
+population_windows: 1
 
 # Plot flags
 plot_parameters:
   plot: false
-  animated: true
+  animated: false
```
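For intuition: with estimation_window: 30, frequency: 30 and population_windows: 1, each data point covers 30 days of blocks, and an entity with zero blocks in a given window still counts towards that window's population if it produced at least one block in the adjacent window on either side. A toy sketch of that calendar arithmetic, assuming back-to-back windows (frequency equal to estimation_window); none of these variable names come from the repository:

```python
from datetime import date, timedelta

start = date(2018, 1, 1)   # start_date from config.yaml
estimation_window = 30     # days of blocks per data point
population_windows = 1     # windows to look back and forward

k = 5  # an arbitrary data point index
window_start = start + timedelta(days=k * estimation_window)
# Earliest and latest dates on which a block keeps an entity in window k's population
earliest = window_start - timedelta(days=population_windows * estimation_window)
latest = window_start + timedelta(days=(population_windows + 1) * estimation_window)
print(earliest, '->', latest)  # 2018-05-01 -> 2018-07-30
```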

consensus_decentralization/analyze.py (6 additions, 4 deletions)

```diff
@@ -11,7 +11,7 @@
 from consensus_decentralization.metrics.total_entities import compute_total_entities  # noqa: F401
 
 
-def analyze(projects, aggregated_data_filename, output_dir):
+def analyze(projects, aggregated_data_filename, input_dir, output_dir, population_windows):
     """
     Calculates all available metrics for the given ledgers and timeframes. Outputs one file for each metric.
     :param projects: list of strings that correspond to the ledgers whose data should be analyzed
@@ -20,6 +20,7 @@ def analyze(projects, aggregated_data_filename, output_dir):
 
     Using multiple projects and timeframes is necessary here to produce collective csv files.
     """
+
     logging.info('Calculating metrics on aggregated data..')
     metrics = hlp.get_metrics_config()
     metric_params = []
@@ -30,6 +31,7 @@ def analyze(projects, aggregated_data_filename, output_dir):
         else:
             metric_params.append((key, key, None))
     metric_names = [name for name, _, _ in metric_params]
+    clustering_flag = hlp.get_clustering_flag()
 
     aggregate_output = {}
 
@@ -42,8 +44,9 @@
     for column_index, project in enumerate(projects):
         logging.info(f'Calculating {project} metrics')
         aggregate_output[project] = {}
-        aggregated_data_dir = output_dir / project / 'blocks_per_entity'
-        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir / aggregated_data_filename)
+        aggregated_data_dir = input_dir / project / hlp.get_aggregated_data_dir_name(clustering_flag)
+        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir /
+                                                                       aggregated_data_filename, population_windows)
         for date in dates:
             aggregate_output[project][date] = {}
 
@@ -80,7 +83,6 @@
         csv_writer = csv.writer(f)
         csv_writer.writerows(csv_contents[metric])
 
-    clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
     aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
     for project, timeframes in aggregate_output.items():
         for date, results in timeframes.items():
```
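Callers must now supply the directory that holds the aggregated data and the population window radius. A minimal sketch of the new call, with hypothetical ledger names, paths and filename (in the real pipeline run.py derives these from the config):

```python
from pathlib import Path

from consensus_decentralization.analyze import analyze

# Hypothetical arguments for illustration only; none of these literal values
# are fixed by the repository.
used_metrics = analyze(
    projects=['bitcoin', 'ethereum'],
    aggregated_data_filename='blocks_per_entity.csv',
    input_dir=Path('interim'),    # directory with per-project aggregated data
    output_dir=Path('results'),   # directory where per-metric csv files are written
    population_windows=1,
)
```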

consensus_decentralization/helper.py (29 additions, 2 deletions)

```diff
@@ -191,11 +191,13 @@ def write_blocks_per_entity_to_file(output_dir, blocks_per_entity, dates, filena
         csv_writer.writerow(entity_row)
 
 
-def get_blocks_per_entity_from_file(filepath):
+def get_blocks_per_entity_from_file(filepath, population_windows):
     """
     Retrieves information about the number of blocks that each entity produced over some timeframe for some project.
     :param filepath: the path to the file with the relevant information. It can be either an absolute or a relative
         path in either a pathlib.PosixPath object or a string.
+    :param population_windows: int representing the number of windows to look back and forward when determining if an
+        entity is active during a certain time frame
     :returns: a tuple of length 2 where the first item is a list of time chunks (strings) and the second item is a
         dictionary with entities (keys) and a list of the number of blocks they produced during each time chunk (values)
     """
@@ -207,7 +209,17 @@ def get_blocks_per_entity_from_file(filepath):
     for row in csv_reader:
         entity = row[0]
         for idx, item in enumerate(row[1:]):
-            if item != '0':
+            if item == '0':
+                if population_windows == 'all':
+                    blocks_per_entity[entity][dates[idx]] = 0
+                else:
+                    # If the entity hasn't produced any blocks in the current time chunk, we only consider it as
+                    # active if it has produced at least one block in population_windows time chunks before or after
+                    # (otherwise it's not considered part of the population for this time frame)
+                    for i in range(max(0, idx - population_windows), min(len(row) - 1, idx + population_windows + 1)):
+                        if row[i + 1] != '0':
+                            blocks_per_entity[entity][dates[idx]] = 0
+            else:
                 blocks_per_entity[entity][dates[idx]] = int(item)
     return dates, blocks_per_entity
 
@@ -373,6 +385,21 @@ def get_estimation_window_and_frequency():
         raise ValueError('"estimation_window" or "frequency" missing from config file')
 
 
+def get_population_windows():
+    """
+    Retrieves the number of windows to be used for estimating the population of block producers
+    :returns: int representing the number of windows to look back and forward when determining if an entity is active
+        during a certain time frame
+    :raises ValueError: if the population_windows field is missing from the config file
+    """
+    try:
+        config = get_config_data()
+        population_windows = config['population_windows']
+        return population_windows
+    except KeyError:
+        raise ValueError('"population_windows" missing from config file')
+
+
 def get_plot_flag():
     """
     Gets the flag that determines whether generate plots for the output
```
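To make the new population rule concrete, here is a self-contained sketch of the same look-back/look-forward check on an in-memory list of counts. is_in_population is a hypothetical helper, not part of the repository, and it works on ints rather than the CSV strings the real function reads:

```python
def is_in_population(counts, idx, population_windows):
    """Return True if the entity counts as active at time chunk idx.

    counts: per-chunk block counts for one entity (ints).
    population_windows: look-back/look-forward radius in chunks, or 'all'.
    """
    if counts[idx] > 0:
        return True  # produced blocks in this chunk, trivially active
    if population_windows == 'all':
        # Every entity that produced blocks anywhere in the period counts everywhere
        return True
    # Otherwise, active only if it produced at least one block within
    # population_windows chunks before or after the current one
    lo = max(0, idx - population_windows)
    hi = min(len(counts), idx + population_windows + 1)
    return any(counts[i] > 0 for i in range(lo, hi))


counts = [2, 0, 0, 0, 1]  # one entity's blocks across five consecutive chunks
print([is_in_population(counts, i, population_windows=1) for i in range(len(counts))])
# [True, True, False, True, True]: the middle chunk has no non-zero neighbour
# within one window, so the entity drops out of the population there
```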

run.py (5 additions, 3 deletions)

```diff
@@ -18,7 +18,7 @@ def process_data(force_map, ledger_dir, ledger, output_dir):
         return None
 
 
-def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTERIM_DIR,
+def main(ledgers, timeframe, estimation_window, frequency, population_windows, interim_dir=hlp.INTERIM_DIR,
          results_dir=hlp.RESULTS_DIR):
     """
     Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
@@ -59,6 +59,7 @@ def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTER
     used_metrics = analyze(
         projects=ledgers,
         aggregated_data_filename=aggregated_data_filename,
+        population_windows=population_windows,
        input_dir=interim_dir,
        output_dir=metrics_dir
     )
@@ -80,8 +81,9 @@ def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTER
     ledgers = hlp.get_ledgers()
 
     estimation_window, frequency = hlp.get_estimation_window_and_frequency()
+    population_windows = hlp.get_population_windows()
 
-    results_dir = hlp.get_results_dir(estimation_window, frequency)
+    results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
     results_dir.mkdir(parents=True, exist_ok=True)
 
     start_date, end_date = hlp.get_start_end_dates()
@@ -92,6 +94,6 @@ def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTER
         'the first date.')
     timeframe = (timeframe_start, timeframe_end)
 
-    main(ledgers, timeframe, estimation_window, frequency, results_dir=results_dir)
+    main(ledgers, timeframe, estimation_window, frequency, population_windows, results_dir=results_dir)
 
     logging.info('Done. Please check the output directory for results.')
```
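Putting it together, the entry point now reads the option once and threads it through the pipeline. A condensed, illustrative sketch of that wiring (timeframe validation omitted; the overwrite-avoidance rationale is an assumption, not stated in the commit):

```python
import consensus_decentralization.helper as hlp
from run import main

ledgers = hlp.get_ledgers()
estimation_window, frequency = hlp.get_estimation_window_and_frequency()
population_windows = hlp.get_population_windows()

# Folding population_windows into the results path presumably keeps runs with
# different settings from overwriting each other's output.
results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
results_dir.mkdir(parents=True, exist_ok=True)

start_date, end_date = hlp.get_start_end_dates()
main(ledgers, (start_date, end_date), estimation_window, frequency,
     population_windows, results_dir=results_dir)
```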

tests/test_helper.py (16 additions, 7 deletions)

```diff
@@ -82,16 +82,25 @@ def test_committed_pool_data():
 def test_write_read_blocks_per_entity(setup_and_cleanup):
     output_dir = setup_and_cleanup
 
-    blocks_per_entity = {'Entity 1': {'2018': 1, '2019': 3}, 'Entity 2': {'2018': 2, '2019': 2}}
+    blocks_per_entity = {
+        'Entity 1': {'2018': 1, '2019': 3, '2020': 2, '2021': 3},
+        'Entity 2': {'2018': 2, '2019': 2, '2021': 1},
+        'Entity 3': {'2018': 2},
+        'Entity 4': {'2021': 1}
+    }
 
-    write_blocks_per_entity_to_file(output_dir=output_dir, blocks_per_entity=blocks_per_entity, dates=['2018', '2019'],
-                                    filename='test.csv')
+    write_blocks_per_entity_to_file(output_dir=output_dir, blocks_per_entity=blocks_per_entity,
+                                    dates=['2018', '2019', '2020', '2021'], filename='test.csv')
 
-    dates, bpe = get_blocks_per_entity_from_file(output_dir / 'test.csv')
+    dates, bpe = get_blocks_per_entity_from_file(output_dir / 'test.csv', population_windows=1)
 
-    assert all(len(nblocks) == len(dates) for nblocks in bpe.values())
-    assert dates == ['2018', '2019']
-    assert all([bpe['Entity 1'] == {'2018': 1, '2019': 3}, bpe['Entity 2'] == {'2018': 2, '2019': 2}])
+    assert dates == ['2018', '2019', '2020', '2021']
+    assert all([
+        bpe['Entity 1'] == {'2018': 1, '2019': 3, '2020': 2, '2021': 3},
+        bpe['Entity 2'] == {'2018': 2, '2019': 2, '2020': 0, '2021': 1},
+        bpe['Entity 3'] == {'2018': 2, '2019': 0},
+        bpe['Entity 4'] == {'2020': 0, '2021': 1}
+    ])
 
 
 def test_valid_date():
```
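With population_windows=1, these expectations follow directly from the new rule. Entity 2 has no blocks in 2020 but produced blocks in 2019 and 2021, both within one window of 2020, so it stays in the population there with an explicit 0. Entity 3 last produced blocks in 2018, so it receives a 0 for 2019 (one window after its last block) and then drops out of the population for 2020 and 2021. Entity 4 first produced blocks in 2021, so it enters the population in 2020 and is absent before that. This is also why the old assertion that every entity has a value for every date was removed: entities may now be missing from the population in some windows.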
