Skip to content

Commit 7c6b3e8

Browse files
committed
Update dir structure of produced files
1 parent 6bbd1fb commit 7c6b3e8

File tree

11 files changed

+120
-55
lines changed

11 files changed

+120
-55
lines changed

consensus_decentralization/helper.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@
1414

1515
ROOT_DIR = pathlib.Path(__file__).resolve().parent.parent
1616
RAW_DATA_DIR = ROOT_DIR / 'raw_block_data'
17-
OUTPUT_DIR = ROOT_DIR / 'output'
17+
INTERIM_DIR = ROOT_DIR / 'processed_data'
1818
MAPPING_INFO_DIR = ROOT_DIR / 'mapping_information'
19+
RESULTS_DIR = ROOT_DIR / 'results'
1920

2021
with open(ROOT_DIR / "config.yaml") as f:
2122
config = safe_load(f)
@@ -294,7 +295,7 @@ def read_mapped_project_data(project_dir):
294295
:param project_dir: pathlib.PosixPath object of the output directory corresponding to the project
295296
:returns: a dictionary with the mapped data
296297
"""
297-
with open(project_dir / 'mapped_data.json') as f:
298+
with open(project_dir / get_mapped_data_filename(get_clustering_flag())) as f:
298299
data = json.load(f)
299300
return data
300301

@@ -309,6 +310,15 @@ def get_representative_dates(time_chunks):
309310
return [str(chunk[0] + (chunk[1] - chunk[0]) // 2) for chunk in time_chunks]
310311

311312

313+
def get_aggregated_data_dir_name(clustering_flag):
    """
    Determines the name of the directory that will contain the aggregated data
    :param clustering_flag: boolean that determines whether the data is clustered or not
    :returns: str that corresponds to the name of the directory
    """
    # Directory name encodes whether clustering was applied, so clustered and
    # non-clustered runs never overwrite each other's output.
    suffix = 'clustered' if clustering_flag else 'non_clustered'
    return f'blocks_per_entity_{suffix}'
320+
321+
312322
def get_blocks_per_entity_filename(timeframe, estimation_window, frequency):
313323
"""
314324
Determines the filename of the csv file that contains the aggregated data
@@ -395,3 +405,35 @@ def get_force_map_flag():
395405
return config['execution_flags']['force_map']
396406
except KeyError:
397407
raise ValueError('Flag "force_map" missing from config file')
408+
409+
410+
def get_clustering_flag():
    """
    Gets the flag that determines whether to perform clustering
    :returns: boolean
    :raises ValueError: if the flag is not set in the config file
    """
    cfg = get_config_data()
    try:
        # A KeyError at either level (missing section or missing key) means
        # the flag is absent from the config file.
        analyze_flags = cfg['analyze_flags']
        return analyze_flags['clustering']
    except KeyError:
        raise ValueError('Flag "clustering" missing from config file')
421+
422+
423+
def get_results_dir(estimation_window, frequency, population_windows):
    """
    Retrieves the path to the results directory for the specific config parameters
    :param estimation_window: int, the length (in days) of each estimation window; used in the directory name
    :param frequency: int, the number of days between sampled data points; used in the directory name
    :param population_windows: int, the number of population windows; used in the directory name
    :returns: pathlib.PosixPath object
    """
    # NOTE(review): a visible call site (run.py __main__) invokes
    # get_results_dir(estimation_window, frequency) with only two arguments,
    # which would raise a TypeError here -- confirm that all callers pass
    # population_windows as well.
    results_dir_name = (f'{estimation_window}_day_window_with_{population_windows}_population_windows_sampled_every'
                        f'_{frequency}_days')
    return RESULTS_DIR / results_dir_name
431+
432+
433+
def get_mapped_data_filename(clustering_flag):
    """
    Retrieves the filename of the mapped data file
    :param clustering_flag: boolean that determines whether the data is clustered or not
    :returns: str
    """
    # Filename encodes the clustering mode so both variants can coexist in the
    # same project directory.
    variant = 'clustered' if clustering_flag else 'non_clustered'
    return f'mapped_data_{variant}.json'

consensus_decentralization/plot.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ def plot_animated_stack_area_chart(values, execution_id, path, ylabel, legend_la
123123
plt.close(fig)
124124

125125

126-
def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, top_k=-1, unit='relative', animated=False, legend=False):
126+
def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, output_dir, top_k=-1, unit='relative', animated=False,
127+
legend=False):
127128
"""
128129
Plots the dynamics of pools for each ledger in terms of produced blocks
129130
:param ledgers: list of strings representing the ledgers whose data will be plotted
@@ -137,13 +138,12 @@ def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, top_k=-1, unit='
137138
:param legend: bool that specifies whether the plots to be generated will include a legend or not
138139
"""
139140
for ledger in ledgers:
140-
ledger_path = hlp.OUTPUT_DIR / ledger
141-
figures_path = ledger_path / 'figures'
142-
if not figures_path.is_dir():
143-
figures_path.mkdir()
141+
ledger_path = hlp.INTERIM_DIR / ledger
142+
figures_path = output_dir / ledger
143+
figures_path.mkdir(parents=True, exist_ok=True)
144144

145145
time_chunks, blocks_per_entity = hlp.get_blocks_per_entity_from_file(
146-
filepath=ledger_path / "blocks_per_entity" / aggregated_data_filename
146+
filepath=ledger_path / hlp.get_aggregated_data_dir_name(hlp.get_clustering_flag()) / aggregated_data_filename
147147
)
148148

149149
total_blocks_per_time_chunk = [0] * len(time_chunks)
@@ -213,13 +213,10 @@ def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, top_k=-1, unit='
213213
)
214214

215215

216-
def plot_comparative_metrics(ledgers, metrics, animated=False):
216+
def plot_comparative_metrics(ledgers, metrics, metrics_dir, output_dir, animated=False):
217217
for metric in metrics:
218-
figures_path = hlp.OUTPUT_DIR / 'figures'
219-
if not figures_path.is_dir():
220-
figures_path.mkdir()
221-
filename = f'{metric}.csv'
222-
metric_df = pd.read_csv(hlp.OUTPUT_DIR / filename)
218+
metric_filepath = metrics_dir / f'{metric}.csv'
219+
metric_df = pd.read_csv(metric_filepath)
223220
# only keep rows that contain at least one (non-nan) value in the columns that correspond to the ledgers
224221
metric_df = metric_df[metric_df.iloc[:, 1:].notna().any(axis=1)]
225222
ledger_columns_to_keep = [col for col in metric_df.columns if col in ledgers]
@@ -233,7 +230,7 @@ def plot_comparative_metrics(ledgers, metrics, animated=False):
233230
x_label='Time',
234231
y_label=metric,
235232
filename=f"{metric}_{'_'.join(ledger_columns_to_keep)}",
236-
path=figures_path,
233+
path=output_dir,
237234
colors=colors
238235
)
239236
else:
@@ -242,24 +239,35 @@ def plot_comparative_metrics(ledgers, metrics, animated=False):
242239
x_label='Time',
243240
y_label=metric,
244241
filename=f"{metric}_{'_'.join(ledger_columns_to_keep)}",
245-
path=figures_path,
242+
path=output_dir,
246243
xtick_labels=metric_df['timeframe'],
247244
colors=colors
248245
)
249246

250247

251-
def plot(ledgers, metrics, aggregated_data_filename, animated):
248+
def plot(ledgers, metrics, aggregated_data_filename, animated, metrics_dir, figures_dir):
252249
logging.info("Creating plots..")
253-
plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename, animated=False, legend=True)
254-
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=False)
250+
#plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename, output_dir=
251+
# figures_dir, animated=False, legend=True)
252+
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=False, metrics_dir=metrics_dir, output_dir=figures_dir)
255253
if animated:
256-
plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename, animated=True)
257-
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=True)
254+
plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename,
255+
output_dir=figures_dir, animated=True)
256+
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=True, metrics_dir=metrics_dir, output_dir=figures_dir)
258257

259258

260259
if __name__ == '__main__':
261260
ledgers = hlp.get_ledgers()
262-
default_metrics = hlp.get_metrics_config().keys()
261+
262+
metrics = hlp.get_metrics_config()
263+
metric_params = []
264+
for key, args in metrics.items():
265+
if args:
266+
for val in args:
267+
metric_params.append((f'{key}={val}', key, val))
268+
else:
269+
metric_params.append((key, key, None))
270+
default_metrics = [name for name, _, _ in metric_params]
263271

264272
default_start_date, default_end_date = hlp.get_start_end_dates()
265273
timeframe_start = hlp.get_timeframe_beginning(default_start_date)
@@ -296,4 +304,4 @@ def plot(ledgers, metrics, aggregated_data_filename, animated):
296304
help='Flag to specify whether to also generate animated plots.'
297305
)
298306
args = parser.parse_args()
299-
plot(ledgers=args.ledgers, metrics=args.metrics, aggregated_data_filename=args.filename, animated=args.animated)
307+
plot(ledgers=args.ledgers, metrics=args.metrics, aggregated_data_filename=args.filename, animated=args.animated, results_dir=hlp.RESULTS_DIR)

docs/aggregator.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# Aggregator
22

3-
The aggregator obtains the mapped data of a ledger (from `output/<project_name>/mapped_data.json`) and aggregates it
4-
over units of time that are determined based on the given `timeframe` and `aggregate_by` parameters.
3+
The aggregator obtains the mapped data of a ledger (from `processed_data/<project_name>/mapped_data_<clustered|non_clustered>.json`,
depending on the clustering flag) and
aggregates it over units of time that are determined based on the given `timeframe` and `aggregate_by` parameters.
55
It then outputs a `csv` file with the distribution of blocks to entities for each time unit under consideration.
6-
This file is saved in the directory `output/<project name>/blocks_per_entity/` and is named based on the `timeframe`
7-
and `aggregate_by` parameters.
6+
This file is saved in the directory `processed_data/<project name>/blocks_per_entity_<clustered|non_clustered>/`
(depending on the clustering flag) and is named based on the
`timeframe` and `aggregate_by` parameters.
88
For example, if the specified timeframe is from June 2023 to September 2023 and the aggregation is by month, then
99
the output file would be named `monthly_from_2023-06-01_to_2023-09-30.csv` and would be structured as follows:
1010
```

docs/mappings.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ A mapping is responsible for linking blocks to the entities that created them. W
44
information about the addresses that received rewards for producing some block or identifiers that are related to them,
55
it does not contain information about the entities that control these addresses, which is where the mapping comes in.
66

7-
The mapping takes as input the parsed data and outputs a file (`output/<project_name>/mapped_data.json`), which is
8-
structured as follows:
7+
The mapping takes as input the parsed data and outputs a file (`processed_data/<project_name>/mapped_data_<clustered|non_clustered>.json`,
depending on the clustering flag),
which is structured as follows:
99

1010
```
1111
[

docs/setup.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@ specified ledger. By default, this flag is set to False and no plots are generat
6565
is set to False and no animated plots are generated. Note that this flag is ignored if `--plot` is set to False.
6666

6767

68-
All output files can then be found under the `output/` directory, which is automatically created the first time the tool
69-
is run.
68+
All output files can then be found under the `results/` directory, which is automatically created the first time the
69+
tool is run. Interim files that are produced by some modules and are used by others can be found under the
70+
`processed_data/` directory.

run.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@
1010

1111

1212
def process_data(force_map, ledger_dir, ledger, output_dir):
13-
mapped_data_file = ledger_dir / 'mapped_data.json'
13+
clustering_flag = hlp.get_clustering_flag()
14+
mapped_data_file = ledger_dir / hlp.get_mapped_data_filename(clustering_flag)
1415
if force_map or not mapped_data_file.is_file():
1516
parsed_data = parse(ledger, input_dir=hlp.RAW_DATA_DIR)
16-
apply_mapping(ledger, parsed_data=parsed_data, output_dir=output_dir)
17+
return apply_mapping(ledger, parsed_data=parsed_data, output_dir=output_dir)
18+
return None
1719

1820

19-
def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT_DIR):
21+
def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTERIM_DIR,
22+
results_dir=hlp.RESULTS_DIR):
2023
"""
2124
Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
2225
:param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
@@ -27,41 +30,49 @@ def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT
2730
:param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
2831
of days between each data point considered in the analysis). If None, only one data point will be considered,
2932
spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
30-
:param output_dir: pathlib.PosixPath object of the directory where the output data will be saved
33+
:param interim_dir: pathlib.PosixPath object of the directory where the output data will be saved
3134
"""
3235
logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
3336

3437
force_map = hlp.get_force_map_flag()
3538

3639
for ledger in ledgers:
37-
ledger_dir = output_dir / ledger
40+
ledger_dir = interim_dir / ledger
3841
ledger_dir.mkdir(parents=True, exist_ok=True) # create ledger output directory if it doesn't already exist
3942

40-
process_data(force_map, ledger_dir, ledger, output_dir)
43+
mapped_data = process_data(force_map, ledger_dir, ledger, interim_dir)
4144

4245
aggregate(
4346
ledger,
44-
output_dir,
47+
interim_dir,
4548
timeframe,
4649
estimation_window,
4750
frequency,
48-
force_map
51+
force_map,
52+
mapped_data=mapped_data
4953
)
5054

5155
aggregated_data_filename = hlp.get_blocks_per_entity_filename(timeframe, estimation_window, frequency)
56+
metrics_dir = results_dir / 'metrics'
57+
metrics_dir.mkdir(parents=True, exist_ok=True)
5258

5359
used_metrics = analyze(
5460
projects=ledgers,
5561
aggregated_data_filename=aggregated_data_filename,
56-
output_dir=output_dir
62+
input_dir=interim_dir,
63+
output_dir=metrics_dir
5764
)
5865

5966
if hlp.get_plot_flag():
67+
figures_dir = results_dir / 'figures'
68+
figures_dir.mkdir(parents=True, exist_ok=True)
6069
plot(
6170
ledgers=ledgers,
6271
metrics=used_metrics,
6372
aggregated_data_filename=aggregated_data_filename,
64-
animated=hlp.get_plot_config_data()['animated']
73+
animated=hlp.get_plot_config_data()['animated'],
74+
metrics_dir=metrics_dir,
75+
figures_dir=figures_dir
6576
)
6677

6778

@@ -70,6 +81,9 @@ def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT
7081

7182
estimation_window, frequency = hlp.get_estimation_window_and_frequency()
7283

84+
results_dir = hlp.get_results_dir(estimation_window, frequency)
85+
results_dir.mkdir(parents=True, exist_ok=True)
86+
7387
start_date, end_date = hlp.get_start_end_dates()
7488
timeframe_start = hlp.get_timeframe_beginning(start_date)
7589
timeframe_end = hlp.get_timeframe_end(end_date)
@@ -78,6 +92,6 @@ def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT
7892
'the first date.')
7993
timeframe = (timeframe_start, timeframe_end)
8094

81-
main(ledgers, timeframe, estimation_window, frequency)
95+
main(ledgers, timeframe, estimation_window, frequency, results_dir=results_dir)
8296

8397
logging.info('Done. Please check the output directory for results.')

tests/test_aggregate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import shutil
44
import pytest
5-
from consensus_decentralization.helper import OUTPUT_DIR
5+
from consensus_decentralization.helper import INTERIM_DIR
66
from consensus_decentralization.aggregate import aggregate, Aggregator, divide_timeframe
77

88

@@ -14,7 +14,7 @@ def setup_and_cleanup():
1414
after (cleanup)
1515
"""
1616
# Set up
17-
test_io_dir = OUTPUT_DIR / "test_output"
17+
test_io_dir = INTERIM_DIR / "test_output"
1818
yield test_io_dir
1919
# Clean up
2020
shutil.rmtree(test_io_dir)

tests/test_analyze.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import shutil
22
import pytest
3-
from consensus_decentralization.helper import OUTPUT_DIR
3+
from consensus_decentralization.helper import INTERIM_DIR
44
from consensus_decentralization.analyze import analyze
55

66

@@ -12,7 +12,7 @@ def setup_and_cleanup():
1212
after (cleanup)
1313
"""
1414
# Set up
15-
test_io_dir = OUTPUT_DIR / "test_output"
15+
test_io_dir = INTERIM_DIR / "test_output"
1616
test_bitcoin_dir = test_io_dir / "sample_bitcoin"
1717
test_bitcoin_dir.mkdir(parents=True, exist_ok=True)
1818
# create files that would be the output of aggregation

tests/test_end_to_end.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from consensus_decentralization.map import ledger_mapping
1010
from consensus_decentralization.mappings.default_mapping import DefaultMapping
1111
from consensus_decentralization.mappings.cardano_mapping import CardanoMapping
12-
from consensus_decentralization.helper import OUTPUT_DIR, config
12+
from consensus_decentralization.helper import INTERIM_DIR, config
1313
import pytest
1414

1515

@@ -21,7 +21,7 @@ def setup_and_cleanup():
2121
after (cleanup)
2222
"""
2323
# Set up
24-
test_output_dir = OUTPUT_DIR / "test_output"
24+
test_output_dir = INTERIM_DIR / "test_output"
2525
ledger_mapping['sample_bitcoin'] = DefaultMapping
2626
ledger_parser['sample_bitcoin'] = DefaultParser
2727
ledger_mapping['sample_cardano'] = CardanoMapping
@@ -81,7 +81,7 @@ def test_end_to_end(setup_and_cleanup):
8181
(datetime.date(2010, 1, 1), datetime.date(2010, 12, 31)),
8282
estimation_window=None,
8383
frequency=None,
84-
output_dir=test_output_dir
84+
interim_dir=test_output_dir
8585
)
8686

8787
expected_entropy = [
@@ -116,7 +116,7 @@ def test_end_to_end(setup_and_cleanup):
116116
(datetime.date(2018, 2, 1), datetime.date(2018, 3, 31)),
117117
estimation_window=30,
118118
frequency=30,
119-
output_dir=test_output_dir
119+
interim_dir=test_output_dir
120120
)
121121

122122
expected_entropy = [
@@ -154,7 +154,7 @@ def test_end_to_end(setup_and_cleanup):
154154
(datetime.date(2020, 12, 1), datetime.date(2020, 12, 31)),
155155
estimation_window=31,
156156
frequency=31,
157-
output_dir=test_output_dir
157+
interim_dir=test_output_dir
158158
)
159159

160160
expected_entropy = [

0 commit comments

Comments
 (0)