Update execution parameter handling (#154)

dimkarakostas · LadyChristina · web-flow · commit 95d757a6903c · 2024-06-13T14:19:26.000+01:00
* Remove cmd args and move all params to config file

* Validate dates given in config

* Update get_granularity return value when empty

* Fix docstring

---------

Co-authored-by: LadyChristina <christina_ovezik@outlook.com>
diff --git a/config.yaml b/config.yaml
@@ -1,3 +1,4 @@
+# The metrics for which an analysis should be performed.
 metrics:
   entropy:
     - 1
@@ -12,14 +13,8 @@ metrics:
     - 0.33
     - 0.66
 
-analyze_flags:
-  clustering: true
-
-default_timeframe:
-  start_date: 2010-01-01
-  end_date: 2023-12-31
-
-default_ledgers:
+# The ledgers for which an analysis should be performed.
+ledgers:
   - bitcoin
   - bitcoin_cash
   - cardano
@@ -28,3 +23,36 @@ default_ledgers:
   - litecoin
   - tezos
   - zcash
+
+# Execution flags
+execution_flags:
+  force_map: false
+
+# Analyze flags
+analyze_flags:
+  clustering: true
+
+# The timeframe for which an analysis should be performed.
+# Each date is a string of the form YYYY-MM-DD.
+# If granularity is also set, then the analysis will run on the timeframe of the two farthest snapshots.
+timeframe:
+  start_date: 2010-01-01
+  end_date: 2023-12-31
+
+# The granularity for the analysis over the timeframe defined above (from start_date to end_date).
+# It can be one of: "day", "week", "month", "year", or empty. If empty, then no aggregation is performed and the whole timeframe is treated as a single unit of time.
+granularity: "month"
+
+input_directories:  # Paths to directories that contain raw input data
+  - ./input
+
+# Paths to directories of snapshot db files; either absolute or relative from run.py.
+# The first path will be used to write newly created dbs and the output of runs
+output_directories:  
+  - ./output
+
+
+# Plot flags
+plot_parameters:
+  plot: false
+  animated: true
diff --git a/consensus_decentralization/collect_data.py b/consensus_decentralization/collect_data.py
@@ -54,7 +54,7 @@ def collect_data(ledgers, force_query):
 if __name__ == '__main__':
     logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)
 
-    default_ledgers = hlp.get_default_ledgers()
+    default_ledgers = hlp.get_ledgers()
 
     parser = argparse.ArgumentParser()
     parser.add_argument(
diff --git a/consensus_decentralization/helper.py b/consensus_decentralization/helper.py
@@ -265,24 +265,27 @@ def get_metrics_config():
     return metrics
 
 
-def get_default_ledgers():
+def get_ledgers():
     """
-    Retrieves data regarding the default ledgers to use
+    Retrieves data regarding the ledgers to use
     :returns: a list of strings that correspond to the ledgers that will be used (unless overriden by the relevant cmd
     arg)
     """
     config = get_config_data()
-    ledgers = config['default_ledgers']
+    ledgers = config['ledgers']
     return ledgers
 
 
-def get_default_start_end_dates():
+def get_start_end_dates():
     """
     Retrieves the start and end dates for which to analyze data
     :returns: a tuple of two strings, (<start date>, <end date>)
     """
     config = get_config_data()
-    return str(config['default_timeframe']['start_date']), str(config['default_timeframe']['end_date'])
+    for date_type in ['start', 'end']:
+        if not valid_date(str(config['timeframe'][f'{date_type}_date'])):
+            raise ValueError(f'Invalid {date_type} date')
+    return str(config['timeframe']['start_date']), str(config['timeframe']['end_date'])
 
 
 def read_mapped_project_data(project_dir):
@@ -361,3 +364,58 @@ def get_date_from_block(block, level='day'):
     elif level == 'day':
         return timestamp[:10]
     raise ValueError(f'Invalid level: {level}')
+
+
+def get_granularity():
+    """
+    Retrieves the granularity to be used in the analysis
+    :returns: string in ['day', 'week', 'month', 'year'] that represents the chosen granularity
+    or 'all' if the relevant field is empty in the config file
+    :raises ValueError: if the granularity field is missing from the config file or if
+    the chosen value is not one of the allowed ones
+    """
+    try:
+        granularity = get_config_data()['granularity']
+        if granularity:
+            if granularity in ['day', 'week', 'month', 'year']:
+                return granularity
+            else:
+                raise ValueError('Malformed "granularity" in config; should be one of: "day", "week", "month", "year", or empty')
+        else:
+            return 'all'
+    except KeyError:
+        raise ValueError('"granularity" not in config file')
+
+
+def get_plot_flag():
+    """
+    Gets the flag that determines whether to generate plots for the output
+    :returns: boolean
+    :raises ValueError: if the flag is not set in the config file
+    """
+    config = get_config_data()
+    try:
+        return config['plot_parameters']['plot']
+    except KeyError:
+        raise ValueError('Flag "plot" not in config file')
+
+
+def get_plot_config_data():
+    """
+    Retrieves the plot-related config parameters
+    :returns: dictionary
+    """
+    return get_config_data()['plot_parameters']
+
+
+def get_force_map_flag():
+    """
+    Gets the flag that determines whether to forcefully map the data
+    :returns: boolean
+    :raises ValueError: if the flag is not set in the config file
+    """
+    config = get_config_data()
+    try:
+        return config['execution_flags']['force_map']
+    except KeyError:
+        raise ValueError('Flag "force_map" not in config file')
diff --git a/consensus_decentralization/plot.py b/consensus_decentralization/plot.py
@@ -258,10 +258,10 @@ def plot(ledgers, metrics, aggregated_data_filename, animated):
 
 
 if __name__ == '__main__':
-    default_ledgers = hlp.get_default_ledgers()
+    ledgers = hlp.get_ledgers()
     default_metrics = hlp.get_metrics_config().keys()
 
-    default_start_date, default_end_date = hlp.get_default_start_end_dates()
+    default_start_date, default_end_date = hlp.get_start_end_dates()
     timeframe_start = hlp.get_timeframe_beginning(default_start_date)
     timeframe_end = hlp.get_timeframe_end(default_end_date)
 
@@ -271,8 +271,8 @@ def plot(ledgers, metrics, aggregated_data_filename, animated):
         '--ledgers',
         nargs="*",
         type=str.lower,
-        default=default_ledgers,
-        choices=default_ledgers,
+        default=ledgers,
+        choices=ledgers,
         help='The ledgers whose data will be plotted.'
     )
     parser.add_argument(
diff --git a/run.py b/run.py
@@ -1,4 +1,3 @@
-import argparse
 import logging
 from consensus_decentralization.aggregate import aggregate
 from consensus_decentralization.map import apply_mapping
@@ -10,19 +9,19 @@
 logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)
 
 
-def process_data(force_map, project_dir, project, output_dir):
-    mapped_data_file = project_dir / 'mapped_data.json'
+def process_data(force_map, ledger_dir, ledger, output_dir):
+    mapped_data_file = ledger_dir / 'mapped_data.json'
     if force_map or not mapped_data_file.is_file():
-        parsed_data = parse(project=project, input_dir=hlp.RAW_DATA_DIR)
-        apply_mapping(project=project, parsed_data=parsed_data, output_dir=output_dir)
+        parsed_data = parse(ledger, input_dir=hlp.RAW_DATA_DIR)
+        apply_mapping(ledger, parsed_data=parsed_data, output_dir=output_dir)
 
 
-def main(projects, timeframe, aggregate_by, force_map, make_plots, make_animated_plots, output_dir=hlp.OUTPUT_DIR):
+def main(ledgers, timeframe, granularity, output_dir=hlp.OUTPUT_DIR):
     """
     Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
-    :param projects: list of strings that correspond to the ledgers whose data should be analyzed
+    :param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
     :param timeframe: tuple of (start_date, end_date) where each date is a datetime.date object.
-    :param aggregate_by: string that corresponds to the granularity that will be used for the analysis. It can be one
+    :param granularity: string that corresponds to the granularity that will be used for the analysis. It can be one
         of: day, week, month, year, all.
     :param force_map: bool. If True, then the parsing and mapping will be performed, regardless of whether
         mapped data for some or all of the projects already exist
@@ -31,104 +30,52 @@ def main(projects, timeframe, aggregate_by, force_map, make_plots, make_animated
         Warning: generating animated plots might take a long time
     :param output_dir: pathlib.PosixPath object of the directory where the output data will be saved
     """
-    logging.info(f"The ledgers that will be analyzed are: {','.join(projects)}")
-    for project in projects:
-        project_dir = output_dir / project
-        project_dir.mkdir(parents=True, exist_ok=True)  # create project output directory if it doesn't already exist
+    logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
 
-        process_data(force_map, project_dir, project, output_dir)
+    force_map = hlp.get_force_map_flag()
+
+    for ledger in ledgers:
+        ledger_dir = output_dir / ledger
+        ledger_dir.mkdir(parents=True, exist_ok=True)  # create ledger output directory if it doesn't already exist
+
+        process_data(force_map, ledger_dir, ledger, output_dir)
 
         aggregate(
-            project=project,
-            output_dir=output_dir,
-            timeframe=timeframe,
-            aggregate_by=aggregate_by,
-            force_aggregate=force_map
+            ledger,
+            output_dir,
+            timeframe,
+            granularity,
+            force_map
         )
 
     used_metrics = analyze(
-        projects=projects,
-        aggregated_data_filename=hlp.get_blocks_per_entity_filename(aggregate_by=aggregate_by, timeframe=timeframe),
+        ledgers,
+        aggregated_data_filename=hlp.get_blocks_per_entity_filename(granularity, timeframe),
         output_dir=output_dir
     )
 
-    if make_plots:
+    if hlp.get_plot_flag():
         plot(
-            ledgers=projects,
+            ledgers,
             metrics=used_metrics,
-            aggregated_data_filename=hlp.get_blocks_per_entity_filename(aggregate_by=aggregate_by, timeframe=timeframe),
-            animated=make_animated_plots
+            aggregated_data_filename=hlp.get_blocks_per_entity_filename(granularity, timeframe),
+            animated=hlp.get_plot_config_data()['animated']
         )
 
 
 if __name__ == '__main__':
-    default_ledgers = hlp.get_default_ledgers()
-    start_date, end_date = hlp.get_default_start_end_dates()
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--ledgers',
-        nargs="*",
-        type=str.lower,
-        default=default_ledgers,
-        choices=default_ledgers,
-        help='The ledgers that will be analyzed.'
-    )
-    parser.add_argument(
-        '--timeframe',
-        nargs="*",
-        type=hlp.valid_date,
-        default=[start_date, end_date],
-        help='The timeframe that will be analyzed. You can provide two values to mark the beginning and end of the '
-             'time frame or a single value that encapsulates both.'
-    )
-    parser.add_argument(
-        '--aggregate-by',
-        nargs="?",
-        type=str.lower,
-        default='month',
-        choices=['day', 'week', 'month', 'year', 'all'],
-        help='The granularity that will be used for the analysis. It can be one of: "day", "week", "month", "year", '
-             '"all" and by default it is month. Note that in the case of weekly aggregation, we consider a week to '
-             'be 7 consecutive days, starting from the first day of the time period under consideration (so not '
-             'necessarily Monday to Sunday). If "all" is chosen then no aggregation will be performed, meaning that '
-             'the given timeframe will be treated as one unit of time in our analysis.'
-    )
-    parser.add_argument(
-        '--force-map',
-        action='store_true',
-        help='Flag to specify whether to map the parsed data, regardless if the mapped data files exist.'
-    )
-    parser.add_argument(
-        '--plot',
-        action='store_true',
-        help='Flag to specify whether to produce and save plots of the results.'
-    )
-    parser.add_argument(
-        '--animated',
-        action='store_true',
-        help='Flag to specify whether to also generate animated plots.'
-    )
-    args = parser.parse_args()
-
-    aggregate_by = args.aggregate_by
-    timeframe = args.timeframe
-    if len(timeframe) > 2:
-        parser.error('Too many values given for --timeframe argument. Please provide one date to get a snapshot or '
-                     'two dates to get a time series.')
-    timeframe_start = hlp.get_timeframe_beginning(timeframe[0])
-    timeframe_end = hlp.get_timeframe_end(timeframe[-1])
+    ledgers = hlp.get_ledgers()
+
+    granularity = hlp.get_granularity()
+
+    start_date, end_date = hlp.get_start_end_dates()
+    timeframe_start = hlp.get_timeframe_beginning(start_date)
+    timeframe_end = hlp.get_timeframe_end(end_date)
     if timeframe_end < timeframe_start:
-        parser.error('Invalid --timeframe values. Please note that if providing a second date, it must occur after '
-                     'the first date.')
-
-    main(
-        projects=args.ledgers,
-        timeframe=(timeframe_start, timeframe_end),
-        aggregate_by=aggregate_by,
-        force_map=args.force_map,
-        make_plots=args.plot,
-        make_animated_plots=args.animated
-    )
+        raise ValueError('Invalid --timeframe values. Please note that if providing a second date, it must occur after '
+                         'the first date.')
+    timeframe = (timeframe_start, timeframe_end)
+
+    main(ledgers, timeframe, granularity)
 
     logging.info('Done. Please check the output directory for results.')
diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py
diff --git a/tests/test_helper.py b/tests/test_helper.py