Skip to content

Commit f1e36d9

Browse files
Output restructuring (#106)
* Save mapped data in dedicated dir * Make plotting less verbose * Update mapping doc * Stop generating mapped files for timeframes with no data * Update tests + analyze fix * Fix flake8 issues * Add newline
1 parent f890368 commit f1e36d9

File tree

10 files changed

+98
-67
lines changed

10 files changed

+98
-67
lines changed

consensus_decentralization/analyze.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,25 +36,31 @@ def analyze(projects, timeframes, output_dir):
3636
if timeframe not in csv_contents[metric].keys():
3737
csv_contents[metric][timeframe] = timeframe
3838

39-
# Get mapped data for the year that corresponds to the timeframe.
39+
# Get mapped data for the year that corresponds to the timeframe, if such data exists
4040
# This is needed because the Gini coefficient is computed over all entities per each year.
4141
year = timeframe[:4]
4242
yearly_entities = set()
4343
yearly_entity_groups = set()
44-
with open(output_dir / f'{project}/{year}.csv') as f:
45-
for line in f.readlines()[1:]:
46-
entity_group, entity, _ = line.split(',')
47-
yearly_entities.add(entity)
48-
yearly_entity_groups.add(entity_group)
49-
50-
# Get mapped data for the defined timeframe.
51-
with open(output_dir / f'{project}/{timeframe}.csv') as f:
52-
blocks_per_entity = {}
53-
blocks_per_entity_group = defaultdict(int, {'Unknown': 0})
54-
for line in f.readlines()[1:]:
55-
entity_group, entity, resources = line.split(',')
56-
blocks_per_entity[entity] = int(resources)
57-
blocks_per_entity_group[entity_group] += int(resources)
44+
try:
45+
with open(output_dir / f'{project}/mapped_data/{year}.csv') as f:
46+
for line in f.readlines()[1:]:
47+
entity_group, entity, _ = line.split(',')
48+
yearly_entities.add(entity)
49+
yearly_entity_groups.add(entity_group)
50+
except FileNotFoundError:
51+
pass
52+
53+
blocks_per_entity = {}
54+
blocks_per_entity_group = defaultdict(int, {'Unknown': 0})
55+
try:
56+
# Get mapped data for the defined timeframe, if such data exists
57+
with open(output_dir / f'{project}/mapped_data/{timeframe}.csv') as f:
58+
for line in f.readlines()[1:]:
59+
entity_group, entity, resources = line.split(',')
60+
blocks_per_entity[entity] = int(resources)
61+
blocks_per_entity_group[entity_group] += int(resources)
62+
except FileNotFoundError:
63+
pass
5864

5965
results = {}
6066
results_unknowns_grouped = {}

consensus_decentralization/map.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ def apply_mapping(project, timeframes, output_dir, force_map):
3636

3737
computed_yearly_mappings = set() # Keep track of computed yearly mappings to avoid recomputing them in the same run
3838
for timeframe in timeframes:
39-
output_file = project_output_dir / f'{timeframe}.csv'
39+
output_file = mapping.mapped_data_dir / f'{timeframe}.csv'
4040
if not output_file.is_file() or force_map:
4141
mapping.perform_mapping(timeframe)
4242

4343
# Get mapped data for the year that corresponds to the timeframe.
4444
# This is needed because the Gini coefficient is computed over all entities per each year.
4545
year = timeframe[:4]
46-
year_file = project_output_dir / f'{year}.csv'
46+
year_file = mapping.mapped_data_dir / f'{year}.csv'
4747
if not year_file.is_file() or (force_map and year not in computed_yearly_mappings):
4848
mapping.perform_mapping(year)
4949
computed_yearly_mappings.add(year)

consensus_decentralization/mappings/cardano_mapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,6 @@ def process(self, timeframe):
7171
blocks_per_entity[entity.replace(',', '')] += 1
7272

7373
groups = self.map_block_creators_to_groups(blocks_per_entity.keys())
74-
hlp.write_blocks_per_entity_to_file(self.io_dir, blocks_per_entity, groups, timeframe)
74+
hlp.write_blocks_per_entity_to_file(self.mapped_data_dir, blocks_per_entity, groups, timeframe)
7575

7676
return blocks_per_entity

consensus_decentralization/mappings/default_mapping.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ class DefaultMapping:
99
methods must use a mapping class that inherits from this one.
1010
1111
:ivar project_name: the name of the project associated with a specific mapping instance
12-
:ivar io_dir: the directory that includes the parsed data related to the project
12+
:ivar parsed_data_dir: the directory that includes the parsed data related to the project
13+
:ivar mapped_data_dir: the directory to save the mapped data files in
14+
:ivar multi_pool_dir: the directory to save the multi pool data files in
1315
:ivar dataset: a dictionary with the parsed data of the project
1416
:ivar special_addresses: a set with the special addresses of the project (addresses that don't count in the
1517
context of our analysis)
@@ -23,7 +25,11 @@ class DefaultMapping:
2325

2426
def __init__(self, project_name, io_dir):
2527
self.project_name = project_name
26-
self.io_dir = io_dir
28+
self.parsed_data_dir = io_dir
29+
self.mapped_data_dir = io_dir / 'mapped_data'
30+
self.mapped_data_dir.mkdir(parents=True, exist_ok=True)
31+
self.multi_pool_dir = io_dir / 'multi_pool_data'
32+
self.multi_pool_dir.mkdir(parents=True, exist_ok=True)
2733
self.dataset = None
2834
self.special_addresses = hlp.get_special_addresses(project_name)
2935
self.known_addresses = hlp.get_known_addresses(project_name)
@@ -47,7 +53,7 @@ def read_project_data(self):
4753
Reads the parsed data from the directory specified by the instance
4854
:returns: a dictionary with the parsed data
4955
"""
50-
with open(self.io_dir / 'parsed_data.json') as f:
56+
with open(self.parsed_data_dir / 'parsed_data.json') as f:
5157
data = json.load(f)
5258
return data
5359

@@ -137,11 +143,11 @@ def write_multi_pool_files(self, timeframe):
137143
with multiple pools, if any such blocks/addresses were found for the project
138144
"""
139145
if self.multi_pool_addresses:
140-
with open(self.io_dir / f'multi_pool_addresses_{timeframe}.csv', 'w') as f:
146+
with open(self.multi_pool_dir / f'multi_pool_addresses_{timeframe}.csv', 'w') as f:
141147
f.write('Block No,Timestamp,Address,Entity\n' + '\n'.join(self.multi_pool_addresses))
142148

143149
if self.multi_pool_blocks:
144-
with open(self.io_dir / f'multi_pool_blocks_{timeframe}.csv', 'w') as f:
150+
with open(self.multi_pool_dir / f'multi_pool_blocks_{timeframe}.csv', 'w') as f:
145151
f.write('Block No,Timestamp,Entities\n' + '\n'.join(self.multi_pool_blocks))
146152

147153
def process(self, timeframe):
@@ -168,7 +174,8 @@ def process(self, timeframe):
168174
blocks_per_entity[entity.replace(',', '')] += 1
169175

170176
groups = self.map_block_creators_to_groups(blocks_per_entity.keys())
171-
hlp.write_blocks_per_entity_to_file(self.io_dir, blocks_per_entity, groups, timeframe)
177+
if len(blocks_per_entity) > 0:
178+
hlp.write_blocks_per_entity_to_file(self.mapped_data_dir, blocks_per_entity, groups, timeframe)
172179

173180
if len(timeframe) == 4: # If timeframe is a year, also write multi-pool addresses and blocks to file
174181
self.write_multi_pool_files(timeframe)

consensus_decentralization/mappings/dummy_mapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,6 @@ def process(self, timeframe):
2222

2323
blocks_per_entity[entity] += 1
2424

25-
write_blocks_per_entity_to_file(self.io_dir, blocks_per_entity, blocks_per_entity.keys, timeframe)
25+
write_blocks_per_entity_to_file(self.mapped_data_dir, blocks_per_entity, blocks_per_entity.keys, timeframe)
2626

2727
return blocks_per_entity

consensus_decentralization/plot.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ def plot_animated_stack_area_chart(values, execution_id, path, ylabel, legend_la
131131

132132
def plot_dynamics_per_ledger(ledgers, top_k=-1, animated=False, legend=False):
133133
for ledger in ledgers:
134-
logging.info(f"Plotting {'(animated)' if animated else ''} {ledger} data..")
135134
path = hlp.OUTPUT_DIR / ledger
136135
figures_path = path / 'figures'
137136
if not figures_path.is_dir():
@@ -146,7 +145,10 @@ def plot_dynamics_per_ledger(ledgers, top_k=-1, animated=False, legend=False):
146145
for month in range(1, 13):
147146
timeframe = f'{year}-0{month}' if month < 10 else f'{year}-{month}'
148147
filename = f'{timeframe}.csv'
149-
blocks = hlp.get_blocks_per_entity_from_file(path / filename)
148+
file = path / "mapped_data" / filename
149+
if not file.is_file():
150+
continue # Only plot timeframes for which mapped data exist
151+
blocks = hlp.get_blocks_per_entity_from_file(file)
150152
total_blocks = sum(blocks.values())
151153
if total_blocks == 0:
152154
continue
@@ -207,7 +209,6 @@ def plot_dynamics_per_ledger(ledgers, top_k=-1, animated=False, legend=False):
207209

208210
def plot_comparative_metrics(ledgers, metrics, animated=False):
209211
for metric in metrics:
210-
logging.info(f"Plotting {'(animated)' if animated else ''} {metric}..")
211212
figures_path = hlp.OUTPUT_DIR / 'figures'
212213
if not figures_path.is_dir():
213214
figures_path.mkdir()
@@ -233,7 +234,6 @@ def plot_comparative_metrics(ledgers, metrics, animated=False):
233234

234235
def plot_confidence_intervals(ledgers, metrics):
235236
for metric in metrics:
236-
logging.info(f"Plotting {metric} (with confidence intervals)..")
237237
figures_path = hlp.OUTPUT_DIR / 'figures'
238238
if not figures_path.is_dir():
239239
figures_path.mkdir()
@@ -260,7 +260,8 @@ def plot_confidence_intervals(ledgers, metrics):
260260
plt.savefig(figures_path / filename, bbox_inches='tight')
261261

262262

263-
def plot(ledgers, metrics, animated, show_confidence=True):
263+
def plot(ledgers, metrics, animated, show_confidence=False):
264+
logging.info("Creating plots..")
264265
plot_dynamics_per_ledger(ledgers, animated=False, legend=True)
265266
plot_comparative_metrics(ledgers, metrics, animated=False)
266267
if animated:

docs/mappings.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
# Mappings
22

3-
A mapping obtains the parsed data (from `output/<project_name>/parsed_data.json`) and outputs a `csv` file that maps
4-
blocks to entities, structured as follows:
3+
A mapping obtains the parsed data of a ledger (from `output/<project_name>/parsed_data.json`) and outputs one or more
4+
`csv` files that map blocks to entities, structured as follows:
55

66
```
77
Entity,Resources
88
<name of entity>,<(int) number of blocks>
99
```
1010

11-
The name of the `csv` file is the timeframe, over which the mapping was executed (e.g., `2021-04.csv`). The file is stored in the
12-
project's output directory (`output/<project_name>/`).
11+
Specifically, if the `timeframe` argument is provided during execution, then the mapping outputs a single `csv`
12+
file that corresponds to that timeframe. Otherwise, it outputs a `csv` file for each month contained in the default
13+
time range (as specified in the [config file](https://github.com/Blockchain-Technology-Lab/pooling-analysis/blob/main/config.yaml)).
14+
It also outputs a `csv` file for each year contained in the relevant time frames.
15+
16+
Each `csv` file is named after the timeframe over which the mapping was executed (e.g., `2021-04.csv`) and is
17+
stored in a dedicated folder in the project's output directory (`output/<project_name>/mapped_data`).
1318

1419
The logic of the mapping depends on the type of clustering we want to achieve. So, different mappings will output
1520
different results, even if applied on the same data. An exception to this is the "no-cluster" mapping (DummyMapping

run.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from consensus_decentralization.analyze import analyze
55
from consensus_decentralization.parse import parse
66
from consensus_decentralization.plot import plot
7-
from consensus_decentralization.helper import valid_date, RAW_DATA_DIR, OUTPUT_DIR, get_default_ledgers, get_start_end_years
7+
from consensus_decentralization.helper import valid_date, RAW_DATA_DIR, OUTPUT_DIR, get_default_ledgers, \
8+
get_start_end_years
89

910
logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)
1011

tests/test_4_mappings.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,21 @@ def test_map(setup_and_cleanup):
6767
parse(project, test_raw_data_dir, test_output_dir)
6868
apply_mapping(project, timeframes, test_output_dir, force_map)
6969

70-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
70+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
71+
assert not output_file.is_file() # since there is no data from 2010 in the sample
72+
73+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
74+
assert not yearly_output_file.is_file()
75+
76+
output_file = test_output_dir / project / f'mapped_data/{timeframes[1]}.csv'
7177
assert output_file.is_file()
7278

73-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
79+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[1][:4]}.csv'
7480
assert yearly_output_file.is_file()
7581

82+
output_file = test_output_dir / project / f'mapped_data/{timeframes[2]}.csv'
83+
assert output_file.is_file()
84+
7685
try:
7786
os.remove(str(pool_info_dir / f'clusters/{project}.json')) # Remove temp pool info file
7887
except FileNotFoundError:
@@ -131,7 +140,7 @@ def test_bitcoin_mapping(setup_and_cleanup):
131140
'GBMiners,GBMiners,2'
132141
]
133142

134-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
143+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
135144
with open(output_file) as f:
136145
for idx, line in enumerate(f.readlines()):
137146
assert line == expected_output[idx]
@@ -144,7 +153,7 @@ def test_bitcoin_mapping(setup_and_cleanup):
144153
'Unknown,1AM2fYfpY3ZeMeCKXmN66haoWxvB89pJUx,1'
145154
]
146155

147-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
156+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
148157
with open(yearly_output_file) as f:
149158
for idx, line in enumerate(f.readlines()):
150159
assert expected_output[idx] == line
@@ -161,7 +170,7 @@ def test_bitcoin_mapping(setup_and_cleanup):
161170
'Bitmain,Bitmain,1',
162171
]
163172

164-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
173+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
165174
with open(output_file) as f:
166175
for idx, line in enumerate(f.readlines()):
167176
assert expected_output[idx] == line
@@ -233,12 +242,12 @@ def test_ethereum_mapping(setup_and_cleanup):
233242
'Unknown,0x45133a7e1cc7e18555ae8a4ee632a8a61de90df6,1'
234243
]
235244

236-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
245+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
237246
with open(output_file) as f:
238247
for idx, line in enumerate(f.readlines()):
239248
assert expected_output[idx] == line
240249

241-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
250+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
242251
with open(yearly_output_file) as f:
243252
for idx, line in enumerate(f.readlines()):
244253
assert expected_output[idx] == line
@@ -254,7 +263,7 @@ def test_ethereum_mapping(setup_and_cleanup):
254263
'MEV Builder: 0x3B...436,MEV Builder: 0x3B...436,1'
255264
]
256265

257-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
266+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
258267
with open(output_file) as f:
259268
for idx, line in enumerate(f.readlines()):
260269
assert expected_output[idx] == line
@@ -313,12 +322,12 @@ def test_cardano_mapping(setup_and_cleanup):
313322
'1percentpool,1percentpool,1'
314323
]
315324

316-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
325+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
317326
with open(output_file) as f:
318327
for idx, line in enumerate(f.readlines()):
319328
assert expected_output[idx] == line
320329

321-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
330+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
322331
with open(yearly_output_file) as f:
323332
for idx, line in enumerate(f.readlines()):
324333
assert expected_output[idx] == line
@@ -385,12 +394,12 @@ def test_tezos_mapping(setup_and_cleanup):
385394
'Unknown,----- UNDEFINED MINER -----,1'
386395
]
387396

388-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
397+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
389398
with open(output_file) as f:
390399
for idx, line in enumerate(f.readlines()):
391400
assert expected_output[idx] == line
392401

393-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
402+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
394403
with open(yearly_output_file) as f:
395404
for idx, line in enumerate(f.readlines()):
396405
assert expected_output[idx] == line
@@ -406,7 +415,7 @@ def test_tezos_mapping(setup_and_cleanup):
406415
'Unknown,tz0000000000000000000000000000000000,1'
407416
]
408417

409-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
418+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
410419
with open(output_file) as f:
411420
for idx, line in enumerate(f.readlines()):
412421
assert expected_output[idx] == line
@@ -426,7 +435,8 @@ def test_tezos_mapping(setup_and_cleanup):
426435

427436

428437
def test_get_reward_addresses():
429-
default_mapping = DefaultMapping("sample_bitcoin", None)
438+
some_path = pathlib.Path()
439+
default_mapping = DefaultMapping("sample_bitcoin", some_path)
430440

431441
block = {
432442
"number": 625113,
@@ -465,7 +475,8 @@ def test_get_reward_addresses():
465475
reward_addresses = default_mapping.get_reward_addresses(block)
466476
assert reward_addresses is None
467477

468-
eth_mapping = EthereumMapping("sample_ethereum", None)
478+
some_path = pathlib.Path()
479+
eth_mapping = EthereumMapping("sample_ethereum", some_path)
469480
block = {
470481
"number": 6982695,
471482
"timestamp": "2018-12-31 00:00:12+00:00",
@@ -477,7 +488,8 @@ def test_get_reward_addresses():
477488

478489

479490
def test_from_known_addresses():
480-
cardano_mapping = CardanoMapping("sample_cardano", None)
491+
some_path = pathlib.Path()
492+
cardano_mapping = CardanoMapping("sample_cardano", some_path)
481493

482494
block = {
483495
"number": 92082690,

0 commit comments

Comments
 (0)