Skip to content

Commit 4484086

Browse files
Clustering flag (#151)
* Retrieve config data once in helper file
* Add config param for disabling clustering
* Print clustering flag in aggregate output file
* Remove empty lines from aggregate output
* Remove space from aggregate output csv header
* Handle list of metric params in config
* Use clustering instead of no_clustering config flag
* Rename vars in analyze
1 parent 81501d6 commit 4484086

File tree

6 files changed

+66
-44
lines changed

6 files changed

+66
-44
lines changed

config.yaml

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,19 @@
11
metrics:
22
entropy:
3-
alpha: 1
3+
- 1
44
entropy_percentage:
5-
alpha: 1
5+
- 1
66
gini:
77
hhi:
88
nakamoto_coefficient:
99
theil_index:
1010
max_power_ratio:
1111
tau_index:
12-
threshold: 0.66
12+
- 0.33
13+
- 0.66
14+
15+
analyze_flags:
16+
clustering: true
1317

1418
default_timeframe:
1519
start_date: 2010-01-01

consensus_decentralization/analyze.py

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,14 @@ def analyze(projects, aggregated_data_filename, output_dir):
2121
"""
2222
logging.info('Calculating metrics on aggregated data..')
2323
metrics = hlp.get_metrics_config()
24-
metric_names = list(metrics.keys())
24+
metric_params = []
25+
for key, args in metrics.items():
26+
if args:
27+
for val in args:
28+
metric_params.append((f'{key}={val}', key, val))
29+
else:
30+
metric_params.append((key, key, None))
31+
metric_names = [name for name, _, _ in metric_params]
2532

2633
aggregate_output = {}
2734

@@ -43,34 +50,39 @@ def analyze(projects, aggregated_data_filename, output_dir):
4350
for tchunk, nblocks in block_values.items():
4451
if nblocks > 0:
4552
chunks_with_blocks.add(tchunk)
46-
for metric, args_dict in metrics.items():
53+
for metric_name, metric, param in metric_params:
54+
logging.info(f'Calculating {metric_name}')
55+
4756
for row_index, time_chunk in enumerate(time_chunks):
4857
time_chunk_blocks_per_entity = {}
4958
if column_index == 0:
50-
csv_contents[metric].append([time_chunk])
59+
csv_contents[metric_name].append([time_chunk])
5160
if time_chunk in chunks_with_blocks:
5261
for entity, block_values in blocks_per_entity.items():
5362
try:
5463
time_chunk_blocks_per_entity[entity] = block_values[time_chunk]
5564
except KeyError:
5665
time_chunk_blocks_per_entity[entity] = 0
5766
func = eval(f'compute_{metric}')
58-
result = func(time_chunk_blocks_per_entity, **args_dict) if args_dict else func(
59-
time_chunk_blocks_per_entity)
60-
csv_contents[metric][row_index + 1].append(result)
61-
aggregate_output[project][time_chunk][metric] = result
67+
if param:
68+
result = func(time_chunk_blocks_per_entity, param)
69+
else:
70+
result = func(time_chunk_blocks_per_entity)
71+
csv_contents[metric_name][row_index + 1].append(result)
72+
aggregate_output[project][time_chunk][metric_name] = result
6273

63-
for metric in metrics.keys():
74+
for metric in metric_names:
6475
with open(output_dir / f'{metric}.csv', 'w') as f:
6576
csv_writer = csv.writer(f)
6677
csv_writer.writerows(csv_contents[metric])
6778

68-
aggregate_csv_output = [['ledger', 'snapshot date'] + metric_names]
79+
clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
80+
aggregate_csv_output = [['ledger', 'snapshot_date', 'clustering'] + metric_names]
6981
for project, timeframes in aggregate_output.items():
7082
for time_chunk, results in timeframes.items():
71-
aggregate_csv_output.append([project, time_chunk])
72-
for metric in metric_names:
73-
aggregate_csv_output[-1].append(results[metric])
83+
metric_values = [results[metric] for metric in metric_names]
84+
if any(metric_values):
85+
aggregate_csv_output.append([project, time_chunk, clustering_flag] + metric_values)
7486
with open(output_dir / 'output.csv', 'w') as f:
7587
csv_writer = csv.writer(f)
7688
csv_writer.writerows(aggregate_csv_output)

consensus_decentralization/helper.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,9 @@
1717
OUTPUT_DIR = ROOT_DIR / 'output'
1818
MAPPING_INFO_DIR = ROOT_DIR / 'mapping_information'
1919

20+
with open(ROOT_DIR / "config.yaml") as f:
21+
config = safe_load(f)
22+
2023

2124
def valid_date(date_string):
2225
"""
@@ -231,8 +234,6 @@ def get_config_data():
231234
root directory of the project.
232235
:returns: a dictionary of configuration keys and values
233236
"""
234-
with open(ROOT_DIR / "config.yaml") as f:
235-
config = safe_load(f)
236237
return config
237238

238239

consensus_decentralization/mappings/default_mapping.py

Lines changed: 24 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -45,29 +45,34 @@ def perform_mapping(self):
4545
project.
4646
:returns: a list of dictionaries (mapped block data)
4747
"""
48+
clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
4849
for block in self.data_to_map:
49-
entity = self.map_from_known_identifiers(block)
50-
if entity:
51-
mapping_method = 'known_identifiers'
50+
if not clustering_flag:
51+
entity = self.fallback_mapping(block)
52+
mapping_method = 'fallback_mapping'
5253
else:
53-
entity = self.map_from_known_addresses(block)
54+
entity = self.map_from_known_identifiers(block)
5455
if entity:
55-
mapping_method = 'known_addresses'
56+
mapping_method = 'known_identifiers'
5657
else:
57-
entity = self.fallback_mapping(block)
58-
mapping_method = 'fallback_mapping'
59-
60-
cluster = self.map_from_known_clusters(block)
61-
if cluster:
62-
entity = cluster
63-
mapping_method = 'known_clusters'
64-
65-
# Finally, check legal links to map to the highest-level entity, if relevant
66-
day = hlp.get_date_from_block(block)
67-
legal_links = hlp.get_pool_legal_links(timeframe=day)
68-
if entity in legal_links.keys():
69-
entity = legal_links[entity]
70-
mapping_method = 'known_legal_links'
58+
entity = self.map_from_known_addresses(block)
59+
if entity:
60+
mapping_method = 'known_addresses'
61+
else:
62+
entity = self.fallback_mapping(block)
63+
mapping_method = 'fallback_mapping'
64+
65+
cluster = self.map_from_known_clusters(block)
66+
if cluster:
67+
entity = cluster
68+
mapping_method = 'known_clusters'
69+
70+
# Finally, check legal links to map to the highest-level entity, if relevant
71+
day = hlp.get_date_from_block(block)
72+
legal_links = hlp.get_pool_legal_links(timeframe=day)
73+
if entity in legal_links.keys():
74+
entity = legal_links[entity]
75+
mapping_method = 'known_legal_links'
7176

7277
self.mapped_data.append({
7378
"number": block['number'],

tests/test_analyze.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ def test_analyze(setup_and_cleanup):
5252
output_dir=test_output_dir
5353
)
5454

55-
metrics = ['gini', 'nakamoto_coefficient', 'entropy']
55+
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
5656
for metric in metrics:
5757
output_file = test_output_dir / f'{metric}.csv'
5858
assert output_file.is_file()
@@ -64,7 +64,7 @@ def test_analyze(setup_and_cleanup):
6464
assert lines[1] == '2018,0.25\n'
6565
elif metric == 'nakamoto_coefficient':
6666
assert lines[1] == '2018,2\n'
67-
elif metric == 'entropy':
67+
elif metric == 'entropy=1':
6868
assert lines[1] == '2018,1.836591668108979\n'
6969

7070
analyze(
@@ -73,7 +73,7 @@ def test_analyze(setup_and_cleanup):
7373
output_dir=test_output_dir
7474
)
7575

76-
metrics = ['gini', 'nakamoto_coefficient', 'entropy']
76+
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
7777
for metric in metrics:
7878
output_file = test_output_dir / f'{metric}.csv'
7979
assert output_file.is_file()
@@ -87,7 +87,7 @@ def test_analyze(setup_and_cleanup):
8787
elif metric == 'nakamoto_coefficient':
8888
assert lines[1] == 'Feb-2018,1\n'
8989
assert lines[2] == 'Mar-2018,1\n'
90-
elif metric == 'entropy':
90+
elif metric == 'entropy=1':
9191
assert lines[1] == 'Feb-2018,1.5\n'
9292
assert lines[2] == 'Mar-2018,0.0\n'
9393

@@ -97,7 +97,7 @@ def test_analyze(setup_and_cleanup):
9797
output_dir=test_output_dir
9898
)
9999

100-
metrics = ['gini', 'nakamoto_coefficient', 'entropy']
100+
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
101101
for metric in metrics:
102102
output_file = test_output_dir / f'{metric}.csv'
103103
assert output_file.is_file()

tests/test_end_to_end.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ def test_end_to_end(setup_and_cleanup):
8585
'timeframe,sample_bitcoin,sample_cardano\n',
8686
'2010,,\n'
8787
]
88-
with open(test_output_dir / 'entropy.csv') as f:
88+
with open(test_output_dir / 'entropy=1.csv') as f:
8989
lines = f.readlines()
9090
for idx, line in enumerate(lines):
9191
assert line == expected_entropy[idx]
@@ -123,7 +123,7 @@ def test_end_to_end(setup_and_cleanup):
123123
'Feb-2018,1.5,\n',
124124
'Mar-2018,0.0,\n',
125125
]
126-
with open(test_output_dir / 'entropy.csv') as f:
126+
with open(test_output_dir / 'entropy=1.csv') as f:
127127
lines = f.readlines()
128128
for idx, line in enumerate(lines):
129129
assert line == expected_entropy[idx]
@@ -161,7 +161,7 @@ def test_end_to_end(setup_and_cleanup):
161161
'timeframe,sample_bitcoin,sample_cardano\n',
162162
'Dec-2020,,1.9219280948873623\n'
163163
]
164-
with open(test_output_dir / 'entropy.csv') as f:
164+
with open(test_output_dir / 'entropy=1.csv') as f:
165165
lines = f.readlines()
166166
for idx, line in enumerate(lines):
167167
assert line == expected_entropy[idx]

0 commit comments

Comments (0)