Skip to content

Commit 420b537

Browse files
committed
Make parsing more efficient
1 parent 7c6b3e8 commit 420b537

File tree

5 files changed

+33
-13
lines changed

5 files changed

+33
-13
lines changed

consensus_decentralization/aggregate.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,16 @@ class Aggregator:
1212
blocks they produced
1313
"""
1414

15-
def __init__(self, project, io_dir):
15+
def __init__(self, project, io_dir, mapped_data=None):
1616
"""
1717
:param project: str. Name of the project
1818
:param io_dir: Path. Path to the project's output directory
1919
"""
2020
self.project = project
21-
self.data_to_aggregate = hlp.read_mapped_project_data(io_dir)
21+
self.data_to_aggregate = hlp.read_mapped_project_data(io_dir) if mapped_data is None else mapped_data
2222
self.data_start_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(self.data_to_aggregate[0]))
2323
self.data_end_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(self.data_to_aggregate[-1]))
24-
self.aggregated_data_dir = io_dir / 'blocks_per_entity'
24+
self.aggregated_data_dir = io_dir / hlp.get_aggregated_data_dir_name(hlp.get_clustering_flag())
2525
self.aggregated_data_dir.mkdir(parents=True, exist_ok=True)
2626

2727
self.monthly_data_breaking_points = [(self.data_start_date.strftime('%Y-%m'), 0)]
@@ -89,7 +89,7 @@ def divide_timeframe(timeframe, estimation_window, frequency):
8989
return time_chunks
9090

9191

92-
def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate):
92+
def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate, mapped_data=None):
9393
"""
9494
Aggregates the results of the mapping process for the given project and timeframe. The results are saved in a csv
9595
file in the project's output directory. Note that the output file is created (just with the headers) even if there
@@ -113,7 +113,7 @@ def aggregate(project, output_dir, timeframe, estimation_window, frequency, forc
113113
raise ValueError('The estimation window is too large for the given timeframe')
114114

115115
project_io_dir = output_dir / project
116-
aggregator = Aggregator(project, project_io_dir)
116+
aggregator = Aggregator(project, project_io_dir, mapped_data=mapped_data)
117117

118118
filename = hlp.get_blocks_per_entity_filename(timeframe=timeframe, estimation_window=estimation_window, frequency=frequency)
119119
output_file = aggregator.aggregated_data_dir / filename

consensus_decentralization/mappings/default_mapping.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
23
import consensus_decentralization.helper as hlp
34

45

@@ -9,8 +10,6 @@ class DefaultMapping:
910
1011
:ivar project_name: the name of the project associated with a specific mapping instance
1112
:ivar output_dir: the directory that includes the parsed data related to the project
12-
:ivar mapped_data_dir: the directory to save the mapped data files in
13-
:ivar multi_pool_dir: the directory to save the multi pool data files in
1413
:ivar data_to_map: a list with the parsed data of the project (list of dictionaries with block information)
1514
:ivar special_addresses: a set with the special addresses of the project (addresses that don't count in the
1615
context of our analysis)
@@ -45,7 +44,7 @@ def perform_mapping(self):
4544
project.
4645
:returns: a list of dictionaries (mapped block data)
4746
"""
48-
clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
47+
clustering_flag = hlp.get_clustering_flag()
4948
for block in self.data_to_map:
5049
if not clustering_flag:
5150
entity = self.fallback_mapping(block)
@@ -83,7 +82,7 @@ def perform_mapping(self):
8382
})
8483

8584
if len(self.mapped_data) > 0:
86-
self.write_mapped_data()
85+
self.write_mapped_data(clustering_flag)
8786
self.write_multi_pool_files()
8887

8988
return self.mapped_data
@@ -187,11 +186,12 @@ def write_multi_pool_files(self):
187186
with open(self.output_dir / 'multi_pool_blocks.csv', 'w') as f:
188187
f.write('Block No,Timestamp,Entities\n' + '\n'.join(self.multi_pool_blocks))
189188

190-
def write_mapped_data(self):
189+
def write_mapped_data(self, clustering_flag):
191190
"""
192191
Writes the mapped data into a file in a directory associated with the mapping instance. Specifically,
193192
into a folder named after the project, inside the general output directory
193+
:param clustering_flag: boolean, indicating whether clustering was used in the mapping process
194194
"""
195-
filename = 'mapped_data.json'
195+
filename = hlp.get_mapped_data_filename(clustering_flag)
196196
with open(self.output_dir / filename, 'w') as f:
197197
json.dump(self.mapped_data, f, indent=4)

consensus_decentralization/mappings/dummy_mapping.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from consensus_decentralization.mappings.default_mapping import DefaultMapping
2+
import consensus_decentralization.helper as hlp
23

34

45
class DummyMapping(DefaultMapping):
@@ -28,6 +29,6 @@ def perform_mapping(self):
2829
})
2930

3031
if len(self.mapped_data) > 0:
31-
self.write_mapped_data()
32+
self.write_mapped_data(hlp.get_clustering_flag())
3233

3334
return self.mapped_data

consensus_decentralization/parsers/dummy_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,12 @@ def parse(self):
2424
directory associated with the parser instance (specifically in <general output directory>/<project_name>)
2525
"""
2626
data = self.read_and_sort_data()
27+
2728
for block in data:
2829
if 'identifiers' not in block.keys():
2930
block['identifiers'] = None
3031
else:
3132
block['identifiers'] = self.parse_identifiers(block['identifiers'])
3233
if 'reward_addresses' not in block.keys():
3334
block['reward_addresses'] = None
34-
return data
35+
yield block

consensus_decentralization/parsers/ethereum_parser.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from consensus_decentralization.parsers.dummy_parser import DummyParser
2+
import json
23

34

45
class EthereumParser(DummyParser):
@@ -20,3 +21,20 @@ def parse_identifiers(block_identifiers):
2021
return bytes.fromhex(block_identifiers[2:]).decode('utf-8')
2122
except (UnicodeDecodeError, ValueError):
2223
return block_identifiers
24+
25+
def read_and_sort_data(self):
26+
"""
27+
Reads the "raw" block data associated with the project
28+
:returns: a generator of dictionaries (block data), assumed sorted by timestamp
29+
Note that the current version does not sort the data (because it is too memory-intensive) but assumes that the
30+
data are already sorted (which is generally the case given the suggested queries).
31+
"""
32+
filename = f'{self.project_name}_raw_data.json'
33+
filepath = self.input_dir / filename
34+
35+
def generate_data():
36+
with open(filepath) as f:
37+
for line in f:
38+
yield json.loads(line.strip())
39+
40+
return generate_data()

0 commit comments

Comments
 (0)