
Commit e60e87a

Various updates (#164)

* Added a configurable parameter ('population_windows' in config.yaml) for determining the population of active block producers in a given time period
* Made the Ethereum parser more efficient by using generators instead of lists (Ethereum block data had grown too large to handle that way)
* Updated the directory structure of the results and intermediate files
* Generalised the max power ratio metric to concentration ratios
* Updated documentation

1 parent f6dbfbb commit e60e87a
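The Ethereum parser change itself is in one of the changed files not shown below. As a rough sketch of the generator pattern the commit describes (file layout and field names here are assumptions, not the project's actual parser), yielding blocks one at a time keeps memory bounded no matter how large the raw data grows:

    import json

    def parse_raw_blocks(raw_file):
        # Yield one parsed block at a time instead of building a full list,
        # so the entire block dataset never has to be resident in memory.
        with open(raw_file) as f:
            for line in f:  # assumes newline-delimited JSON; purely illustrative
                block = json.loads(line)
                yield {'number': block['number'], 'timestamp': block['timestamp']}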

26 files changed: +374 -199 lines

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -8,3 +8,7 @@ output
 raw_block_data/*_raw_data.json
 .coverage
 site
+.ipynb_checkpoints/
+*.ipynb
+processed_data
+results

README.md

Lines changed: 2 additions & 2 deletions
@@ -58,8 +58,8 @@ analyzed. If only the timeframe is specified, all ledgers will be analyzed for
 the given timeframe. If no arguments are given, all ledgers will be analyzed for
 all months since January 2018.
 
-Three files `nc.csv`, `gini.csv`, `entropy.csv` are also created in the `output` directory, containing the data from the
-last execution of `run.py`.
+Three files `nc.csv`, `gini.csv`, `entropy.csv` are also created in the `results` directory, containing the data from
+the last execution of `run.py`.
 
 ## Contributing
 

config.yaml

Lines changed: 10 additions & 14 deletions
@@ -8,7 +8,9 @@ metrics:
   hhi:
   nakamoto_coefficient:
   theil_index:
-  max_power_ratio:
+  concentration_ratio:
+    - 1
+    - 3
   tau_index:
     - 0.33
     - 0.66
@@ -36,8 +38,8 @@ analyze_flags:
 # The timeframe for which an analysis should be performed.
 # Each date is a string of the form YYYY-MM-DD.
 timeframe:
-  start_date: 2011-01-01
-  end_date: 2023-12-31
+  start_date: 2018-01-01
+  end_date: 2025-03-01
 
 # The number of days to use for the estimation window, i.e. how many days of blocks to use for each data point.
 # If left empty, then the entire time frame will be used (only valid when combined with empty frequency).
@@ -46,19 +48,13 @@ estimation_window: 30
 # How frequently to sample the data, in days
 # If left empty, then only one data point will be analyzed (snapshot instead of longitudinal analysis), but this is
 # only valid when combined with an empty estimation_window.
-frequency: 30 # todo: maybe add hardcoded values for day, week, month, year (in the code that parses this) + for the estimation window
-
-
-input_directories: # Paths to directories that contain raw input data
-  - ./input
-
-# Paths to directories of snapshot db files; either absolute or relative from run.py.
-# The first path will be used to write newly created dbs and the output of runs
-output_directories:
-  - ./output
+frequency: 30
 
+# A number that specifies how many windows to look back and forward when deciding whether an entity is active in a
+# given time period, or 'all' to count all entities that have produced blocks in the entire observation period.
+population_windows: 1
 
 # Plot flags
 plot_parameters:
   plot: false
-  animated: true
+  animated: false
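As the new comment notes, population_windows also accepts the string 'all'; a minimal alternative configuration (illustrative) would be:

    # count every entity that ever produced a block as part of the population in every window
    population_windows: all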

consensus_decentralization/aggregate.py

Lines changed: 5 additions & 5 deletions
@@ -12,16 +12,16 @@ class Aggregator:
     blocks they produced
     """
 
-    def __init__(self, project, io_dir):
+    def __init__(self, project, io_dir, mapped_data=None):
         """
         :param project: str. Name of the project
         :param io_dir: Path. Path to the project's output directory
         """
         self.project = project
-        self.data_to_aggregate = hlp.read_mapped_project_data(io_dir)
+        self.data_to_aggregate = hlp.read_mapped_project_data(io_dir) if mapped_data is None else mapped_data
         self.data_start_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(self.data_to_aggregate[0]))
         self.data_end_date = hlp.get_timeframe_beginning(hlp.get_date_from_block(self.data_to_aggregate[-1]))
-        self.aggregated_data_dir = io_dir / 'blocks_per_entity'
+        self.aggregated_data_dir = io_dir / hlp.get_aggregated_data_dir_name(hlp.get_clustering_flag())
         self.aggregated_data_dir.mkdir(parents=True, exist_ok=True)
 
         self.monthly_data_breaking_points = [(self.data_start_date.strftime('%Y-%m'), 0)]
@@ -89,7 +89,7 @@ def divide_timeframe(timeframe, estimation_window, frequency):
     return time_chunks
 
 
-def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate):
+def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate, mapped_data=None):
     """
     Aggregates the results of the mapping process for the given project and timeframe. The results are saved in a csv
     file in the project's output directory. Note that the output file is created (just with the headers) even if there
@@ -113,7 +113,7 @@ def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate):
         raise ValueError('The estimation window is too large for the given timeframe')
 
     project_io_dir = output_dir / project
-    aggregator = Aggregator(project, project_io_dir)
+    aggregator = Aggregator(project, project_io_dir, mapped_data=mapped_data)
 
     filename = hlp.get_blocks_per_entity_filename(timeframe=timeframe, estimation_window=estimation_window, frequency=frequency)
     output_file = aggregator.aggregated_data_dir / filename
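The new optional mapped_data argument lets a caller that already holds the mapped blocks in memory (e.g. fresh from the mapping step) skip the disk read inside Aggregator; passing nothing preserves the old behaviour. A hedged usage sketch, with the surrounding variables (timeframe, mapped_blocks) assumed rather than taken from the diff:

    import consensus_decentralization.helper as hlp
    from consensus_decentralization.aggregate import aggregate

    # mapped_blocks: the list of mapped block dicts returned by the mapping step (assumed here)
    aggregate('ethereum', hlp.INTERIM_DIR, timeframe, estimation_window=30,
              frequency=30, force_aggregate=True, mapped_data=mapped_blocks)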

consensus_decentralization/analyze.py

Lines changed: 11 additions & 5 deletions
@@ -6,20 +6,25 @@
 from consensus_decentralization.metrics.entropy import compute_entropy, compute_entropy_percentage  # noqa: F401
 from consensus_decentralization.metrics.herfindahl_hirschman_index import compute_hhi  # noqa: F401
 from consensus_decentralization.metrics.theil_index import compute_theil_index  # noqa: F401
-from consensus_decentralization.metrics.max_power_ratio import compute_max_power_ratio  # noqa: F401
+from consensus_decentralization.metrics.concentration_ratio import compute_concentration_ratio  # noqa: F401
 from consensus_decentralization.metrics.tau_index import compute_tau_index  # noqa: F401
 from consensus_decentralization.metrics.total_entities import compute_total_entities  # noqa: F401
 
 
-def analyze(projects, aggregated_data_filename, output_dir):
+def analyze(projects, aggregated_data_filename, input_dir, output_dir, population_windows):
     """
     Calculates all available metrics for the given ledgers and timeframes. Outputs one file for each metric.
     :param projects: list of strings that correspond to the ledgers whose data should be analyzed
     :param aggregated_data_filename: string that corresponds to the name of the file that contains the aggregated data
+    :param input_dir: the directory where the aggregated data is located
+    :param output_dir: the directory to save the results in
+    :param population_windows: the number of windows to look backwards and forwards to determine the population of
+        active block producers for a given time period
    :returns: a list with the names of all the metrics that were used
 
    Using multiple projects and timeframes is necessary here to produce collective csv files.
    """
+
     logging.info('Calculating metrics on aggregated data..')
     metrics = hlp.get_metrics_config()
     metric_params = []
@@ -30,6 +35,7 @@ def analyze(projects, aggregated_data_filename, output_dir):
         else:
             metric_params.append((key, key, None))
     metric_names = [name for name, _, _ in metric_params]
+    clustering_flag = hlp.get_clustering_flag()
 
     aggregate_output = {}
 
@@ -42,8 +48,9 @@ def analyze(projects, aggregated_data_filename, output_dir):
     for column_index, project in enumerate(projects):
         logging.info(f'Calculating {project} metrics')
         aggregate_output[project] = {}
-        aggregated_data_dir = output_dir / project / 'blocks_per_entity'
-        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir / aggregated_data_filename)
+        aggregated_data_dir = input_dir / project / hlp.get_aggregated_data_dir_name(clustering_flag)
+        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir /
+                                                                       aggregated_data_filename, population_windows)
         for date in dates:
             aggregate_output[project][date] = {}
 
@@ -80,7 +87,6 @@ def analyze(projects, aggregated_data_filename, output_dir):
         csv_writer = csv.writer(f)
         csv_writer.writerows(csv_contents[metric])
 
-    clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
    aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
    for project, timeframes in aggregate_output.items():
        for date, results in timeframes.items():
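With the reworked signature, callers now pass separate input and output directories plus the population window. A sketch under assumed values (the filename would come from hlp.get_blocks_per_entity_filename, and the literal arguments are illustrative):

    import consensus_decentralization.helper as hlp
    from consensus_decentralization.analyze import analyze

    population_windows = hlp.get_population_windows()
    used_metrics = analyze(
        projects=['bitcoin', 'ethereum'],
        aggregated_data_filename=filename,  # assumed to be defined earlier
        input_dir=hlp.INTERIM_DIR,          # processed_data/, where aggregated csvs now live
        output_dir=hlp.get_results_dir(30, 30, population_windows),
        population_windows=population_windows,
    )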

consensus_decentralization/helper.py

Lines changed: 73 additions & 4 deletions
@@ -14,8 +14,9 @@
 
 ROOT_DIR = pathlib.Path(__file__).resolve().parent.parent
 RAW_DATA_DIR = ROOT_DIR / 'raw_block_data'
-OUTPUT_DIR = ROOT_DIR / 'output'
+INTERIM_DIR = ROOT_DIR / 'processed_data'
 MAPPING_INFO_DIR = ROOT_DIR / 'mapping_information'
+RESULTS_DIR = ROOT_DIR / 'results'
 
 with open(ROOT_DIR / "config.yaml") as f:
     config = safe_load(f)
@@ -190,11 +191,13 @@ def write_blocks_per_entity_to_file(output_dir, blocks_per_entity, dates, filename):
         csv_writer.writerow(entity_row)
 
 
-def get_blocks_per_entity_from_file(filepath):
+def get_blocks_per_entity_from_file(filepath, population_windows):
     """
     Retrieves information about the number of blocks that each entity produced over some timeframe for some project.
     :param filepath: the path to the file with the relevant information. It can be either an absolute or a relative
         path in either a pathlib.PosixPath object or a string.
+    :param population_windows: int representing the number of windows to look back and forward when determining if an
+        entity is active during a certain time frame
     :returns: a tuple of length 2 where the first item is a list of time chunks (strings) and the second item is a
         dictionary with entities (keys) and a list of the number of blocks they produced during each time chunk (values)
     """
@@ -206,7 +209,17 @@
         for row in csv_reader:
             entity = row[0]
             for idx, item in enumerate(row[1:]):
-                if item != '0':
+                if item == '0':
+                    if population_windows == 'all':
+                        blocks_per_entity[entity][dates[idx]] = 0
+                    else:
+                        # If the entity hasn't produced any blocks in the current time chunk, we only consider it as
+                        # active if it has produced at least one block in population_windows time chunks before or after
+                        # (otherwise it's not considered part of the population for this time frame)
+                        for i in range(max(0, idx - population_windows), min(len(row) - 1, idx + population_windows + 1)):
+                            if row[i + 1] != '0':
+                                blocks_per_entity[entity][dates[idx]] = 0
+                else:
                     blocks_per_entity[entity][dates[idx]] = int(item)
     return dates, blocks_per_entity
 
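A worked example of the new population logic, on a hypothetical csv row: with population_windows=1, a zero-block window still records the entity (with value 0, i.e. active but producing nothing) whenever an adjacent window is non-zero.

    # row as read from the aggregated csv: entity name, then one block count per window
    row = ['pool_a', '5', '0', '0', '3']
    # with population_windows = 1:
    #   idx 0 ('5') -> active, 5 blocks
    #   idx 1 ('0') -> neighbour '5' is non-zero, so recorded as 0 (in the population)
    #   idx 2 ('0') -> neighbour '3' is non-zero, so recorded as 0 (in the population)
    #   idx 3 ('3') -> active, 3 blocks
    # with population_windows = 0 the lookaround collapses to the current window only,
    # so the two middle windows are skipped and pool_a drops out of the population there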
@@ -294,7 +307,7 @@ def read_mapped_project_data(project_dir):
     :param project_dir: pathlib.PosixPath object of the output directory corresponding to the project
     :returns: a dictionary with the mapped data
     """
-    with open(project_dir / 'mapped_data.json') as f:
+    with open(project_dir / get_mapped_data_filename(get_clustering_flag())) as f:
         data = json.load(f)
     return data
 
@@ -309,6 +322,15 @@ def get_representative_dates(time_chunks):
     return [str(chunk[0] + (chunk[1] - chunk[0]) // 2) for chunk in time_chunks]
 
 
+def get_aggregated_data_dir_name(clustering_flag):
+    """
+    Determines the name of the directory that will contain the aggregated data
+    :param clustering_flag: boolean that determines whether the data is clustered or not
+    :returns: str that corresponds to the name of the directory
+    """
+    return 'blocks_per_entity_' + ('clustered' if clustering_flag else 'non_clustered')
+
+
 def get_blocks_per_entity_filename(timeframe, estimation_window, frequency):
     """
     Determines the filename of the csv file that contains the aggregated data
@@ -363,6 +385,21 @@ def get_estimation_window_and_frequency():
         raise ValueError('"estimation_window" or "frequency" missing from config file')
 
 
+def get_population_windows():
+    """
+    Retrieves the number of windows to be used for estimating the population of block producers
+    :returns: int representing the number of windows to look back and forward when determining if an entity is active
+        during a certain time frame
+    :raises ValueError: if the population_windows field is missing from the config file
+    """
+    try:
+        config = get_config_data()
+        population_windows = config['population_windows']
+        return population_windows
+    except KeyError:
+        raise ValueError('"population_windows" missing from config file')
+
+
 def get_plot_flag():
     """
     Gets the flag that determines whether to generate plots for the output
@@ -395,3 +432,35 @@ def get_force_map_flag():
         return config['execution_flags']['force_map']
     except KeyError:
         raise ValueError('Flag "force_map" missing from config file')
+
+
+def get_clustering_flag():
+    """
+    Gets the flag that determines whether to perform clustering
+    :returns: boolean
+    :raises ValueError: if the flag is not set in the config file
+    """
+    config = get_config_data()
+    try:
+        return config['analyze_flags']['clustering']
+    except KeyError:
+        raise ValueError('Flag "clustering" missing from config file')
+
+
+def get_results_dir(estimation_window, frequency, population_windows):
+    """
+    Retrieves the path to the results directory for the specific config parameters
+    :returns: pathlib.PosixPath object
+    """
+    results_dir_name = (f'{estimation_window}_day_window_with_{population_windows}_population_windows_sampled_every'
+                        f'_{frequency}_days')
+    return RESULTS_DIR / results_dir_name
+
+
+def get_mapped_data_filename(clustering_flag):
+    """
+    Retrieves the filename of the mapped data file
+    :param clustering_flag: boolean that determines whether the data is clustered or not
+    :returns: str
+    """
+    return 'mapped_data_' + ('clustered' if clustering_flag else 'non_clustered') + '.json'
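Taken together, the new helpers pin down the reorganised layout; the values below follow directly from the code above:

    import consensus_decentralization.helper as hlp

    hlp.get_results_dir(30, 30, 1)
    # -> <ROOT_DIR>/results/30_day_window_with_1_population_windows_sampled_every_30_days
    hlp.get_aggregated_data_dir_name(True)   # -> 'blocks_per_entity_clustered'
    hlp.get_mapped_data_filename(False)      # -> 'mapped_data_non_clustered.json'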

consensus_decentralization/mappings/default_mapping.py

Lines changed: 6 additions & 6 deletions
@@ -1,4 +1,5 @@
 import json
+
 import consensus_decentralization.helper as hlp
 
 
@@ -9,8 +10,6 @@ class DefaultMapping:
 
     :ivar project_name: the name of the project associated with a specific mapping instance
     :ivar output_dir: the directory that includes the parsed data related to the project
-    :ivar mapped_data_dir: the directory to save the mapped data files in
-    :ivar multi_pool_dir: the directory to save the multi pool data files in
     :ivar data_to_map: a list with the parsed data of the project (list of dictionaries with block information)
     :ivar special_addresses: a set with the special addresses of the project (addresses that don't count in the
         context of our analysis)
@@ -45,7 +44,7 @@ def perform_mapping(self):
         project.
         :returns: a list of dictionaries (mapped block data)
         """
-        clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
+        clustering_flag = hlp.get_clustering_flag()
         for block in self.data_to_map:
             if not clustering_flag:
                 entity = self.fallback_mapping(block)
@@ -83,7 +82,7 @@ def perform_mapping(self):
             })
 
         if len(self.mapped_data) > 0:
-            self.write_mapped_data()
+            self.write_mapped_data(clustering_flag)
             self.write_multi_pool_files()
 
         return self.mapped_data
@@ -187,11 +186,12 @@ def write_multi_pool_files(self):
         with open(self.output_dir / 'multi_pool_blocks.csv', 'w') as f:
             f.write('Block No,Timestamp,Entities\n' + '\n'.join(self.multi_pool_blocks))
 
-    def write_mapped_data(self):
+    def write_mapped_data(self, clustering_flag):
         """
         Writes the mapped data into a file in a directory associated with the mapping instance. Specifically,
         into a folder named after the project, inside the general output directory
+        :param clustering_flag: boolean, indicating whether clustering was used in the mapping process
         """
-        filename = 'mapped_data.json'
+        filename = hlp.get_mapped_data_filename(clustering_flag)
         with open(self.output_dir / filename, 'w') as f:
             json.dump(self.mapped_data, f, indent=4)

consensus_decentralization/mappings/dummy_mapping.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from consensus_decentralization.mappings.default_mapping import DefaultMapping
+import consensus_decentralization.helper as hlp
 
 
 class DummyMapping(DefaultMapping):
@@ -28,6 +29,6 @@ def perform_mapping(self):
             })
 
         if len(self.mapped_data) > 0:
-            self.write_mapped_data()
+            self.write_mapped_data(hlp.get_clustering_flag())
 
         return self.mapped_data
consensus_decentralization/metrics/concentration_ratio.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+def compute_concentration_ratio(block_distribution, topn):
+    """
+    Calculates the n-concentration ratio of a distribution of block production
+    :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order
+    :param topn: the number of top block producers to consider
+    :returns: float that represents the ratio of blocks produced by the top n block producers (0 if there weren't any)
+    """
+    total_blocks = sum(block_distribution)
+    return sum(block_distribution[:topn]) / total_blocks if total_blocks else 0
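A quick sanity check of the new metric; with topn=1 it reproduces the old max power ratio, which is how the generalisation in the commit message plays out:

    from consensus_decentralization.metrics.concentration_ratio import compute_concentration_ratio

    blocks = [60, 25, 10, 5]  # per-entity block counts, sorted in descending order
    assert compute_concentration_ratio(blocks, 1) == 0.6   # the old max power ratio
    assert compute_concentration_ratio(blocks, 3) == 0.95  # share of the top 3 producers
    assert compute_concentration_ratio([], 3) == 0         # empty distribution guard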

consensus_decentralization/metrics/max_power_ratio.py

Lines changed: 0 additions & 8 deletions
This file was deleted.
