
Commit 200af0d

Add configurable population window

Parent: 420b537

File tree: 5 files changed (+63, -29 lines)

- config.yaml
- consensus_decentralization/analyze.py
- consensus_decentralization/helper.py
- run.py
- tests/test_helper.py

config.yaml (7 additions, 13 deletions)

```diff
@@ -38,8 +38,8 @@ analyze_flags:
 # The timeframe for which an analysis should be performed.
 # Each date is a string of the form YYYY-MM-DD.
 timeframe:
-  start_date: 2011-01-01
-  end_date: 2023-12-31
+  start_date: 2018-01-01
+  end_date: 2025-03-01
 
 # The number of days to use for the estimation window, i.e.how many days of blocks to use for each data point.
 # If left empty, then the entire time frame will be used (only valid when combined with empty frequency).
@@ -48,19 +48,13 @@ estimation_window: 30
 # How frequently to sample the data, in days
 # If left empty, then only one data point will be analyzed (snapshot instead of longitudinal analysis), but this is
 # only valid when combined with an empty estimation_window.
-frequency: 30 # todo maybe add hadrcoded values for day, week, month, year (in the code that parses this) + for the estimation window
-
-
-input_directories: # Paths to directories that contain raw input data
-  - ./input
-
-# Paths to directories of snapshot db files; either absolute or relative from run.py.
-# The first path will be used to write newly created dbs and the output of runs
-output_directories:
-  - ./output
+frequency: 30
 
+# A number that specifies how many windows to look back and forward when deciding whether an entity is active on a
+# given time period, or 'all' to count all entities that have produced blocks in the entire observation period.
+population_windows: 1
 
 # Plot flags
 plot_parameters:
   plot: false
-  animated: true
+  animated: false
```
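For intuition: with estimation_window: 30, frequency: 30 and population_windows: 1, each data point covers 30 days of blocks, and an entity with zero blocks in a given window still counts towards that window's population if it produced at least one block in the adjacent window on either side. A toy sketch of that calendar arithmetic, assuming back-to-back windows (frequency equal to estimation_window); none of these variable names come from the repository:

```python
from datetime import date, timedelta

start = date(2018, 1, 1)   # start_date from config.yaml
estimation_window = 30     # days of blocks per data point
population_windows = 1     # windows to look back and forward

k = 5  # an arbitrary data point index
window_start = start + timedelta(days=k * estimation_window)
# Earliest and latest dates on which a block keeps an entity in window k's population
earliest = window_start - timedelta(days=population_windows * estimation_window)
latest = window_start + timedelta(days=(population_windows + 1) * estimation_window)
print(earliest, '->', latest)  # 2018-05-01 -> 2018-07-30
```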

consensus_decentralization/analyze.py (6 additions, 4 deletions)

```diff
@@ -11,7 +11,7 @@
 from consensus_decentralization.metrics.total_entities import compute_total_entities  # noqa: F401
 
 
-def analyze(projects, aggregated_data_filename, output_dir):
+def analyze(projects, aggregated_data_filename, input_dir, output_dir, population_windows):
     """
     Calculates all available metrics for the given ledgers and timeframes. Outputs one file for each metric.
     :param projects: list of strings that correspond to the ledgers whose data should be analyzed
@@ -20,6 +20,7 @@ def analyze(projects, aggregated_data_filename, output_dir):
 
     Using multiple projects and timeframes is necessary here to produce collective csv files.
     """
+
     logging.info('Calculating metrics on aggregated data..')
     metrics = hlp.get_metrics_config()
     metric_params = []
@@ -30,6 +31,7 @@ def analyze(projects, aggregated_data_filename, output_dir):
         else:
             metric_params.append((key, key, None))
     metric_names = [name for name, _, _ in metric_params]
+    clustering_flag = hlp.get_clustering_flag()
 
     aggregate_output = {}
 
@@ -42,8 +44,9 @@
     for column_index, project in enumerate(projects):
         logging.info(f'Calculating {project} metrics')
         aggregate_output[project] = {}
-        aggregated_data_dir = output_dir / project / 'blocks_per_entity'
-        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir / aggregated_data_filename)
+        aggregated_data_dir = input_dir / project / hlp.get_aggregated_data_dir_name(clustering_flag)
+        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir /
+                                                                       aggregated_data_filename, population_windows)
         for date in dates:
             aggregate_output[project][date] = {}
 
@@ -80,7 +83,6 @@
         csv_writer = csv.writer(f)
         csv_writer.writerows(csv_contents[metric])
 
-    clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
     aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
     for project, timeframes in aggregate_output.items():
         for date, results in timeframes.items():
```
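Callers must now supply the directory that holds the aggregated data and the population window radius. A minimal sketch of the new call, with hypothetical ledger names, paths and filename (in the real pipeline run.py derives these from the config):

```python
from pathlib import Path

from consensus_decentralization.analyze import analyze

# Hypothetical arguments for illustration only; none of these literal values
# are fixed by the repository.
used_metrics = analyze(
    projects=['bitcoin', 'ethereum'],
    aggregated_data_filename='blocks_per_entity.csv',
    input_dir=Path('interim'),    # directory with per-project aggregated data
    output_dir=Path('results'),   # directory where per-metric csv files are written
    population_windows=1,
)
```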

consensus_decentralization/helper.py (29 additions, 2 deletions)

```diff
@@ -191,11 +191,13 @@ def write_blocks_per_entity_to_file(output_dir, blocks_per_entity, dates, filena
         csv_writer.writerow(entity_row)
 
 
-def get_blocks_per_entity_from_file(filepath):
+def get_blocks_per_entity_from_file(filepath, population_windows):
     """
     Retrieves information about the number of blocks that each entity produced over some timeframe for some project.
     :param filepath: the path to the file with the relevant information. It can be either an absolute or a relative
         path in either a pathlib.PosixPath object or a string.
+    :param population_windows: int representing the number of windows to look back and forward when determining if an
+        entity is active during a certain time frame
     :returns: a tuple of length 2 where the first item is a list of time chunks (strings) and the second item is a
         dictionary with entities (keys) and a list of the number of blocks they produced during each time chunk (values)
     """
@@ -207,7 +209,17 @@ def get_blocks_per_entity_from_file(filepath):
     for row in csv_reader:
         entity = row[0]
         for idx, item in enumerate(row[1:]):
-            if item != '0':
+            if item == '0':
+                if population_windows == 'all':
+                    blocks_per_entity[entity][dates[idx]] = 0
+                else:
+                    # If the entity hasn't produced any blocks in the current time chunk, we only consider it as
+                    # active if it has produced at least one block in population_windows time chunks before or after
+                    # (otherwise it's not considered part of the population for this time frame)
+                    for i in range(max(0, idx - population_windows), min(len(row) - 1, idx + population_windows + 1)):
+                        if row[i + 1] != '0':
+                            blocks_per_entity[entity][dates[idx]] = 0
+            else:
                 blocks_per_entity[entity][dates[idx]] = int(item)
     return dates, blocks_per_entity
 
@@ -373,6 +385,21 @@ def get_estimation_window_and_frequency():
         raise ValueError('"estimation_window" or "frequency" missing from config file')
 
 
+def get_population_windows():
+    """
+    Retrieves the number of windows to be used for estimating the population of block producers
+    :returns: int representing the number of windows to look back and forward when determining if an entity is active
+        during a certain time frame
+    :raises ValueError: if the population_windows field is missing from the config file
+    """
+    try:
+        config = get_config_data()
+        population_windows = config['population_windows']
+        return population_windows
+    except KeyError:
+        raise ValueError('"population_windows" missing from config file')
+
+
 def get_plot_flag():
     """
     Gets the flag that determines whether generate plots for the output
```
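To make the new population rule concrete, here is a self-contained sketch of the same look-back/look-forward check on an in-memory list of counts. is_in_population is a hypothetical helper, not part of the repository, and it works on ints rather than the CSV strings the real function reads:

```python
def is_in_population(counts, idx, population_windows):
    """Return True if the entity counts as active at time chunk idx.

    counts: per-chunk block counts for one entity (ints).
    population_windows: look-back/look-forward radius in chunks, or 'all'.
    """
    if counts[idx] > 0:
        return True  # produced blocks in this chunk, trivially active
    if population_windows == 'all':
        # Every entity that produced blocks anywhere in the period counts everywhere
        return True
    # Otherwise, active only if it produced at least one block within
    # population_windows chunks before or after the current one
    lo = max(0, idx - population_windows)
    hi = min(len(counts), idx + population_windows + 1)
    return any(counts[i] > 0 for i in range(lo, hi))


counts = [2, 0, 0, 0, 1]  # one entity's blocks across five consecutive chunks
print([is_in_population(counts, i, population_windows=1) for i in range(len(counts))])
# [True, True, False, True, True]: the middle chunk has no non-zero neighbour
# within one window, so the entity drops out of the population there
```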

run.py (5 additions, 3 deletions)

```diff
@@ -18,7 +18,7 @@ def process_data(force_map, ledger_dir, ledger, output_dir):
         return None
 
 
-def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTERIM_DIR,
+def main(ledgers, timeframe, estimation_window, frequency, population_windows, interim_dir=hlp.INTERIM_DIR,
          results_dir=hlp.RESULTS_DIR):
     """
     Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
@@ -59,6 +59,7 @@ def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTER
     used_metrics = analyze(
         projects=ledgers,
         aggregated_data_filename=aggregated_data_filename,
+        population_windows=population_windows,
        input_dir=interim_dir,
        output_dir=metrics_dir
     )
@@ -80,8 +81,9 @@ def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTER
     ledgers = hlp.get_ledgers()
 
     estimation_window, frequency = hlp.get_estimation_window_and_frequency()
+    population_windows = hlp.get_population_windows()
 
-    results_dir = hlp.get_results_dir(estimation_window, frequency)
+    results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
     results_dir.mkdir(parents=True, exist_ok=True)
 
     start_date, end_date = hlp.get_start_end_dates()
@@ -92,6 +94,6 @@ def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTER
         'the first date.')
     timeframe = (timeframe_start, timeframe_end)
 
-    main(ledgers, timeframe, estimation_window, frequency, results_dir=results_dir)
+    main(ledgers, timeframe, estimation_window, frequency, population_windows, results_dir=results_dir)
 
     logging.info('Done. Please check the output directory for results.')
```
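Putting it together, the entry point now reads the option once and threads it through the pipeline. A condensed, illustrative sketch of that wiring (timeframe validation omitted; the overwrite-avoidance rationale is an assumption, not stated in the commit):

```python
import consensus_decentralization.helper as hlp
from run import main

ledgers = hlp.get_ledgers()
estimation_window, frequency = hlp.get_estimation_window_and_frequency()
population_windows = hlp.get_population_windows()

# Folding population_windows into the results path presumably keeps runs with
# different settings from overwriting each other's output.
results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
results_dir.mkdir(parents=True, exist_ok=True)

start_date, end_date = hlp.get_start_end_dates()
main(ledgers, (start_date, end_date), estimation_window, frequency,
     population_windows, results_dir=results_dir)
```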

tests/test_helper.py (16 additions, 7 deletions)

```diff
@@ -82,16 +82,25 @@ def test_committed_pool_data():
 def test_write_read_blocks_per_entity(setup_and_cleanup):
     output_dir = setup_and_cleanup
 
-    blocks_per_entity = {'Entity 1': {'2018': 1, '2019': 3}, 'Entity 2': {'2018': 2, '2019': 2}}
+    blocks_per_entity = {
+        'Entity 1': {'2018': 1, '2019': 3, '2020': 2, '2021': 3},
+        'Entity 2': {'2018': 2, '2019': 2, '2021': 1},
+        'Entity 3': {'2018': 2},
+        'Entity 4': {'2021': 1}
+    }
 
-    write_blocks_per_entity_to_file(output_dir=output_dir, blocks_per_entity=blocks_per_entity, dates=['2018', '2019'],
-                                    filename='test.csv')
+    write_blocks_per_entity_to_file(output_dir=output_dir, blocks_per_entity=blocks_per_entity,
+                                    dates=['2018', '2019', '2020', '2021'], filename='test.csv')
 
-    dates, bpe = get_blocks_per_entity_from_file(output_dir / 'test.csv')
+    dates, bpe = get_blocks_per_entity_from_file(output_dir / 'test.csv', population_windows=1)
 
-    assert all(len(nblocks) == len(dates) for nblocks in bpe.values())
-    assert dates == ['2018', '2019']
-    assert all([bpe['Entity 1'] == {'2018': 1, '2019': 3}, bpe['Entity 2'] == {'2018': 2, '2019': 2}])
+    assert dates == ['2018', '2019', '2020', '2021']
+    assert all([
+        bpe['Entity 1'] == {'2018': 1, '2019': 3, '2020': 2, '2021': 3},
+        bpe['Entity 2'] == {'2018': 2, '2019': 2, '2020': 0, '2021': 1},
+        bpe['Entity 3'] == {'2018': 2, '2019': 0},
+        bpe['Entity 4'] == {'2020': 0, '2021': 1}
+    ])
 
 
 def test_valid_date():
```
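With population_windows=1, these expectations follow directly from the new rule. Entity 2 has no blocks in 2020 but produced blocks in 2019 and 2021, both within one window of 2020, so it stays in the population there with an explicit 0. Entity 3 last produced blocks in 2018, so it receives a 0 for 2019 (one window after its last block) and then drops out of the population for 2020 and 2021. Entity 4 first produced blocks in 2021, so it enters the population in 2020 and is absent before that. This is also why the old assertion that every entity has a value for every date was removed: entities may now be missing from the population in some windows.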
