
Commit 414b057

Granularity split (#162)
* Split granularity into estimation_window and frequency
* Remove mentions of granularity
* flake8
1 parent 0ddcf6e commit 414b057


11 files changed, +310 -235 lines changed


config.yaml

Lines changed: 10 additions & 5 deletions
@@ -35,14 +35,19 @@ analyze_flags:
 
 # The timeframe for which an analysis should be performed.
 # Each date is a string of the form YYYY-MM-DD.
-# If granularity is also set, then the analysis will run on the timeframe of the two farthest snapshots.
 timeframe:
-  start_date: 2010-01-01
+  start_date: 2011-01-01
   end_date: 2023-12-31
 
-# The granularity for the analysis when two dates are provided in the --snapshot_dates argument (which are then interpreted as start and end dates).
-# It can be one of: "day", "week", "month", "year", or empty. If empty, then only the snapshots for the given dates will be analyzed.
-granularity: "month"
+# The number of days to use for the estimation window, i.e. how many days of blocks to use for each data point.
+# If left empty, then the entire time frame will be used (only valid when combined with empty frequency).
+estimation_window: 30
+
+# How frequently to sample the data, in days
+# If left empty, then only one data point will be analyzed (snapshot instead of longitudinal analysis), but this is
+# only valid when combined with an empty estimation_window.
+frequency: 30  # todo maybe add hardcoded values for day, week, month, year (in the code that parses this) + for the estimation window
+
 
 input_directories: # Paths to directories that contain raw input data
   - ./input
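
For intuition, here is a minimal sketch (not part of the commit) of how the two new settings interact, following the sliding-window rule described in the comments above: each data point covers estimation_window days of blocks, and consecutive windows start frequency days apart. The values below mirror the config defaults; only full windows are kept.

import datetime

# Illustrative sketch only: the sampling windows implied by the config above
# (start_date: 2011-01-01, estimation_window: 30, frequency: 30).
start = datetime.date(2011, 1, 1)
end = datetime.date(2023, 12, 31)
estimation_window, frequency = 30, 30

windows = []
window_start = start
window_end = start + datetime.timedelta(days=estimation_window - 1)
while window_end <= end:  # any final partial window is dropped
    windows.append((window_start, window_end))
    window_start += datetime.timedelta(days=frequency)
    window_end += datetime.timedelta(days=frequency)

print(windows[0])  # (datetime.date(2011, 1, 1), datetime.date(2011, 1, 30))
print(windows[1])  # (datetime.date(2011, 1, 31), datetime.date(2011, 3, 1))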

consensus_decentralization/aggregate.py

Lines changed: 43 additions & 37 deletions
@@ -1,6 +1,5 @@
 import logging
 from collections import defaultdict
-from dateutil.rrule import rrule, MONTHLY, WEEKLY, YEARLY, DAILY
 import datetime
 import consensus_decentralization.helper as hlp
 
@@ -56,77 +55,84 @@ def aggregate(self, timeframe_start, timeframe_end):
         return blocks_per_entity
 
 
-def divide_timeframe(timeframe, granularity):
+def divide_timeframe(timeframe, estimation_window, frequency):
     """
-    Divides the timeframe into smaller timeframes of the given granularity
+    Divides the timeframe into smaller timeframes based on the given estimation_window and frequency. Each smaller
+    timeframe will be estimation_window days long and the start (or end) date of each smaller timeframe will be
+    frequency days apart from the start (or end) date of the previous timeframe. The last timeframe will not
+    necessarily have the end date of the original timeframe, it might be some days before that, so that all time
+    frames produced have equal length.
+    If the estimation_window is None, then the timeframe is not divided and the list will contain only one
+    tuple with the start and end dates of the timeframe. If the frequency is None, then the list will contain only one
+    tuple with the start and end dates of the timeframe.
     :param timeframe: a tuple of (start_date, end_date) where each date is a datetime.date object.
-    :param granularity: the granularity that will be used for the analysis. It can be one of: day, week, month, year, all
-    :return: a list of tuples of (start_date, end_date) where each date is a datetime.date object and each tuple
-    corresponds to a timeframe of the given granularity
-    :raises ValueError: if the timeframe is not valid (i.e. end date preceeds start_date) or if the granularity is not
-    one of: day, week, month, year
+    :param estimation_window: int or None. The number of days to include in each time chunk. If None, the entire
+    timeframe will be considered as one chunk.
+    :param frequency: int or None. The number of days between each sample start date. If None, only one sample will be
+    considered, spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
+    :returns: a list of tuples of (start_date, end_date) where each date is a datetime.date object. If the estimation
+    window is larger than the timeframe, then an empty list is returned.
+    :raises ValueError: if the timeframe is not valid (i.e. end date precedes start_date)
     """
     timeframe_start, timeframe_end = timeframe
     if timeframe_end < timeframe_start:
         raise ValueError(f'Invalid timeframe: {timeframe}')
-    if granularity == 'day':
-        start_dates = [dt.date() for dt in rrule(freq=DAILY, dtstart=timeframe_start, until=timeframe_end)]
-        end_dates = start_dates
-    elif granularity == 'week':
-        start_dates = [dt.date() for dt in rrule(freq=WEEKLY, dtstart=timeframe_start, until=timeframe_end)]
-        end_dates = [dt - datetime.timedelta(days=1) for dt in start_dates[1:]] + [timeframe_end]
-    elif granularity == 'month':
-        start_dates = [dt.date() for dt in rrule(freq=MONTHLY, dtstart=timeframe_start.replace(day=1), until=timeframe_end)]
-        start_dates[0] = timeframe_start
-        end_dates = [dt - datetime.timedelta(days=1) for dt in start_dates[1:]] + [timeframe_end]
-    elif granularity == 'year':
-        start_dates = [dt.date() for dt in rrule(freq=YEARLY, dtstart=timeframe_start.replace(month=1, day=1), until=timeframe_end)]
-        start_dates[0] = timeframe_start
-        end_dates = [dt - datetime.timedelta(days=1) for dt in start_dates[1:]] + [timeframe_end]
-    else:
-        # no need to divide the timeframe
-        start_dates = [timeframe_start]
-        end_dates = [timeframe_end]
-    return list(zip(start_dates, end_dates))
+    if estimation_window is None:
+        return [(timeframe_start, timeframe_end)]
+    time_chunks = []
+    first_window_day = timeframe_start
+    last_window_day = timeframe_start + datetime.timedelta(days=estimation_window - 1)
+    while last_window_day <= timeframe_end:
+        time_chunks.append((first_window_day, last_window_day))
+        first_window_day += datetime.timedelta(days=frequency)
+        last_window_day += datetime.timedelta(days=frequency)
+    return time_chunks
 
 
-def aggregate(project, output_dir, timeframe, aggregate_by, force_aggregate):
+def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate):
     """
     Aggregates the results of the mapping process for the given project and timeframe. The results are saved in a csv
     file in the project's output directory. Note that the output file is created (just with the headers) even if there
     is no data to aggregate.
     :param project: the name of the project
     :param output_dir: the path to the general output directory
     :param timeframe: a tuple of (start_date, end_date) where each date is a datetime.date object
-    :param aggregate_by: the granularity that will be used for the analysis. It can be one of: day, week, month,
-    year, all
+    :param estimation_window: int or None. The number of days to use for aggregating the data (i.e. counting all the
+    blocks produced by the entity within estimation_window days). If None, the entire timeframe will be considered
+    as one chunk.
+    :param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
+    of days between each data point considered in the analysis). If None, only one data point will be considered,
+    spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
     :param force_aggregate: bool. If True, then the aggregation will be performed, regardless of whether aggregated
-    data for the project and specified granularity already exist
+    data for the project and specified window / frequency already exist
    :returns: a list of strings that correspond to the time chunks of the aggregation or None if no aggregation took
    place (the corresponding output file already existed and force_aggregate was set to False)
     """
+    if estimation_window is not None:
+        if timeframe[0] + datetime.timedelta(days=estimation_window - 1) > timeframe[1]:
+            raise ValueError('The estimation window is too large for the given timeframe')
+
     project_io_dir = output_dir / project
     aggregator = Aggregator(project, project_io_dir)
 
-    filename = hlp.get_blocks_per_entity_filename(aggregate_by=aggregate_by, timeframe=timeframe)
+    filename = hlp.get_blocks_per_entity_filename(timeframe=timeframe, estimation_window=estimation_window, frequency=frequency)
    output_file = aggregator.aggregated_data_dir / filename
 
     if not output_file.is_file() or force_aggregate:
         logging.info(f'Aggregating {project} data..')
-        timeframe_chunks = divide_timeframe(timeframe=timeframe, granularity=aggregate_by)
-        timeframe_chunk_starts = hlp.format_time_chunks(time_chunks=timeframe_chunks, granularity=aggregate_by)
+        timeframe_chunks = divide_timeframe(timeframe=timeframe, estimation_window=estimation_window, frequency=frequency)
+        representative_dates = hlp.get_representative_dates(time_chunks=timeframe_chunks)
         blocks_per_entity = defaultdict(dict)
         for i, chunk in enumerate(timeframe_chunks):
             chunk_start, chunk_end = chunk
-            t_chunk = timeframe_chunk_starts[i]
             chunk_blocks_per_entity = aggregator.aggregate(chunk_start, chunk_end)
             for entity, blocks in chunk_blocks_per_entity.items():
-                blocks_per_entity[entity][t_chunk] = blocks
+                blocks_per_entity[entity][representative_dates[i]] = blocks
 
         hlp.write_blocks_per_entity_to_file(
             output_dir=aggregator.aggregated_data_dir,
             blocks_per_entity=blocks_per_entity,
-            time_chunks=timeframe_chunk_starts,
+            dates=representative_dates,
             filename=filename
         )
         return timeframe_chunks
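
A usage sketch of the new divide_timeframe (illustrative, not part of the commit), with an estimation window shorter than the sampling frequency to show that the windows need not tile the timeframe; the expected output follows the docstring above.

import datetime
from consensus_decentralization.aggregate import divide_timeframe

chunks = divide_timeframe(
    timeframe=(datetime.date(2023, 1, 1), datetime.date(2023, 3, 31)),
    estimation_window=7,  # each data point covers 7 days of blocks
    frequency=30          # a new data point every 30 days
)
# Expected, per the docstring: equal-length 7-day windows starting 30 days apart,
# with no partial window at the end of the timeframe:
# [(datetime.date(2023, 1, 1), datetime.date(2023, 1, 7)),
#  (datetime.date(2023, 1, 31), datetime.date(2023, 2, 6)),
#  (datetime.date(2023, 3, 2), datetime.date(2023, 3, 8))]
print(chunks)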

consensus_decentralization/analyze.py

Lines changed: 19 additions & 19 deletions
@@ -43,50 +43,50 @@ def analyze(projects, aggregated_data_filename, output_dir):
         logging.info(f'Calculating {project} metrics')
         aggregate_output[project] = {}
         aggregated_data_dir = output_dir / project / 'blocks_per_entity'
-        time_chunks, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir / aggregated_data_filename)
-        for time_chunk in time_chunks:
-            aggregate_output[project][time_chunk] = {}
+        dates, blocks_per_entity = hlp.get_blocks_per_entity_from_file(aggregated_data_dir / aggregated_data_filename)
+        for date in dates:
+            aggregate_output[project][date] = {}
 
-        chunks_with_blocks = set()
+        dates_with_blocks = set()
         for block_values in blocks_per_entity.values():
-            for tchunk, nblocks in block_values.items():
+            for date, nblocks in block_values.items():
                 if nblocks > 0:
-                    chunks_with_blocks.add(tchunk)
+                    dates_with_blocks.add(date)
 
-        for row_index, time_chunk in enumerate(time_chunks):
-            time_chunk_blocks_per_entity = {}
+        for row_index, date in enumerate(dates):
+            date_blocks_per_entity = {}
             if column_index == 0:
                 for metric_name, _, _ in metric_params:
-                    csv_contents[metric_name].append([time_chunk])
-            if time_chunk in chunks_with_blocks:
+                    csv_contents[metric_name].append([date])
+            if date in dates_with_blocks:
                 for entity, block_values in blocks_per_entity.items():
                     try:
-                        time_chunk_blocks_per_entity[entity] = block_values[time_chunk]
+                        date_blocks_per_entity[entity] = block_values[date]
                     except KeyError:
-                        time_chunk_blocks_per_entity[entity] = 0
-                sorted_time_chunk_blocks = sorted(time_chunk_blocks_per_entity.values(), reverse=True)
+                        date_blocks_per_entity[entity] = 0
+                sorted_date_blocks = sorted(date_blocks_per_entity.values(), reverse=True)
 
                 for metric_name, metric, param in metric_params:
                     func = eval(f'compute_{metric}')
                     if param:
-                        result = func(sorted_time_chunk_blocks, param)
+                        result = func(sorted_date_blocks, param)
                     else:
-                        result = func(sorted_time_chunk_blocks)
+                        result = func(sorted_date_blocks)
                     csv_contents[metric_name][row_index + 1].append(result)
-                    aggregate_output[project][time_chunk][metric_name] = result
+                    aggregate_output[project][date][metric_name] = result
 
     for metric in metric_names:
         with open(output_dir / f'{metric}.csv', 'w') as f:
             csv_writer = csv.writer(f)
             csv_writer.writerows(csv_contents[metric])
 
     clustering_flag = hlp.get_config_data()['analyze_flags']['clustering']
-    aggregate_csv_output = [['ledger', 'snapshot_date', 'clustering'] + metric_names]
+    aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
     for project, timeframes in aggregate_output.items():
-        for time_chunk, results in timeframes.items():
+        for date, results in timeframes.items():
             metric_values = [results[metric] for metric in metric_names]
             if any(metric_values):
-                aggregate_csv_output.append([project, time_chunk, clustering_flag] + metric_values)
+                aggregate_csv_output.append([project, date, clustering_flag] + metric_values)
     with open(output_dir / 'output.csv', 'w') as f:
         csv_writer = csv.writer(f)
         csv_writer.writerows(aggregate_csv_output)
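
To make the per-date aggregation concrete, a small sketch (not part of the commit) of how the block counts for one date are assembled before being passed to the compute_<metric> functions, mirroring the loop above; entity names, dates, and counts are made up, and a dict .get stands in for the try/except.

# Illustrative only: blocks_per_entity maps each entity to its block counts per
# representative date, as read by hlp.get_blocks_per_entity_from_file.
blocks_per_entity = {
    'entity_A': {'2023-01-15': 60, '2023-02-14': 40},
    'entity_B': {'2023-01-15': 30},
}

date = '2023-01-15'
# Entities with no blocks on this date default to 0, as in the try/except above.
date_blocks_per_entity = {
    entity: block_values.get(date, 0)
    for entity, block_values in blocks_per_entity.items()
}
sorted_date_blocks = sorted(date_blocks_per_entity.values(), reverse=True)
print(sorted_date_blocks)  # [60, 30] -> input to each compute_<metric> function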
