1 | 1 | import logging |
2 | 2 | from collections import defaultdict |
3 | | -from dateutil.rrule import rrule, MONTHLY, WEEKLY, YEARLY, DAILY |
4 | 3 | import datetime |
5 | 4 | import consensus_decentralization.helper as hlp |
6 | 5 |
@@ -56,77 +55,84 @@ def aggregate(self, timeframe_start, timeframe_end): |
56 | 55 | return blocks_per_entity |
57 | 56 |
58 | 57 |
59 | | -def divide_timeframe(timeframe, granularity): |
| 58 | +def divide_timeframe(timeframe, estimation_window, frequency): |
60 | 59 | """ |
61 | | - Divides the timeframe into smaller timeframes of the given granularity |
| 60 | + Divides the timeframe into smaller timeframes based on the given estimation_window and frequency. Each
| 61 | + smaller timeframe is estimation_window days long, and the start (or end) date of each smaller timeframe is
| 62 | + frequency days after the start (or end) date of the previous one. The last timeframe does not necessarily
| 63 | + end on the end date of the original timeframe; it may end some days earlier, so that all timeframes
| 64 | + produced have equal length.
| 65 | + If estimation_window is None, the timeframe is not divided and the list contains a single tuple with the
| 66 | + start and end dates of the whole timeframe; in that case frequency must also be None, as described in the
| 67 | + parameter documentation below.
62 | 68 | :param timeframe: a tuple of (start_date, end_date) where each date is a datetime.date object. |
63 | | - :param granularity: the granularity that will be used for the analysis. It can be one of: day, week, month, year, all |
64 | | - :return: a list of tuples of (start_date, end_date) where each date is a datetime.date object and each tuple |
65 | | - corresponds to a timeframe of the given granularity |
66 | | - :raises ValueError: if the timeframe is not valid (i.e. end date preceeds start_date) or if the granularity is not |
67 | | - one of: day, week, month, year |
| 69 | + :param estimation_window: int or None. The number of days to include in each time chunk. If None, the entire |
| 70 | + timeframe will be considered as one chunk. |
| 71 | + :param frequency: int or None. The number of days between the start dates of consecutive samples. If None,
| 72 | + only one sample is considered, spanning the entire timeframe (it must be combined with a None estimation_window).
| 73 | + :returns: a list of tuples of (start_date, end_date) where each date is a datetime.date object. If the estimation |
| 74 | + window is larger than the timeframe, then an empty list is returned. |
| 75 | + :raises ValueError: if the timeframe is not valid (i.e. end date precedes start_date)
68 | 76 | """ |
69 | 77 | timeframe_start, timeframe_end = timeframe |
70 | 78 | if timeframe_end < timeframe_start: |
71 | 79 | raise ValueError(f'Invalid timeframe: {timeframe}') |
72 | | - if granularity == 'day': |
73 | | - start_dates = [dt.date() for dt in rrule(freq=DAILY, dtstart=timeframe_start, until=timeframe_end)] |
74 | | - end_dates = start_dates |
75 | | - elif granularity == 'week': |
76 | | - start_dates = [dt.date() for dt in rrule(freq=WEEKLY, dtstart=timeframe_start, until=timeframe_end)] |
77 | | - end_dates = [dt - datetime.timedelta(days=1) for dt in start_dates[1:]] + [timeframe_end] |
78 | | - elif granularity == 'month': |
79 | | - start_dates = [dt.date() for dt in rrule(freq=MONTHLY, dtstart=timeframe_start.replace(day=1), until=timeframe_end)] |
80 | | - start_dates[0] = timeframe_start |
81 | | - end_dates = [dt - datetime.timedelta(days=1) for dt in start_dates[1:]] + [timeframe_end] |
82 | | - elif granularity == 'year': |
83 | | - start_dates = [dt.date() for dt in rrule(freq=YEARLY, dtstart=timeframe_start.replace(month=1, day=1), until=timeframe_end)] |
84 | | - start_dates[0] = timeframe_start |
85 | | - end_dates = [dt - datetime.timedelta(days=1) for dt in start_dates[1:]] + [timeframe_end] |
86 | | - else: |
87 | | - # no need to divide the timeframe |
88 | | - start_dates = [timeframe_start] |
89 | | - end_dates = [timeframe_end] |
90 | | - return list(zip(start_dates, end_dates)) |
| 80 | + if estimation_window is None: |
| 81 | + return [(timeframe_start, timeframe_end)] |
| 82 | + time_chunks = [] |
| 83 | + first_window_day = timeframe_start |
| 84 | + last_window_day = timeframe_start + datetime.timedelta(days=estimation_window - 1) |
| 85 | + while last_window_day <= timeframe_end: |
| 86 | + time_chunks.append((first_window_day, last_window_day)) |
| 87 | + first_window_day += datetime.timedelta(days=frequency) |
| 88 | + last_window_day += datetime.timedelta(days=frequency) |
| 89 | + return time_chunks |
91 | 90 |
92 | 91 |
93 | | -def aggregate(project, output_dir, timeframe, aggregate_by, force_aggregate): |
| 92 | +def aggregate(project, output_dir, timeframe, estimation_window, frequency, force_aggregate): |
94 | 93 | """ |
95 | 94 | Aggregates the results of the mapping process for the given project and timeframe. The results are saved in a csv |
96 | 95 | file in the project's output directory. Note that the output file is created (just with the headers) even if there |
97 | 96 | is no data to aggregate. |
98 | 97 | :param project: the name of the project |
99 | 98 | :param output_dir: the path to the general output directory |
100 | 99 | :param timeframe: a tuple of (start_date, end_date) where each date is a datetime.date object |
101 | | - :param aggregate_by: the granularity that will be used for the analysis. It can be one of: day, week, month, |
102 | | - year, all |
| 100 | + :param estimation_window: int or None. The number of days to use for aggregating the data (i.e. counting all
| 101 | + the blocks produced by each entity within estimation_window days). If None, the entire timeframe will be
| 102 | + considered as one chunk.
| 103 | + :param frequency: int or None. The number of days between consecutive data points considered in the
| 104 | + analysis. If None, only one data point will be considered, spanning the entire timeframe (it must be
| 105 | + combined with a None estimation_window).
103 | 106 | :param force_aggregate: bool. If True, then the aggregation will be performed, regardless of whether aggregated |
104 | | - data for the project and specified granularity already exist |
| 107 | + data for the project and specified window / frequency already exist |
105 | 108 | :returns: a list of (start_date, end_date) tuples that correspond to the time chunks of the aggregation, or
106 | 109 | None if no aggregation took place (the output file already existed and force_aggregate was set to False)
107 | 110 | """ |
| 111 | + if estimation_window is not None: |
| 112 | + if timeframe[0] + datetime.timedelta(days=estimation_window - 1) > timeframe[1]: |
| 113 | + raise ValueError('The estimation window is too large for the given timeframe') |
| 114 | + |
108 | 115 | project_io_dir = output_dir / project |
109 | 116 | aggregator = Aggregator(project, project_io_dir) |
110 | 117 |
111 | | - filename = hlp.get_blocks_per_entity_filename(aggregate_by=aggregate_by, timeframe=timeframe) |
| 118 | + filename = hlp.get_blocks_per_entity_filename(timeframe=timeframe, estimation_window=estimation_window, frequency=frequency) |
112 | 119 | output_file = aggregator.aggregated_data_dir / filename |
113 | 120 |
114 | 121 | if not output_file.is_file() or force_aggregate: |
115 | 122 | logging.info(f'Aggregating {project} data..') |
116 | | - timeframe_chunks = divide_timeframe(timeframe=timeframe, granularity=aggregate_by) |
117 | | - timeframe_chunk_starts = hlp.format_time_chunks(time_chunks=timeframe_chunks, granularity=aggregate_by) |
| 123 | + timeframe_chunks = divide_timeframe(timeframe=timeframe, estimation_window=estimation_window, frequency=frequency) |
| 124 | + representative_dates = hlp.get_representative_dates(time_chunks=timeframe_chunks) |
118 | 125 | blocks_per_entity = defaultdict(dict) |
119 | 126 | for i, chunk in enumerate(timeframe_chunks): |
120 | 127 | chunk_start, chunk_end = chunk |
121 | | - t_chunk = timeframe_chunk_starts[i] |
122 | 128 | chunk_blocks_per_entity = aggregator.aggregate(chunk_start, chunk_end) |
123 | 129 | for entity, blocks in chunk_blocks_per_entity.items(): |
124 | | - blocks_per_entity[entity][t_chunk] = blocks |
| 130 | + blocks_per_entity[entity][representative_dates[i]] = blocks |
125 | 131 |
126 | 132 | hlp.write_blocks_per_entity_to_file( |
127 | 133 | output_dir=aggregator.aggregated_data_dir, |
128 | 134 | blocks_per_entity=blocks_per_entity, |
129 | | - time_chunks=timeframe_chunk_starts, |
| 135 | + dates=representative_dates, |
130 | 136 | filename=filename |
131 | 137 | ) |
132 | 138 | return timeframe_chunks |
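
For readers reviewing the new sliding-window logic, here is a minimal standalone sketch of what divide_timeframe now does. It is reimplemented here for illustration only (the committed version above is the source of truth), with a small worked example of the window-dropping behavior described in the docstring:

    import datetime

    def divide_timeframe(timeframe, estimation_window, frequency):
        # Same sliding-window division as in the diff above.
        timeframe_start, timeframe_end = timeframe
        if timeframe_end < timeframe_start:
            raise ValueError(f'Invalid timeframe: {timeframe}')
        if estimation_window is None:
            return [(timeframe_start, timeframe_end)]
        time_chunks = []
        first_window_day = timeframe_start
        # A window of N days starting on day D ends on day D + N - 1.
        last_window_day = timeframe_start + datetime.timedelta(days=estimation_window - 1)
        while last_window_day <= timeframe_end:
            time_chunks.append((first_window_day, last_window_day))
            # Slide both window edges forward by `frequency` days.
            first_window_day += datetime.timedelta(days=frequency)
            last_window_day += datetime.timedelta(days=frequency)
        return time_chunks

    # A 10-day timeframe with 7-day windows sampled every 2 days yields two
    # chunks; the window starting on 2024-01-05 would end on 2024-01-11,
    # past the timeframe end, so it is dropped and all chunks stay equal length:
    print(divide_timeframe((datetime.date(2024, 1, 1), datetime.date(2024, 1, 10)),
                           estimation_window=7, frequency=2))
    # [(datetime.date(2024, 1, 1), datetime.date(2024, 1, 7)),
    #  (datetime.date(2024, 1, 3), datetime.date(2024, 1, 9))]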
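The aggregation loop in aggregate() now keys each entity's block counts by a representative date per chunk instead of a formatted chunk label. The sketch below shows the resulting data shape; get_representative_dates is a hypothetical stand-in (the real helper is hlp.get_representative_dates and may choose a different date), and chunk_results stands in for the per-chunk output of Aggregator.aggregate():

    import datetime
    from collections import defaultdict

    # Hypothetical stand-in: label each chunk by its start date.
    def get_representative_dates(time_chunks):
        return [start for start, _ in time_chunks]

    time_chunks = [(datetime.date(2024, 1, 1), datetime.date(2024, 1, 7)),
                   (datetime.date(2024, 1, 3), datetime.date(2024, 1, 9))]
    representative_dates = get_representative_dates(time_chunks)

    # Stand-in data for the per-chunk entity -> block-count results.
    chunk_results = [{'pool A': 10, 'pool B': 5}, {'pool A': 8}]

    # entity -> {representative_date: block_count}, mirroring aggregate() above.
    blocks_per_entity = defaultdict(dict)
    for i, chunk_blocks_per_entity in enumerate(chunk_results):
        for entity, blocks in chunk_blocks_per_entity.items():
            blocks_per_entity[entity][representative_dates[i]] = blocks

    print(dict(blocks_per_entity))
    # {'pool A': {datetime.date(2024, 1, 1): 10, datetime.date(2024, 1, 3): 8},
    #  'pool B': {datetime.date(2024, 1, 1): 5}}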