Skip to content

Commit 7c6b3e8

Browse files
committed
Update dir structure of produced files
1 parent 6bbd1fb commit 7c6b3e8

File tree

11 files changed

+120
-55
lines changed

11 files changed

+120
-55
lines changed

consensus_decentralization/helper.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@
1414

1515
ROOT_DIR = pathlib.Path(__file__).resolve().parent.parent
1616
RAW_DATA_DIR = ROOT_DIR / 'raw_block_data'
17-
OUTPUT_DIR = ROOT_DIR / 'output'
17+
INTERIM_DIR = ROOT_DIR / 'processed_data'
1818
MAPPING_INFO_DIR = ROOT_DIR / 'mapping_information'
19+
RESULTS_DIR = ROOT_DIR / 'results'
1920

2021
with open(ROOT_DIR / "config.yaml") as f:
2122
config = safe_load(f)
@@ -294,7 +295,7 @@ def read_mapped_project_data(project_dir):
294295
:param project_dir: pathlib.PosixPath object of the output directory corresponding to the project
295296
:returns: a dictionary with the mapped data
296297
"""
297-
with open(project_dir / 'mapped_data.json') as f:
298+
with open(project_dir / get_mapped_data_filename(get_clustering_flag())) as f:
298299
data = json.load(f)
299300
return data
300301

@@ -309,6 +310,15 @@ def get_representative_dates(time_chunks):
309310
return [str(chunk[0] + (chunk[1] - chunk[0]) // 2) for chunk in time_chunks]
310311

311312

313+
def get_aggregated_data_dir_name(clustering_flag):
    """
    Determines the name of the directory that will contain the aggregated data
    :param clustering_flag: boolean that determines whether the data is clustered or not
    :returns: str that corresponds to the name of the directory
    """
    # Directory name encodes whether clustering was applied, so clustered and
    # non-clustered runs never overwrite each other's output.
    suffix = 'clustered' if clustering_flag else 'non_clustered'
    return f'blocks_per_entity_{suffix}'
320+
321+
312322
def get_blocks_per_entity_filename(timeframe, estimation_window, frequency):
313323
"""
314324
Determines the filename of the csv file that contains the aggregated data
@@ -395,3 +405,35 @@ def get_force_map_flag():
395405
return config['execution_flags']['force_map']
396406
except KeyError:
397407
raise ValueError('Flag "force_map" missing from config file')
408+
409+
410+
def get_clustering_flag():
    """
    Gets the flag that determines whether to perform clustering
    :returns: boolean
    :raises ValueError: if the flag is not set in the config file
    """
    cfg = get_config_data()
    try:
        # A KeyError at either level (missing section or missing key) means
        # the flag is absent from the config file.
        analyze_flags = cfg['analyze_flags']
        return analyze_flags['clustering']
    except KeyError:
        raise ValueError('Flag "clustering" missing from config file')
421+
422+
423+
def get_results_dir(estimation_window, frequency, population_windows):
    """
    Retrieves the path to the results directory for the specific config parameters
    :param estimation_window: int, the length (in days) of each estimation window; used in the directory name
    :param frequency: int, the number of days between sampled data points; used in the directory name
    :param population_windows: int, the number of population windows; used in the directory name
    :returns: pathlib.PosixPath object
    """
    # NOTE(review): a visible call site (run.py __main__) invokes
    # get_results_dir(estimation_window, frequency) with only two arguments,
    # which would raise a TypeError here -- confirm that all callers pass
    # population_windows as well.
    results_dir_name = (f'{estimation_window}_day_window_with_{population_windows}_population_windows_sampled_every'
                        f'_{frequency}_days')
    return RESULTS_DIR / results_dir_name
431+
432+
433+
def get_mapped_data_filename(clustering_flag):
    """
    Retrieves the filename of the mapped data file
    :param clustering_flag: boolean that determines whether the data is clustered or not
    :returns: str
    """
    # Filename encodes the clustering mode so both variants can coexist in the
    # same project directory.
    variant = 'clustered' if clustering_flag else 'non_clustered'
    return f'mapped_data_{variant}.json'

consensus_decentralization/plot.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ def plot_animated_stack_area_chart(values, execution_id, path, ylabel, legend_la
123123
plt.close(fig)
124124

125125

126-
def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, top_k=-1, unit='relative', animated=False, legend=False):
126+
def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, output_dir, top_k=-1, unit='relative', animated=False,
127+
legend=False):
127128
"""
128129
Plots the dynamics of pools for each ledger in terms of produced blocks
129130
:param ledgers: list of strings representing the ledgers whose data will be plotted
@@ -137,13 +138,12 @@ def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, top_k=-1, unit='
137138
:param legend: bool that specifies whether the plots to be generated will include a legend or not
138139
"""
139140
for ledger in ledgers:
140-
ledger_path = hlp.OUTPUT_DIR / ledger
141-
figures_path = ledger_path / 'figures'
142-
if not figures_path.is_dir():
143-
figures_path.mkdir()
141+
ledger_path = hlp.INTERIM_DIR / ledger
142+
figures_path = output_dir / ledger
143+
figures_path.mkdir(parents=True, exist_ok=True)
144144

145145
time_chunks, blocks_per_entity = hlp.get_blocks_per_entity_from_file(
146-
filepath=ledger_path / "blocks_per_entity" / aggregated_data_filename
146+
filepath=ledger_path / hlp.get_aggregated_data_dir_name(hlp.get_clustering_flag()) / aggregated_data_filename
147147
)
148148

149149
total_blocks_per_time_chunk = [0] * len(time_chunks)
@@ -213,13 +213,10 @@ def plot_dynamics_per_ledger(ledgers, aggregated_data_filename, top_k=-1, unit='
213213
)
214214

215215

216-
def plot_comparative_metrics(ledgers, metrics, animated=False):
216+
def plot_comparative_metrics(ledgers, metrics, metrics_dir, output_dir, animated=False):
217217
for metric in metrics:
218-
figures_path = hlp.OUTPUT_DIR / 'figures'
219-
if not figures_path.is_dir():
220-
figures_path.mkdir()
221-
filename = f'{metric}.csv'
222-
metric_df = pd.read_csv(hlp.OUTPUT_DIR / filename)
218+
metric_filepath = metrics_dir / f'{metric}.csv'
219+
metric_df = pd.read_csv(metric_filepath)
223220
# only keep rows that contain at least one (non-nan) value in the columns that correspond to the ledgers
224221
metric_df = metric_df[metric_df.iloc[:, 1:].notna().any(axis=1)]
225222
ledger_columns_to_keep = [col for col in metric_df.columns if col in ledgers]
@@ -233,7 +230,7 @@ def plot_comparative_metrics(ledgers, metrics, animated=False):
233230
x_label='Time',
234231
y_label=metric,
235232
filename=f"{metric}_{'_'.join(ledger_columns_to_keep)}",
236-
path=figures_path,
233+
path=output_dir,
237234
colors=colors
238235
)
239236
else:
@@ -242,24 +239,35 @@ def plot_comparative_metrics(ledgers, metrics, animated=False):
242239
x_label='Time',
243240
y_label=metric,
244241
filename=f"{metric}_{'_'.join(ledger_columns_to_keep)}",
245-
path=figures_path,
242+
path=output_dir,
246243
xtick_labels=metric_df['timeframe'],
247244
colors=colors
248245
)
249246

250247

251-
def plot(ledgers, metrics, aggregated_data_filename, animated):
248+
def plot(ledgers, metrics, aggregated_data_filename, animated, metrics_dir, figures_dir):
252249
logging.info("Creating plots..")
253-
plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename, animated=False, legend=True)
254-
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=False)
250+
#plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename, output_dir=
251+
# figures_dir, animated=False, legend=True)
252+
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=False, metrics_dir=metrics_dir, output_dir=figures_dir)
255253
if animated:
256-
plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename, animated=True)
257-
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=True)
254+
plot_dynamics_per_ledger(ledgers=ledgers, aggregated_data_filename=aggregated_data_filename,
255+
output_dir=figures_dir, animated=True)
256+
plot_comparative_metrics(ledgers=ledgers, metrics=metrics, animated=True, metrics_dir=metrics_dir, output_dir=figures_dir)
258257

259258

260259
if __name__ == '__main__':
261260
ledgers = hlp.get_ledgers()
262-
default_metrics = hlp.get_metrics_config().keys()
261+
262+
metrics = hlp.get_metrics_config()
263+
metric_params = []
264+
for key, args in metrics.items():
265+
if args:
266+
for val in args:
267+
metric_params.append((f'{key}={val}', key, val))
268+
else:
269+
metric_params.append((key, key, None))
270+
default_metrics = [name for name, _, _ in metric_params]
263271

264272
default_start_date, default_end_date = hlp.get_start_end_dates()
265273
timeframe_start = hlp.get_timeframe_beginning(default_start_date)
@@ -296,4 +304,4 @@ def plot(ledgers, metrics, aggregated_data_filename, animated):
296304
help='Flag to specify whether to also generate animated plots.'
297305
)
298306
args = parser.parse_args()
299-
plot(ledgers=args.ledgers, metrics=args.metrics, aggregated_data_filename=args.filename, animated=args.animated)
307+
plot(ledgers=args.ledgers, metrics=args.metrics, aggregated_data_filename=args.filename, animated=args.animated, results_dir=hlp.RESULTS_DIR)

docs/aggregator.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# Aggregator
22

3-
The aggregator obtains the mapped data of a ledger (from `output/<project_name>/mapped_data.json`) and aggregates it
4-
over units of time that are determined based on the given `timeframe` and `aggregate_by` parameters.
3+
The aggregator obtains the mapped data of a ledger (from `processed_data/<project_name>/mapped_data_<clustered|non_clustered>.json`,
depending on the clustering flag) and
aggregates it over units of time that are determined based on the given `timeframe` and `aggregate_by` parameters.
55
It then outputs a `csv` file with the distribution of blocks to entities for each time unit under consideration.
6-
This file is saved in the directory `output/<project name>/blocks_per_entity/` and is named based on the `timeframe`
7-
and `aggregate_by` parameters.
6+
This file is saved in the directory `processed_data/<project name>/blocks_per_entity_<clustered|non_clustered>/`
(depending on the clustering flag) and is named based on the
`timeframe` and `aggregate_by` parameters.
88
For example, if the specified timeframe is from June 2023 to September 2023 and the aggregation is by month, then
99
the output file would be named `monthly_from_2023-06-01_to_2023-09-30.csv` and would be structured as follows:
1010
```

docs/mappings.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ A mapping is responsible for linking blocks to the entities that created them. W
44
information about the addresses that received rewards for producing some block or identifiers that are related to them,
55
it does not contain information about the entities that control these addresses, which is where the mapping comes in.
66

7-
The mapping takes as input the parsed data and outputs a file (`output/<project_name>/mapped_data.json`), which is
8-
structured as follows:
7+
The mapping takes as input the parsed data and outputs a file (`processed_data/<project_name>/mapped_data_<clustered|non_clustered>.json`,
depending on the clustering flag),
which is structured as follows:
99

1010
```
1111
[

docs/setup.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@ specified ledger. By default, this flag is set to False and no plots are generat
6565
is set to False and no animated plots are generated. Note that this flag is ignored if `--plot` is set to False.
6666

6767

68-
All output files can then be found under the `output/` directory, which is automatically created the first time the tool
69-
is run.
68+
All output files can then be found under the `results/` directory, which is automatically created the first time the
69+
tool is run. Interim files that are produced by some modules and are used by others can be found under the
70+
`processed_data/` directory.

run.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@
1010

1111

1212
def process_data(force_map, ledger_dir, ledger, output_dir):
13-
mapped_data_file = ledger_dir / 'mapped_data.json'
13+
clustering_flag = hlp.get_clustering_flag()
14+
mapped_data_file = ledger_dir / hlp.get_mapped_data_filename(clustering_flag)
1415
if force_map or not mapped_data_file.is_file():
1516
parsed_data = parse(ledger, input_dir=hlp.RAW_DATA_DIR)
16-
apply_mapping(ledger, parsed_data=parsed_data, output_dir=output_dir)
17+
return apply_mapping(ledger, parsed_data=parsed_data, output_dir=output_dir)
18+
return None
1719

1820

19-
def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT_DIR):
21+
def main(ledgers, timeframe, estimation_window, frequency, interim_dir=hlp.INTERIM_DIR,
22+
results_dir=hlp.RESULTS_DIR):
2023
"""
2124
Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
2225
:param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
@@ -27,41 +30,49 @@ def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT
2730
:param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
2831
of days between each data point considered in the analysis). If None, only one data point will be considered,
2932
spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
30-
:param output_dir: pathlib.PosixPath object of the directory where the output data will be saved
33+
:param interim_dir: pathlib.PosixPath object of the directory where the output data will be saved
3134
"""
3235
logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
3336

3437
force_map = hlp.get_force_map_flag()
3538

3639
for ledger in ledgers:
37-
ledger_dir = output_dir / ledger
40+
ledger_dir = interim_dir / ledger
3841
ledger_dir.mkdir(parents=True, exist_ok=True) # create ledger output directory if it doesn't already exist
3942

40-
process_data(force_map, ledger_dir, ledger, output_dir)
43+
mapped_data = process_data(force_map, ledger_dir, ledger, interim_dir)
4144

4245
aggregate(
4346
ledger,
44-
output_dir,
47+
interim_dir,
4548
timeframe,
4649
estimation_window,
4750
frequency,
48-
force_map
51+
force_map,
52+
mapped_data=mapped_data
4953
)
5054

5155
aggregated_data_filename = hlp.get_blocks_per_entity_filename(timeframe, estimation_window, frequency)
56+
metrics_dir = results_dir / 'metrics'
57+
metrics_dir.mkdir(parents=True, exist_ok=True)
5258

5359
used_metrics = analyze(
5460
projects=ledgers,
5561
aggregated_data_filename=aggregated_data_filename,
56-
output_dir=output_dir
62+
input_dir=interim_dir,
63+
output_dir=metrics_dir
5764
)
5865

5966
if hlp.get_plot_flag():
67+
figures_dir = results_dir / 'figures'
68+
figures_dir.mkdir(parents=True, exist_ok=True)
6069
plot(
6170
ledgers=ledgers,
6271
metrics=used_metrics,
6372
aggregated_data_filename=aggregated_data_filename,
64-
animated=hlp.get_plot_config_data()['animated']
73+
animated=hlp.get_plot_config_data()['animated'],
74+
metrics_dir=metrics_dir,
75+
figures_dir=figures_dir
6576
)
6677

6778

@@ -70,6 +81,9 @@ def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT
7081

7182
estimation_window, frequency = hlp.get_estimation_window_and_frequency()
7283

84+
results_dir = hlp.get_results_dir(estimation_window, frequency)
85+
results_dir.mkdir(parents=True, exist_ok=True)
86+
7387
start_date, end_date = hlp.get_start_end_dates()
7488
timeframe_start = hlp.get_timeframe_beginning(start_date)
7589
timeframe_end = hlp.get_timeframe_end(end_date)
@@ -78,6 +92,6 @@ def main(ledgers, timeframe, estimation_window, frequency, output_dir=hlp.OUTPUT
7892
'the first date.')
7993
timeframe = (timeframe_start, timeframe_end)
8094

81-
main(ledgers, timeframe, estimation_window, frequency)
95+
main(ledgers, timeframe, estimation_window, frequency, results_dir=results_dir)
8296

8397
logging.info('Done. Please check the output directory for results.')

tests/test_aggregate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import shutil
44
import pytest
5-
from consensus_decentralization.helper import OUTPUT_DIR
5+
from consensus_decentralization.helper import INTERIM_DIR
66
from consensus_decentralization.aggregate import aggregate, Aggregator, divide_timeframe
77

88

@@ -14,7 +14,7 @@ def setup_and_cleanup():
1414
after (cleanup)
1515
"""
1616
# Set up
17-
test_io_dir = OUTPUT_DIR / "test_output"
17+
test_io_dir = INTERIM_DIR / "test_output"
1818
yield test_io_dir
1919
# Clean up
2020
shutil.rmtree(test_io_dir)

tests/test_analyze.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import shutil
22
import pytest
3-
from consensus_decentralization.helper import OUTPUT_DIR
3+
from consensus_decentralization.helper import INTERIM_DIR
44
from consensus_decentralization.analyze import analyze
55

66

@@ -12,7 +12,7 @@ def setup_and_cleanup():
1212
after (cleanup)
1313
"""
1414
# Set up
15-
test_io_dir = OUTPUT_DIR / "test_output"
15+
test_io_dir = INTERIM_DIR / "test_output"
1616
test_bitcoin_dir = test_io_dir / "sample_bitcoin"
1717
test_bitcoin_dir.mkdir(parents=True, exist_ok=True)
1818
# create files that would be the output of aggregation

tests/test_end_to_end.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from consensus_decentralization.map import ledger_mapping
1010
from consensus_decentralization.mappings.default_mapping import DefaultMapping
1111
from consensus_decentralization.mappings.cardano_mapping import CardanoMapping
12-
from consensus_decentralization.helper import OUTPUT_DIR, config
12+
from consensus_decentralization.helper import INTERIM_DIR, config
1313
import pytest
1414

1515

@@ -21,7 +21,7 @@ def setup_and_cleanup():
2121
after (cleanup)
2222
"""
2323
# Set up
24-
test_output_dir = OUTPUT_DIR / "test_output"
24+
test_output_dir = INTERIM_DIR / "test_output"
2525
ledger_mapping['sample_bitcoin'] = DefaultMapping
2626
ledger_parser['sample_bitcoin'] = DefaultParser
2727
ledger_mapping['sample_cardano'] = CardanoMapping
@@ -81,7 +81,7 @@ def test_end_to_end(setup_and_cleanup):
8181
(datetime.date(2010, 1, 1), datetime.date(2010, 12, 31)),
8282
estimation_window=None,
8383
frequency=None,
84-
output_dir=test_output_dir
84+
interim_dir=test_output_dir
8585
)
8686

8787
expected_entropy = [
@@ -116,7 +116,7 @@ def test_end_to_end(setup_and_cleanup):
116116
(datetime.date(2018, 2, 1), datetime.date(2018, 3, 31)),
117117
estimation_window=30,
118118
frequency=30,
119-
output_dir=test_output_dir
119+
interim_dir=test_output_dir
120120
)
121121

122122
expected_entropy = [
@@ -154,7 +154,7 @@ def test_end_to_end(setup_and_cleanup):
154154
(datetime.date(2020, 12, 1), datetime.date(2020, 12, 31)),
155155
estimation_window=31,
156156
frequency=31,
157-
output_dir=test_output_dir
157+
interim_dir=test_output_dir
158158
)
159159

160160
expected_entropy = [

0 commit comments

Comments
 (0)