Skip to content

Commit f1e36d9

Browse files
Output restructuring (#106)
* Save mapped data in dedicated dir * Make plotting less verbose * Update mapping doc * Stop generating mapped files for timeframes with no data * Update tests + analyze fix * Fix flake8 issues * Add newline
1 parent f890368 commit f1e36d9

File tree

10 files changed

+98
-67
lines changed

10 files changed

+98
-67
lines changed

consensus_decentralization/analyze.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,25 +36,31 @@ def analyze(projects, timeframes, output_dir):
3636
if timeframe not in csv_contents[metric].keys():
3737
csv_contents[metric][timeframe] = timeframe
3838

39-
# Get mapped data for the year that corresponds to the timeframe.
39+
# Get mapped data for the year that corresponds to the timeframe, if such data exists
4040
# This is needed because the Gini coefficient is computed over all entities per each year.
4141
year = timeframe[:4]
4242
yearly_entities = set()
4343
yearly_entity_groups = set()
44-
with open(output_dir / f'{project}/{year}.csv') as f:
45-
for line in f.readlines()[1:]:
46-
entity_group, entity, _ = line.split(',')
47-
yearly_entities.add(entity)
48-
yearly_entity_groups.add(entity_group)
49-
50-
# Get mapped data for the defined timeframe.
51-
with open(output_dir / f'{project}/{timeframe}.csv') as f:
52-
blocks_per_entity = {}
53-
blocks_per_entity_group = defaultdict(int, {'Unknown': 0})
54-
for line in f.readlines()[1:]:
55-
entity_group, entity, resources = line.split(',')
56-
blocks_per_entity[entity] = int(resources)
57-
blocks_per_entity_group[entity_group] += int(resources)
44+
try:
45+
with open(output_dir / f'{project}/mapped_data/{year}.csv') as f:
46+
for line in f.readlines()[1:]:
47+
entity_group, entity, _ = line.split(',')
48+
yearly_entities.add(entity)
49+
yearly_entity_groups.add(entity_group)
50+
except FileNotFoundError:
51+
pass
52+
53+
blocks_per_entity = {}
54+
blocks_per_entity_group = defaultdict(int, {'Unknown': 0})
55+
try:
56+
# Get mapped data for the defined timeframe, if such data exists
57+
with open(output_dir / f'{project}/mapped_data/{timeframe}.csv') as f:
58+
for line in f.readlines()[1:]:
59+
entity_group, entity, resources = line.split(',')
60+
blocks_per_entity[entity] = int(resources)
61+
blocks_per_entity_group[entity_group] += int(resources)
62+
except FileNotFoundError:
63+
pass
5864

5965
results = {}
6066
results_unknowns_grouped = {}

consensus_decentralization/map.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ def apply_mapping(project, timeframes, output_dir, force_map):
3636

3737
computed_yearly_mappings = set() # Keep track of computed yearly mappings to avoid recomputing them in the same run
3838
for timeframe in timeframes:
39-
output_file = project_output_dir / f'{timeframe}.csv'
39+
output_file = mapping.mapped_data_dir / f'{timeframe}.csv'
4040
if not output_file.is_file() or force_map:
4141
mapping.perform_mapping(timeframe)
4242

4343
# Get mapped data for the year that corresponds to the timeframe.
4444
# This is needed because the Gini coefficient is computed over all entities per each year.
4545
year = timeframe[:4]
46-
year_file = project_output_dir / f'{year}.csv'
46+
year_file = mapping.mapped_data_dir / f'{year}.csv'
4747
if not year_file.is_file() or (force_map and year not in computed_yearly_mappings):
4848
mapping.perform_mapping(year)
4949
computed_yearly_mappings.add(year)

consensus_decentralization/mappings/cardano_mapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,6 @@ def process(self, timeframe):
7171
blocks_per_entity[entity.replace(',', '')] += 1
7272

7373
groups = self.map_block_creators_to_groups(blocks_per_entity.keys())
74-
hlp.write_blocks_per_entity_to_file(self.io_dir, blocks_per_entity, groups, timeframe)
74+
hlp.write_blocks_per_entity_to_file(self.mapped_data_dir, blocks_per_entity, groups, timeframe)
7575

7676
return blocks_per_entity

consensus_decentralization/mappings/default_mapping.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ class DefaultMapping:
99
methods must use a mapping class that inherits from this one.
1010
1111
:ivar project_name: the name of the project associated with a specific mapping instance
12-
:ivar io_dir: the directory that includes the parsed data related to the project
12+
:ivar parsed_data_dir: the directory that includes the parsed data related to the project
13+
:ivar mapped_data_dir: the directory to save the mapped data files in
14+
:ivar multi_pool_dir: the directory to save the multi pool data files in
1315
:ivar dataset: a dictionary with the parsed data of the project
1416
:ivar special_addresses: a set with the special addresses of the project (addresses that don't count in the
1517
context of our analysis)
@@ -23,7 +25,11 @@ class DefaultMapping:
2325

2426
def __init__(self, project_name, io_dir):
2527
self.project_name = project_name
26-
self.io_dir = io_dir
28+
self.parsed_data_dir = io_dir
29+
self.mapped_data_dir = io_dir / 'mapped_data'
30+
self.mapped_data_dir.mkdir(parents=True, exist_ok=True)
31+
self.multi_pool_dir = io_dir / 'multi_pool_data'
32+
self.multi_pool_dir.mkdir(parents=True, exist_ok=True)
2733
self.dataset = None
2834
self.special_addresses = hlp.get_special_addresses(project_name)
2935
self.known_addresses = hlp.get_known_addresses(project_name)
@@ -47,7 +53,7 @@ def read_project_data(self):
4753
Reads the parsed data from the directory specified by the instance
4854
:returns: a dictionary with the parsed data
4955
"""
50-
with open(self.io_dir / 'parsed_data.json') as f:
56+
with open(self.parsed_data_dir / 'parsed_data.json') as f:
5157
data = json.load(f)
5258
return data
5359

@@ -137,11 +143,11 @@ def write_multi_pool_files(self, timeframe):
137143
with multiple pools, if any such blocks/addresses were found for the project
138144
"""
139145
if self.multi_pool_addresses:
140-
with open(self.io_dir / f'multi_pool_addresses_{timeframe}.csv', 'w') as f:
146+
with open(self.multi_pool_dir / f'multi_pool_addresses_{timeframe}.csv', 'w') as f:
141147
f.write('Block No,Timestamp,Address,Entity\n' + '\n'.join(self.multi_pool_addresses))
142148

143149
if self.multi_pool_blocks:
144-
with open(self.io_dir / f'multi_pool_blocks_{timeframe}.csv', 'w') as f:
150+
with open(self.multi_pool_dir / f'multi_pool_blocks_{timeframe}.csv', 'w') as f:
145151
f.write('Block No,Timestamp,Entities\n' + '\n'.join(self.multi_pool_blocks))
146152

147153
def process(self, timeframe):
@@ -168,7 +174,8 @@ def process(self, timeframe):
168174
blocks_per_entity[entity.replace(',', '')] += 1
169175

170176
groups = self.map_block_creators_to_groups(blocks_per_entity.keys())
171-
hlp.write_blocks_per_entity_to_file(self.io_dir, blocks_per_entity, groups, timeframe)
177+
if len(blocks_per_entity) > 0:
178+
hlp.write_blocks_per_entity_to_file(self.mapped_data_dir, blocks_per_entity, groups, timeframe)
172179

173180
if len(timeframe) == 4: # If timeframe is a year, also write multi-pool addresses and blocks to file
174181
self.write_multi_pool_files(timeframe)

consensus_decentralization/mappings/dummy_mapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,6 @@ def process(self, timeframe):
2222

2323
blocks_per_entity[entity] += 1
2424

25-
write_blocks_per_entity_to_file(self.io_dir, blocks_per_entity, blocks_per_entity.keys, timeframe)
25+
write_blocks_per_entity_to_file(self.mapped_data_dir, blocks_per_entity, blocks_per_entity.keys, timeframe)
2626

2727
return blocks_per_entity

consensus_decentralization/plot.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,6 @@ def plot_animated_stack_area_chart(values, execution_id, path, ylabel, legend_la
131131

132132
def plot_dynamics_per_ledger(ledgers, top_k=-1, animated=False, legend=False):
133133
for ledger in ledgers:
134-
logging.info(f"Plotting {'(animated)' if animated else ''} {ledger} data..")
135134
path = hlp.OUTPUT_DIR / ledger
136135
figures_path = path / 'figures'
137136
if not figures_path.is_dir():
@@ -146,7 +145,10 @@ def plot_dynamics_per_ledger(ledgers, top_k=-1, animated=False, legend=False):
146145
for month in range(1, 13):
147146
timeframe = f'{year}-0{month}' if month < 10 else f'{year}-{month}'
148147
filename = f'{timeframe}.csv'
149-
blocks = hlp.get_blocks_per_entity_from_file(path / filename)
148+
file = path / "mapped_data" / filename
149+
if not file.is_file():
150+
continue # Only plot timeframes for which mapped data exist
151+
blocks = hlp.get_blocks_per_entity_from_file(file)
150152
total_blocks = sum(blocks.values())
151153
if total_blocks == 0:
152154
continue
@@ -207,7 +209,6 @@ def plot_dynamics_per_ledger(ledgers, top_k=-1, animated=False, legend=False):
207209

208210
def plot_comparative_metrics(ledgers, metrics, animated=False):
209211
for metric in metrics:
210-
logging.info(f"Plotting {'(animated)' if animated else ''} {metric}..")
211212
figures_path = hlp.OUTPUT_DIR / 'figures'
212213
if not figures_path.is_dir():
213214
figures_path.mkdir()
@@ -233,7 +234,6 @@ def plot_comparative_metrics(ledgers, metrics, animated=False):
233234

234235
def plot_confidence_intervals(ledgers, metrics):
235236
for metric in metrics:
236-
logging.info(f"Plotting {metric} (with confidence intervals)..")
237237
figures_path = hlp.OUTPUT_DIR / 'figures'
238238
if not figures_path.is_dir():
239239
figures_path.mkdir()
@@ -260,7 +260,8 @@ def plot_confidence_intervals(ledgers, metrics):
260260
plt.savefig(figures_path / filename, bbox_inches='tight')
261261

262262

263-
def plot(ledgers, metrics, animated, show_confidence=True):
263+
def plot(ledgers, metrics, animated, show_confidence=False):
264+
logging.info("Creating plots..")
264265
plot_dynamics_per_ledger(ledgers, animated=False, legend=True)
265266
plot_comparative_metrics(ledgers, metrics, animated=False)
266267
if animated:

docs/mappings.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
# Mappings
22

3-
A mapping obtains the parsed data (from `output/<project_name>/parsed_data.json`) and outputs a `csv` file that maps
4-
blocks to entities, structured as follows:
3+
A mapping obtains the parsed data of a ledger (from `output/<project_name>/parsed_data.json`) and outputs one or more
4+
`csv` files that map blocks to entities, structured as follows:
55

66
```
77
Entity,Resources
88
<name of entity>,<(int) number of blocks>
99
```
1010

11-
The name of the `csv` file is the timeframe, over which the mapping was executed (e.g., `2021-04.csv`). The file is stored in the
12-
project's output directory (`output/<project_name>/`).
11+
Specifically, if the `timeframe` argument is provided during execution, then the mapping outputs a single `csv`
12+
file that corresponds to that timeframe. Otherwise, it outputs a `csv` file for each month contained in the default
13+
time range (as specified in the [config file](https://github.com/Blockchain-Technology-Lab/pooling-analysis/blob/main/config.yaml)).
14+
It also outputs a `csv` file for each year contained in the relevant time frames.
15+
16+
Each `csv` file is named after the timeframe over which the mapping was executed (e.g., `2021-04.csv`) and is
17+
stored in a dedicated folder in the project's output directory (`output/<project_name>/mapped_data`).
1318

1419
The logic of the mapping depends on the type of clustering we want to achieve. So, different mappings will output
1520
different results, even if applied on the same data. An exception to this is the "no-cluster" mapping (DummyMapping

run.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from consensus_decentralization.analyze import analyze
55
from consensus_decentralization.parse import parse
66
from consensus_decentralization.plot import plot
7-
from consensus_decentralization.helper import valid_date, RAW_DATA_DIR, OUTPUT_DIR, get_default_ledgers, get_start_end_years
7+
from consensus_decentralization.helper import valid_date, RAW_DATA_DIR, OUTPUT_DIR, get_default_ledgers, \
8+
get_start_end_years
89

910
logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO)
1011

tests/test_4_mappings.py

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,21 @@ def test_map(setup_and_cleanup):
6767
parse(project, test_raw_data_dir, test_output_dir)
6868
apply_mapping(project, timeframes, test_output_dir, force_map)
6969

70-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
70+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
71+
assert not output_file.is_file() # since there is no data from 2010 in the sample
72+
73+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
74+
assert not yearly_output_file.is_file()
75+
76+
output_file = test_output_dir / project / f'mapped_data/{timeframes[1]}.csv'
7177
assert output_file.is_file()
7278

73-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
79+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[1][:4]}.csv'
7480
assert yearly_output_file.is_file()
7581

82+
output_file = test_output_dir / project / f'mapped_data/{timeframes[2]}.csv'
83+
assert output_file.is_file()
84+
7685
try:
7786
os.remove(str(pool_info_dir / f'clusters/{project}.json')) # Remove temp pool info file
7887
except FileNotFoundError:
@@ -131,7 +140,7 @@ def test_bitcoin_mapping(setup_and_cleanup):
131140
'GBMiners,GBMiners,2'
132141
]
133142

134-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
143+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
135144
with open(output_file) as f:
136145
for idx, line in enumerate(f.readlines()):
137146
assert line == expected_output[idx]
@@ -144,7 +153,7 @@ def test_bitcoin_mapping(setup_and_cleanup):
144153
'Unknown,1AM2fYfpY3ZeMeCKXmN66haoWxvB89pJUx,1'
145154
]
146155

147-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
156+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
148157
with open(yearly_output_file) as f:
149158
for idx, line in enumerate(f.readlines()):
150159
assert expected_output[idx] == line
@@ -161,7 +170,7 @@ def test_bitcoin_mapping(setup_and_cleanup):
161170
'Bitmain,Bitmain,1',
162171
]
163172

164-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
173+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
165174
with open(output_file) as f:
166175
for idx, line in enumerate(f.readlines()):
167176
assert expected_output[idx] == line
@@ -233,12 +242,12 @@ def test_ethereum_mapping(setup_and_cleanup):
233242
'Unknown,0x45133a7e1cc7e18555ae8a4ee632a8a61de90df6,1'
234243
]
235244

236-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
245+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
237246
with open(output_file) as f:
238247
for idx, line in enumerate(f.readlines()):
239248
assert expected_output[idx] == line
240249

241-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
250+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
242251
with open(yearly_output_file) as f:
243252
for idx, line in enumerate(f.readlines()):
244253
assert expected_output[idx] == line
@@ -254,7 +263,7 @@ def test_ethereum_mapping(setup_and_cleanup):
254263
'MEV Builder: 0x3B...436,MEV Builder: 0x3B...436,1'
255264
]
256265

257-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
266+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
258267
with open(output_file) as f:
259268
for idx, line in enumerate(f.readlines()):
260269
assert expected_output[idx] == line
@@ -313,12 +322,12 @@ def test_cardano_mapping(setup_and_cleanup):
313322
'1percentpool,1percentpool,1'
314323
]
315324

316-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
325+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
317326
with open(output_file) as f:
318327
for idx, line in enumerate(f.readlines()):
319328
assert expected_output[idx] == line
320329

321-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
330+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
322331
with open(yearly_output_file) as f:
323332
for idx, line in enumerate(f.readlines()):
324333
assert expected_output[idx] == line
@@ -385,12 +394,12 @@ def test_tezos_mapping(setup_and_cleanup):
385394
'Unknown,----- UNDEFINED MINER -----,1'
386395
]
387396

388-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
397+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
389398
with open(output_file) as f:
390399
for idx, line in enumerate(f.readlines()):
391400
assert expected_output[idx] == line
392401

393-
yearly_output_file = test_output_dir / project / f'{timeframes[0][:4]}.csv'
402+
yearly_output_file = test_output_dir / project / f'mapped_data/{timeframes[0][:4]}.csv'
394403
with open(yearly_output_file) as f:
395404
for idx, line in enumerate(f.readlines()):
396405
assert expected_output[idx] == line
@@ -406,7 +415,7 @@ def test_tezos_mapping(setup_and_cleanup):
406415
'Unknown,tz0000000000000000000000000000000000,1'
407416
]
408417

409-
output_file = test_output_dir / project / f'{timeframes[0]}.csv'
418+
output_file = test_output_dir / project / f'mapped_data/{timeframes[0]}.csv'
410419
with open(output_file) as f:
411420
for idx, line in enumerate(f.readlines()):
412421
assert expected_output[idx] == line
@@ -426,7 +435,8 @@ def test_tezos_mapping(setup_and_cleanup):
426435

427436

428437
def test_get_reward_addresses():
429-
default_mapping = DefaultMapping("sample_bitcoin", None)
438+
some_path = pathlib.Path()
439+
default_mapping = DefaultMapping("sample_bitcoin", some_path)
430440

431441
block = {
432442
"number": 625113,
@@ -465,7 +475,8 @@ def test_get_reward_addresses():
465475
reward_addresses = default_mapping.get_reward_addresses(block)
466476
assert reward_addresses is None
467477

468-
eth_mapping = EthereumMapping("sample_ethereum", None)
478+
some_path = pathlib.Path()
479+
eth_mapping = EthereumMapping("sample_ethereum", some_path)
469480
block = {
470481
"number": 6982695,
471482
"timestamp": "2018-12-31 00:00:12+00:00",
@@ -477,7 +488,8 @@ def test_get_reward_addresses():
477488

478489

479490
def test_from_known_addresses():
480-
cardano_mapping = CardanoMapping("sample_cardano", None)
491+
some_path = pathlib.Path()
492+
cardano_mapping = CardanoMapping("sample_cardano", some_path)
481493

482494
block = {
483495
"number": 92082690,

0 commit comments

Comments
 (0)