Skip to content

Commit a534b16

Browse files
Merge pull request #56 from Blockchain-Technology-Lab/performance
Memory consumption reduction
2 parents 0eb7019 + a1d42c8 commit a534b16

File tree

9 files changed

+70
-72
lines changed

9 files changed

+70
-72
lines changed

data_collection_scripts/big_query_balance_data.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""
2-
This script can be used to run queries on BigQuery for any number of blockchains, and save the results in the input
3-
directory of the project.
4-
The relevant queries must be stored in a file named 'queries.yaml' in the root directory of the project.
2+
This script can be used to run queries on BigQuery for any number of blockchains,
3+
and save the results in the input directory of the project.
4+
The relevant queries must be stored in a file named 'queries.yaml'
5+
in the data_collection_scripts directory of the project.
56
67
Attention! Before running this script, you need to generate service account credentials from Google, as described
78
here (https://developers.google.com/workspace/guides/create-credentials#service-account) and save your key in the
@@ -22,7 +23,7 @@ def collect_data(ledgers, snapshot_dates, force_query):
2223
if not input_dir.is_dir():
2324
input_dir.mkdir()
2425

25-
with open(root_dir / "queries.yaml") as f:
26+
with open(root_dir / "data_collection_scripts/queries.yaml") as f:
2627
queries = safe_load(f)
2728

2829
i = 0
@@ -44,7 +45,7 @@ def collect_data(ledgers, snapshot_dates, force_query):
4445

4546
while True:
4647
try:
47-
client = bq.Client.from_service_account_json(json_credentials_path=root_dir / f"google-service-account-key-{i}.json")
48+
client = bq.Client.from_service_account_json(json_credentials_path=root_dir / f"data_collection_scripts/google-service-account-key-{i}.json")
4849
except FileNotFoundError:
4950
logging.info(f'Exhausted all {i} service account keys. Aborting..')
5051
all_quota_exceeded = True

google-service-account-key-SAMPLE.json renamed to data_collection_scripts/google-service-account-key-SAMPLE.json

File renamed without changes.

tests/test_analyze.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ def test_analyze_snapshot(mocker):
9090
output = analyze_snapshot(None, 'bitcoin', '2010-01-01')
9191
assert output == {'top-1_absolute exclude_below_fees exclude_contracts non-clustered hhi': 1}
9292

93-
get_clustered_entries_mock.return_value = [['entity', 4], ['entity 2', 4]]
94-
get_nonclustered_entries_mock.return_value = [['address', 4], ['address 2', 4]]
93+
get_clustered_entries_mock.return_value = [[4, ], [4, ]]
94+
get_nonclustered_entries_mock.return_value = [[4, ], [4, ]]
9595

9696
get_force_analyze_mock.return_value = True
9797

tests/test_helper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ def test_get_top_limit_value(mocker):
238238

239239

240240
def test_get_circulation_from_entries():
241-
entries = [['i0', 10], ['i1', 11]]
241+
entries = [[10, ], [11, ]]
242242
circulation = hlp.get_circulation_from_entries(entries)
243243
assert circulation == 21
244244

tests/test_metrics.py

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,51 +3,51 @@
33

44

55
def test_tau_50():
6-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
6+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
77
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=6, threshold=0.5)
88
assert tau_index == 1
99
assert tau_market_share == 0.5
1010

11-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
11+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
1212
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=9, threshold=0.5)
1313
assert tau_index == 2
1414
assert round(tau_market_share, 2) == 0.56
1515

16-
tokens_per_entity = [('a', 1)]
16+
tokens_per_entity = [(1, )]
1717
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=1, threshold=0.5)
1818
assert tau_index == 1
1919
assert tau_market_share == 1
2020

2121

2222
def test_tau_33():
23-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
23+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
2424
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=6, threshold=0.33)
2525
assert tau_index == 1
2626
assert tau_market_share == 0.5
2727

28-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
28+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
2929
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=9, threshold=0.33)
3030
assert tau_index == 1
3131
assert round(tau_market_share, 2) == 0.33
3232

33-
tokens_per_entity = [('a', 1)]
33+
tokens_per_entity = [(1, )]
3434
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=1, threshold=0.33)
3535
assert tau_index == 1
3636
assert tau_market_share == 1
3737

3838

3939
def test_tau_66():
40-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
40+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
4141
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=6, threshold=0.66)
4242
assert tau_index == 2
4343
assert round(tau_market_share, 2) == 0.83
4444

45-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
45+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
4646
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=9, threshold=0.66)
4747
assert tau_index == 3
4848
assert round(tau_market_share, 2) == 0.67
4949

50-
tokens_per_entity = [('a', 1)]
50+
tokens_per_entity = [(1, )]
5151
tau_index, tau_market_share = compute_tau(tokens_per_entity, circulation=1, threshold=0.66)
5252
assert tau_index == 1
5353
assert tau_market_share == 1
@@ -58,19 +58,19 @@ def test_gini():
5858
Ensure that the results of the compute_gini function are consistent with online calculators,
5959
such as https://goodcalculators.com/gini-coefficient-calculator/ (5 decimal accuracy)
6060
"""
61-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
61+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
6262
gini = compute_gini(tokens_per_entity, circulation=6)
6363
assert round(gini, 5) == 0.22222
6464

65-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
65+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
6666
gini = compute_gini(tokens_per_entity, circulation=9)
6767
assert round(gini, 5) == 0.24074
6868

69-
tokens_per_entity = [('a', 1)]
69+
tokens_per_entity = [(1, )]
7070
gini = compute_gini(tokens_per_entity, circulation=1)
7171
assert gini == 0
7272

73-
tokens_per_entity = [('a', 1), ('b', 1), ('c', 1)]
73+
tokens_per_entity = [(1, ), (1, ), (1, )]
7474
gini = compute_gini(tokens_per_entity, circulation=3)
7575
assert round(gini, 5) == 0 # Note that this test case fails if we don't round, because of floating point errors
7676

@@ -80,19 +80,19 @@ def test_hhi():
8080
Ensure that the results of the compute_hhi function are consistent with online calculators,
8181
such as https://www.unclaw.com/chin/teaching/antitrust/herfindahl.htm
8282
"""
83-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
83+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
8484
hhi = compute_hhi(tokens_per_entity, circulation=6)
8585
assert round(hhi) == 3889
8686

87-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
87+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
8888
hhi = compute_hhi(tokens_per_entity, circulation=9)
8989
assert round(hhi) == 2099
9090

91-
tokens_per_entity = [('a', 1)]
91+
tokens_per_entity = [(1, )]
9292
hhi = compute_hhi(tokens_per_entity, circulation=1)
9393
assert round(hhi) == 10000
9494

95-
tokens_per_entity = [('a', 1), ('b', 1), ('c', 1)]
95+
tokens_per_entity = [(1, ), (1, ), (1, )]
9696
hhi = compute_hhi(tokens_per_entity, circulation=3)
9797
assert round(hhi) == 3333
9898

@@ -102,51 +102,51 @@ def test_shannon_entropy():
102102
Ensure that the results of the compute_shannon_entropy function are consistent with online calculators,
103103
such as: https://www.omnicalculator.com/statistics/shannon-entropy
104104
"""
105-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
105+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
106106
entropy = compute_shannon_entropy(tokens_per_entity, circulation=6)
107107
assert round(entropy, 3) == 1.459
108108

109-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
109+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
110110
entropy = compute_shannon_entropy(tokens_per_entity, circulation=9)
111111
assert round(entropy, 3) == 2.419
112112

113-
tokens_per_entity = [('a', 1)]
113+
tokens_per_entity = [(1, )]
114114
entropy = compute_shannon_entropy(tokens_per_entity, circulation=1)
115115
assert entropy == 0
116116

117-
tokens_per_entity = [('a', 1), ('b', 1), ('c', 1)]
117+
tokens_per_entity = [(1, ), (1, ), (1, )]
118118
entropy = compute_shannon_entropy(tokens_per_entity, circulation=3)
119119
assert round(entropy, 3) == 1.585
120120

121121

122122
def test_total_entities():
123-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
123+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
124124
total_entities = compute_total_entities(tokens_per_entity, circulation=6)
125125
assert total_entities == 3
126126

127-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
127+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
128128
total_entities = compute_total_entities(tokens_per_entity, circulation=9)
129129
assert total_entities == 6
130130

131-
tokens_per_entity = [('a', 1)]
131+
tokens_per_entity = [(1, )]
132132
total_entities = compute_total_entities(tokens_per_entity, circulation=1)
133133
assert total_entities == 1
134134

135135

136136
def test_compute_max_power_ratio():
137-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
137+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
138138
max_mpr = compute_max_power_ratio(tokens_per_entity, circulation=6)
139139
assert max_mpr == 0.5
140140

141-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
141+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
142142
max_mpr = compute_max_power_ratio(tokens_per_entity, circulation=9)
143143
assert max_mpr == 1 / 3
144144

145-
tokens_per_entity = [('a', 1)]
145+
tokens_per_entity = [(1, )]
146146
max_mpr = compute_max_power_ratio(tokens_per_entity, circulation=1)
147147
assert max_mpr == 1
148148

149-
tokens_per_entity = [('a', 1), ('b', 1), ('c', 1)]
149+
tokens_per_entity = [(1, ), (1, ), (1, )]
150150
max_mpr = compute_max_power_ratio(tokens_per_entity, circulation=3)
151151
assert max_mpr == 1 / 3
152152

@@ -158,22 +158,22 @@ def test_compute_theil_index():
158158
"""
159159
decimals = 3
160160

161-
tokens_per_entity = [('a', 3.0), ('b', 2), ('c', 1)]
161+
tokens_per_entity = [(3.0, ), (2, ), (1, )]
162162
theil_t = compute_theil_index(tokens_per_entity, 6)
163163
assert round(theil_t, decimals) == 0.087
164164

165-
tokens_per_entity = [('a', 3), ('b', 2), ('c', 1), ('d', 1), ('e', 1), ('f', 1)]
165+
tokens_per_entity = [(3, ), (2, ), (1, ), (1, ), (1, ), (1, )]
166166
theil_t = compute_theil_index(tokens_per_entity, 9)
167167
assert round(theil_t, decimals) == 0.115
168168

169-
tokens_per_entity = {('a', 432), ('b', 0), ('c', 0), ('d', 0)}
169+
tokens_per_entity = [(432, ), (0, ), (0, ), (0, )]
170170
theil_t = compute_theil_index(tokens_per_entity, 432)
171171
assert round(theil_t, decimals) == 1.386
172172

173-
tokens_per_entity = {('a', 432)}
173+
tokens_per_entity = [(432, )]
174174
theil_t = compute_theil_index(tokens_per_entity, 432)
175175
assert round(theil_t, decimals) == 0
176176

177-
tokens_per_entity = {}
177+
tokens_per_entity = []
178178
theil_t = compute_theil_index(tokens_per_entity, 432)
179179
assert theil_t == 0

tokenomics_decentralization/db_helper.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def get_non_clustered_balance_entries(conn, snapshot, ledger, balance_threshold)
154154

155155
start = time()
156156
query = f'''
157-
SELECT addresses.name, balance
157+
SELECT balance
158158
FROM balances
159159
LEFT JOIN addresses ON balances.address_id=addresses.id
160160
WHERE snapshot_id=?
@@ -187,15 +187,19 @@ def get_balance_entries(conn, snapshot, ledger, balance_threshold):
187187

188188
start = time()
189189
query = f'''
190-
SELECT IFNULL(entities.name, addresses.name) AS entity, SUM(CAST(balance AS REAL)) AS aggregate_balance
191-
FROM balances
192-
LEFT JOIN addresses ON balances.address_id=addresses.id
193-
LEFT JOIN entities ON addresses.entity_id=entities.id
194-
WHERE snapshot_id=?
195-
{exclude_below_threshold_clause}
196-
{exclude_contract_addresses_clause}
197-
{special_addresses_clause}
198-
GROUP BY entity
190+
WITH entries AS (
191+
SELECT IFNULL(entities.name, addresses.name) AS entity, SUM(CAST(balance AS REAL)) AS aggregate_balance
192+
FROM balances
193+
LEFT JOIN addresses ON balances.address_id=addresses.id
194+
LEFT JOIN entities ON addresses.entity_id=entities.id
195+
WHERE snapshot_id=?
196+
{exclude_below_threshold_clause}
197+
{exclude_contract_addresses_clause}
198+
{special_addresses_clause}
199+
GROUP BY entity
200+
)
201+
SELECT aggregate_balance
202+
FROM entries
199203
ORDER BY aggregate_balance DESC
200204
'''
201205

tokenomics_decentralization/helper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def get_circulation_from_entries(entries):
292292
"""
293293
circulation = 0
294294
for entry in entries:
295-
circulation += int(entry[1])
295+
circulation += int(entry[0])
296296
return circulation
297297

298298

0 commit comments

Comments (0)