|
1 | | -# cspell: words cloudscraper |
| 1 | +# cspell: words BLOCKFROST |
2 | 2 |
|
3 | 3 | """ |
4 | | -This script is a simple web scraper tool to prepare testing data for the `cardano/asset` endpoint. |
5 | | -
|
6 | | -Prerequisites before running this script: |
7 | | -- Make sure that you have `BeautifulSoup`, `cloudscraper`, and `certifi` installed. |
8 | | -- Make sure that you have a snapshot file available for this script, can get one from the `catalyst-storage` repo. |
9 | | -- Fill your own client params to `CF_CLEARANCE` and `USER_AGENT`. Other variables can be configured to fit the need. |
| 4 | +This script is a simple tool to prepare testing data for the `cardano/asset` endpoint. |
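| | + |
| | +Prerequisites before running this script: |
| | +- Make sure that you have `requests` and `loguru` installed. |
| | +- Make sure that you have a snapshot file available for this script; one can be obtained from the `catalyst-storage` repo. |
| | +- Set the following environment variables (illustrative values shown): |
| | +    CARDANO_ASSETS_INPUT_FILE=./snapshot-80000000-preprod.json |
| | +    CARDANO_ASSETS_OUTPUT_FILE=./cardano-asset-80000000-preprod.json |
| | +    BLOCKFROST_TOKEN=<your blockfrost.io project token> |
| | +    CARDANO_NETWORK=preprod |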
10 | 5 | """ |
11 | 6 |
|
12 | 7 | import json |
13 | 8 | import os |
14 | | -import time |
15 | | -import cloudscraper |
16 | | -import certifi |
17 | | -from decimal import Decimal |
| 9 | +import requests |
| 10 | +from loguru import logger |
18 | 11 |
|
19 | 12 | from utils import address |
20 | | -from bs4 import BeautifulSoup |
21 | | - |
22 | | -# ----- variables ----- |
23 | | -MAX_ATTEMPT = 3 |
24 | | - |
25 | | -# provide yours here, can acquire `CF_CLEARANCE` by going to `https://preprod.cexplorer.io`, |
26 | | -# and extract this field from the cookies header using the `network` tab |
27 | | -CF_CLEARANCE = "" |
28 | | - |
29 | | -# can be something like "Mozilla/5.0" |
30 | | -USER_AGENT = "" |
31 | | - |
32 | | -# relative path to this script file for the output |
33 | | -OUT_FILE = "./cardano-asset-80000000-preprod.json" |
34 | | - |
35 | | -# the snapshot file to read as a reference of scraping |
36 | | -IN_FILE = "./snapshot-80000000-preprod.json" |
37 | | - |
38 | | - |
39 | | -# ----- functions ----- |
40 | | -def request(url) -> str: |
41 | | - scraper = cloudscraper.create_scraper() |
42 | | - |
43 | | - response = scraper.get( |
44 | | - url, |
45 | | - headers={"User-Agent": USER_AGENT}, |
46 | | - cookies={"cf_clearance": CF_CLEARANCE}, |
47 | | - verify=certifi.where(), |
48 | | - ) |
49 | | - |
50 | | - if response.status_code != 200: |
51 | | - raise Exception(response.text) |
52 | | - |
53 | | - return response.text |
54 | | - |
55 | | - |
56 | | -def get_stake_asset_page(stake_addr: str) -> str: |
57 | | - return request(f"https://preprod.cexplorer.io/stake/{stake_addr}/asset") |
58 | 13 |
|
| 14 | +# path to the output file with the prepared test data, taken from the environment |
| 15 | +OUT_FILE = os.environ["CARDANO_ASSETS_OUTPUT_FILE"] |
59 | 16 |
|
60 | | -def get_stake_data_page(stake_addr: str) -> str: |
61 | | - return request(f"https://preprod.cexplorer.io/stake/{stake_addr}") |
| 17 | +# the snapshot file providing the list of stake addresses to process, taken from the environment |
| 18 | +IN_FILE = os.environ["CARDANO_ASSETS_INPUT_FILE"] |
62 | 19 |
|
| 20 | +# blockfrost.io project token, passed as the "project_id" request header |
| 21 | +BLOCKFROST_TOKEN = os.environ["BLOCKFROST_TOKEN"] |
63 | 22 |
|
64 | | -def get_index_page() -> str: |
65 | | - return request("https://preprod.cexplorer.io/") |
| 23 | +# cardano network type |
| 24 | +CARDANO_NETWORK = os.environ["CARDANO_NETWORK"] |
66 | 25 |
|
| 26 | +BLOCKFROST_URL = f"https://cardano-{CARDANO_NETWORK}.blockfrost.io/api/v0" |
67 | 27 |
|
68 | | -def get_asset_page(asset: str) -> str: |
69 | | - return request(f"https://preprod.cexplorer.io/asset/{asset}") |
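| | +# process at most RECORDS_LIMIT snapshot records per run, starting at offset START_POSITION |
| | +# (bump START_POSITION to continue from where a previous run stopped; existing output is preserved) |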
| 28 | +RECORDS_LIMIT = 100 |
| 29 | +START_POSITION = 0 |
70 | 30 |
|
71 | 31 |
|
72 | | -def epoch_2_slot(epoch: int) -> int: |
73 | | - shelley_start_epoch = 208 |
74 | | - shelley_start_slot = 88_416_000 |
75 | | - slots_per_epoch = 432_000 |
| 32 | +def get_request(s: requests.Session, url: str): |
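| | +    """GET the given Blockfrost endpoint and return the parsed JSON body, or None if the resource was not found (404).""" |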
| 33 | + resp = s.get(url=url, headers={"project_id": BLOCKFROST_TOKEN}) |
| 34 | + if resp.status_code == 404: |
| 35 | + return None |
| 36 | + assert resp.status_code == 200, f"req: {url}, resp: {resp.text}" |
| 37 | + return resp.json() |
76 | 38 |
|
77 | | - if epoch < shelley_start_epoch: |
78 | | - raise Exception("Epochs before 208 (Byron era) have a different slot timing") |
79 | | - |
80 | | - return shelley_start_slot + (epoch - shelley_start_epoch) * slots_per_epoch |
81 | 39 |
|
82 | 40 | # ----- process ----- |
83 | 41 |
|
84 | 42 | # read the snapshot file |
85 | | -snapshot_path = os.path.join(os.path.dirname(__file__), IN_FILE) |
86 | | -with open(snapshot_path, "r", encoding="utf-8") as f: |
| 43 | +with open(IN_FILE, "r", encoding="utf-8") as f: |
87 | 44 | snapshot_data = json.load(f) |
88 | 45 |
|
| 46 | +try: |
| 47 | +    # load the existing output file, if any, so previously collected records are kept |
| 48 | +    with open(OUT_FILE, "r", encoding="utf-8") as f: |
| 49 | +        formatted_records = json.load(f) |
| 50 | +except (FileNotFoundError, json.JSONDecodeError): |
| 51 | +    formatted_records = {} |
| 52 | + |
89 | 53 | # process each record |
| 54 | +s = requests.Session() |
90 | | -formatted_records = {} |
91 | | -processing_records = snapshot_data[:] |
| 56 | +processing_records = snapshot_data[START_POSITION : START_POSITION + RECORDS_LIMIT] |
| 57 | +logger.info( |
| 58 | + f"Start processing start: {START_POSITION}, end: {START_POSITION + min(len(processing_records), RECORDS_LIMIT)}" |
| 59 | +) |
92 | 60 | for i, record in enumerate(processing_records): |
93 | 61 | stake_addr = address.stake_public_key_to_address( |
94 | | - key=record["stake_public_key"][2:], |
95 | | - is_stake=True, |
96 | | - network_type="preprod" |
| 62 | +        key=record["stake_public_key"][2:], is_stake=True, network_type=CARDANO_NETWORK |
97 | 63 | ) |
98 | 64 |
|
99 | | - attempt_count = 0 |
100 | | - |
101 | | - while attempt_count < MAX_ATTEMPT: |
102 | | - try: |
103 | | - print(f"Scraping {stake_addr}... ({i + 1}/{len(processing_records)})") |
104 | | - |
105 | | - # extracting - stake/:stake_id |
106 | | - stake_html = get_stake_data_page(stake_addr) |
107 | | - stake_dom = BeautifulSoup(stake_html, "html.parser") |
108 | | - |
109 | | - found_result = stake_dom.select_one("div.container-fluid").get_text(strip=True) |
110 | | - if "404 - address not found" in found_result: |
111 | | - print(" Skipped NOT FOUND") |
112 | | - break |
113 | | - |
114 | | - stake_status = stake_dom.select_one("table.table span.badge").get_text(strip=True) |
115 | | - if stake_status.lower() == "inactive": |
116 | | - print(" Skipped INACTIVE") |
117 | | - break |
118 | | - |
119 | | - ada_amount_txt = stake_dom.select_one("table.table tr:nth-child(5) span[title]:nth-child(2)") |
120 | | - ada_amount = int(Decimal(ada_amount_txt.attrs["title"].replace(",", ""))) |
121 | | - |
122 | | - # extracting - index |
123 | | - index_html = get_index_page() |
124 | | - index_dom = BeautifulSoup(index_html, "html.parser") |
125 | | - |
126 | | - epoch_number_txt = index_dom.select_one("#_epoch_no") |
127 | | - epoch_number = int(epoch_number_txt.attrs["data-value"]) |
128 | | - slot_number = epoch_2_slot(epoch_number) |
129 | | - |
130 | | - # extracting - stake/:stake_id/asset |
131 | | - stake_asset_dom = BeautifulSoup(get_stake_asset_page(stake_addr), "html.parser") |
132 | | - |
133 | | - item_rows = stake_asset_dom.select("div.table-responsive table > thead > tr > td > a") |
134 | | - amount_rows = stake_asset_dom.select("div.table-responsive table > thead > tr > td:nth-child(6) > span") |
135 | | - |
136 | | - native_tokens = [] |
137 | | - for j, (item_row, amount_row) in enumerate(zip(item_rows, amount_rows)): |
138 | | - asset_name = "\n".join(item_row.get_text().split("\n")[2:-2]).strip() |
139 | | - asset_url = item_row.attrs["href"] |
140 | | - amount = int(Decimal(amount_row.attrs["title"].replace(",", ""))) |
141 | | - |
142 | | - print(f" Extracting asset {asset_url}... ({j + 1}/{len(item_rows)})") |
143 | | - |
144 | | - # extracting - asset/:asset_id |
145 | | - asset_html = request(f"https://preprod.cexplorer.io{asset_url}") |
146 | | - asset_dom = BeautifulSoup(asset_html, "html.parser") |
147 | | - |
148 | | - policy_hash = asset_dom.select_one("div.container-fluid > div > div:nth-child(2) > a") |
149 | | - policy_hash = policy_hash.get_text(strip=True) |
150 | | - |
151 | | - native_tokens.append({ |
152 | | - "policy_hash": f"0x{policy_hash}", |
153 | | - "asset_name": asset_name, |
154 | | - "amount": amount |
155 | | - }) |
156 | | - |
157 | | - if stake_addr in formatted_records: |
158 | | - print(" Warning OVERRIDDEN STAKE ADDRESS") |
159 | | - |
160 | | - formatted_records[stake_addr] = { |
161 | | - "ada_amount": ada_amount, |
162 | | - "native_tokens": native_tokens, |
163 | | - "slot_number": slot_number, |
164 | | - } |
165 | | - |
166 | | - break |
167 | | - except Exception as e: |
168 | | - print(f"ERROR: {e}") |
| 65 | + logger.info( |
| 66 | + f"Checking: '{stake_addr}'... ({i + 1}/{min(len(processing_records), RECORDS_LIMIT)})" |
| 67 | + ) |
169 | 68 |
|
170 | | - if attempt_count >= MAX_ATTEMPT: |
171 | | - print(" Skipped MAX ATTEMPT REACHED") |
172 | | - break |
| 69 | + addresses = get_request( |
| 70 | + s, |
| 71 | + f"{BLOCKFROST_URL}/accounts/{stake_addr}/addresses", |
| 72 | + ) |
| 73 | +    if addresses is None: |
| 74 | + continue |
| 75 | + |
| 76 | + ada_amount = 0 |
| 77 | + native_tokens = {} |
| 78 | + for addr in addresses: |
| 79 | + addr = addr["address"] |
| 80 | + addr_info = get_request( |
| 81 | + s, |
| 82 | + f"{BLOCKFROST_URL}/addresses/{addr}", |
| 83 | + ) |
| 84 | + for amount in addr_info["amount"]: |
| 85 | + if amount["unit"] == "lovelace": |
| 86 | + ada_amount += int(amount["quantity"]) |
| 87 | + continue |
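| | +            # native token units are the concatenated policy id and asset name hex, stored with a `0x` prefix |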
| 88 | +            native_tokens[f"0x{amount['unit']}"] = native_tokens.get( |
| 89 | +                f"0x{amount['unit']}", 0 |
| 90 | +            ) + int(amount["quantity"]) |
| 91 | + |
| 92 | + # get slot number |
| 93 | + latest_block = get_request( |
| 94 | + s, |
| 95 | + f"{BLOCKFROST_URL}/blocks/latest", |
| 96 | + ) |
173 | 97 |
|
174 | | - time.sleep(3) |
175 | | - print("Retrying...") |
176 | | - attempt_count += 1 |
| 98 | + slot_number = latest_block["slot"] |
| 99 | + formatted_records[stake_addr] = { |
| 100 | + "ada_amount": ada_amount, |
| 101 | + "native_tokens": native_tokens, |
| 102 | + "slot_number": slot_number, |
| 103 | + } |
177 | 104 |
|
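| | +# Each stake address maps to an object shaped like (illustrative): |
| | +# {"ada_amount": <total lovelace>, "native_tokens": {"0x<unit>": <quantity>, ...}, "slot_number": <latest block slot>} |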
178 | 105 | # write into a file |
179 | | -write_file_path = os.path.join(os.path.dirname(__file__), OUT_FILE) |
180 | | -with open(write_file_path, "w") as f: |
| 106 | +with open(OUT_FILE, "w") as f: |
181 | 107 | json.dump(formatted_records, f, indent=2) |
182 | 108 |
|
183 | | -print("Completed preparing data") |
| 109 | +logger.info( |
| 110 | + f"Completed preparing data, start: {START_POSITION}, end: {START_POSITION + min(len(processing_records), RECORDS_LIMIT)}" |
| 111 | +) |