-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathextract_data.py
More file actions
177 lines (134 loc) · 5.01 KB
/
extract_data.py
File metadata and controls
177 lines (134 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import time
import json
import dagster as dg
import polars as pl
from pydantic import BaseModel, HttpUrl, ValidationError
from typing import Optional
from termcolor import colored
import requests
class Series(BaseModel):
    """Validation schema for one entry of the series listing response."""

    # Series identifier; compared against "swsh" / "sv" when filtering.
    id: str
    # Name of the series as returned by the API.
    name: str
    # Logo URL; may be absent from the payload, in which case it is None.
    logo: HttpUrl | None = None
class Set(BaseModel):
    """Validation schema for one flattened set row built by the set extraction."""

    # Identifier of the series the set was listed under.
    series_id: str
    # Identifier and name of the set itself.
    set_id: str
    set_name: str
    # Card counts: the fields must be supplied, but None is an accepted value.
    official_card_count: int | None
    total_card_count: int | None
    # Optional image references; None when not supplied.
    logo: str | None = None
    symbol: str | None = None
@dg.asset(kinds={"API", "Polars", "Pydantic"})
def extract_series_data() -> pl.DataFrame:
    """Fetch the TCGdex series list, validate it, and return the selected series.

    Every entry of the response is validated against ``Series``; only the
    entries whose id is ``"swsh"`` or ``"sv"`` are kept in the result.

    Returns:
        A DataFrame with one row per kept series, built from the JSON-mode
        dump of each validated model.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValidationError: If any entry does not match ``Series``.
    """
    url: str = "https://api.tcgdex.net/v2/en/series"

    # Bound the wait and fail on HTTP errors here, so an error response is
    # never handed to the validation step below.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Pydantic validation
    try:
        validated: list[Series] = [Series(**item) for item in data]
    except ValidationError as e:
        print(colored(" ✖", "red"), "Pydantic validation failed.")
        print(e)
        raise
    print(
        colored(" ✓", "green"), "Pydantic validation passed for all series entries."
    )

    wanted_ids = {"swsh", "sv"}
    filtered = [s.model_dump(mode="json") for s in validated if s.id in wanted_ids]
    return pl.DataFrame(filtered)
@dg.asset(kinds={"API", "Polars", "Pydantic"})
def extract_set_data() -> pl.DataFrame:
    """Fetch the sets of the ``swsh`` and ``sv`` series and return them as rows.

    Each series endpoint is requested in turn. Every item of its ``sets``
    list is flattened into one record (series id, set id, set name, official
    and total card counts, logo, symbol) and validated against ``Set``.

    Returns:
        A DataFrame with one row per set, built from the JSON-mode dump of
        each validated model.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
        ValidationError: If any flattened record does not match ``Set``.
    """
    url_list = [
        "https://api.tcgdex.net/v2/en/series/swsh",
        "https://api.tcgdex.net/v2/en/series/sv",
    ]

    flat: list[dict] = []
    for url in url_list:
        # Bound the wait and surface HTTP errors before touching the payload.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        series_id = data.get("id")
        # `or` also covers an explicit null, which .get(key, default) does not.
        for s in data.get("sets") or []:
            card_count = s.get("cardCount") or {}
            flat.append(
                {
                    "series_id": series_id,
                    "set_id": s.get("id"),
                    "set_name": s.get("name"),
                    "official_card_count": card_count.get("official"),
                    "total_card_count": card_count.get("total"),
                    "logo": s.get("logo"),
                    "symbol": s.get("symbol"),
                }
            )

    # Pydantic validation
    try:
        validated: list[Set] = [Set(**item) for item in flat]
    except ValidationError as e:
        print(colored(" ✖", "red"), "Pydantic validation failed.")
        print(e)
        raise
    print(
        colored(" ✓", "green"),
        "Pydantic validation passed for all set entries."
    )

    return pl.DataFrame([s.model_dump(mode="json") for s in validated])
@dg.asset(kinds={"API"})
def extract_card_url_from_set() -> list:
    """Build the card-detail URLs for every card listed in sets sv01 and sv02.

    Each set endpoint is requested in turn and the ``id`` of every item in
    its ``cards`` list is turned into a card endpoint URL. A set whose
    request fails is reported on stdout and skipped, so the result may be
    partial.

    Returns:
        A list of card endpoint URL strings, in set order then card order.
    """
    urls = [
        "https://api.tcgdex.net/v2/en/sets/sv01",
        "https://api.tcgdex.net/v2/en/sets/sv02",
    ]

    all_card_urls = []  # Collects the URLs from every set
    for url in urls:
        try:
            # The timeout keeps one unresponsive request from stalling the run.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            data = r.json()["cards"]
        except requests.RequestException as e:
            print(f"Failed to fetch set {url}: {e}")
            continue

        all_card_urls.extend(
            f"https://api.tcgdex.net/v2/en/cards/{card['id']}" for card in data
        )
        time.sleep(0.1)  # Short pause between successive set requests

    return all_card_urls
@dg.asset(deps=[extract_card_url_from_set], kinds={"API"})
def extract_card_info() -> list:
    """Download the JSON payload of every card URL.

    A card whose request fails is reported on stdout and skipped, so the
    result may contain fewer entries than there are URLs.

    Returns:
        A list with the decoded JSON payload of each successfully fetched
        card, in URL order.
    """
    # NOTE(review): this invokes the upstream asset function directly, so the
    # set endpoints appear to be fetched again whenever this asset runs -
    # confirm that is intended rather than receiving the upstream output.
    card_url_list = extract_card_url_from_set()

    cards_list = []
    for url in card_url_list:
        try:
            # The timeout keeps one unresponsive request from stalling the run.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            data = r.json()
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            continue

        cards_list.append(data)
        time.sleep(0.1)  # Short pause between successive card requests

    return cards_list
def _flatten_card(card: dict) -> dict:
    """Flatten one card payload into a single-level dict of column values."""
    # Top-level scalar values are copied as-is; a missing key becomes None.
    scalar_keys = (
        'category', 'hp', 'id', 'illustrator', 'image', 'localId',
        'name', 'rarity', 'regulationMark', 'retreat', 'stage',
    )
    flat = {key: card.get(key) for key in scalar_keys}

    # Nested dicts are flattened with a prefix. `or` also covers an explicit
    # null value, which .get(key, default) would pass through and crash on.
    for key, value in (card.get("legal") or {}).items():
        flat[f"legal_{key}"] = value

    for key, value in (card.get("set") or {}).items():
        if isinstance(value, dict):
            for sub_key, sub_val in value.items():
                flat[f"set_{key}_{sub_key}"] = sub_val
        else:
            flat[f"set_{key}"] = value

    # Types (list of strings) become one comma-separated string.
    flat["types"] = ", ".join(card.get("types") or [])

    # Attacks are kept both as raw JSON and as numbered attack_<n>_* columns.
    attacks = card.get("attacks") or []
    flat["attacks_json"] = json.dumps(attacks, ensure_ascii=False)
    for i, atk in enumerate(attacks, start=1):
        prefix = f"attack_{i}"
        flat[f"{prefix}_name"] = atk.get("name")
        flat[f"{prefix}_damage"] = atk.get("damage")
        flat[f"{prefix}_effect"] = atk.get("effect")
        flat[f"{prefix}_cost"] = ", ".join(atk.get("cost") or [])

    return flat


@dg.asset(deps=[extract_card_info], kinds={"Polars"})
def create_card_dataframe() -> pl.DataFrame:
    """Flatten every fetched card into one row and return the rows as a DataFrame.

    Returns:
        A DataFrame with one row per card: the selected scalar fields,
        ``legal_*`` and ``set_*`` columns from the nested dicts, a
        comma-separated ``types`` string, the raw attacks as ``attacks_json``
        and numbered ``attack_<n>_*`` columns.
    """
    # NOTE(review): this invokes the upstream asset function directly, so the
    # card endpoints appear to be fetched again whenever this asset runs -
    # confirm that is intended rather than receiving the upstream output.
    cards_list = extract_card_info()
    all_flat_cards = [_flatten_card(card) for card in cards_list]

    # Rows do not share one key set (the number of attack_<n>_* columns varies
    # per card). Scanning every row for schema inference, instead of only a
    # leading sample, keeps columns and dtypes that first appear late.
    return pl.DataFrame(all_flat_cards, infer_schema_length=None)