Skip to content

Commit c03251c

Browse files
committed
feat: updater now fetches latest IAB Content Taxonomy 3.1 and 2.2; fixed header detection and paths; refreshed catalogs
1 parent 937a2b5 commit c03251c

File tree

1 file changed: 17 additions (+), 9 deletions (-)

scripts/update_catalogs.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
GITHUB_API = "https://api.github.com"
3131

3232
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
33-
DATA_DIR = os.path.abspath(os.path.join(ROOT, "..", "iab_mapper", "data"))
33+
DATA_DIR = os.path.abspath(os.path.join(ROOT, "iab_mapper", "data"))
3434
RAW_DIR = os.path.join(DATA_DIR, "raw")
3535

3636

@@ -106,7 +106,7 @@ def to_bool(val: Any) -> bool:
106106

107107
def normalize_2x(df: pd.DataFrame) -> List[Dict[str, Any]]:
108108
cols = {c.lower(): c for c in df.columns}
109-
id_col = next((cols[k] for k in cols if k in {"code", "id", "node id", "taxonomy id"}), None)
109+
id_col = next((cols[k] for k in cols if k in {"code", "id", "node id", "taxonomy id", "unique id"}), None)
110110
label_col = next((cols[k] for k in cols if k in {"label", "name", "node", "node name"}), None)
111111
if not id_col or not label_col:
112112
raise ValueError("Could not infer 2.x id/label columns")
@@ -122,12 +122,12 @@ def normalize_2x(df: pd.DataFrame) -> List[Dict[str, Any]]:
122122

123123
def normalize_3x(df: pd.DataFrame) -> List[Dict[str, Any]]:
124124
cols = {c.lower(): c for c in df.columns}
125-
id_col = next((cols[k] for k in cols if k in {"id", "node id", "taxonomy id"}), None)
125+
id_col = next((cols[k] for k in cols if k in {"id", "node id", "taxonomy id", "unique id"}), None)
126126
label_col = next((cols[k] for k in cols if k in {"label", "name", "node", "node name"}), None)
127127
path_col = next((cols[k] for k in cols if k in {"path", "full path", "taxonomy path"}), None)
128128
scd_col = next((cols[k] for k in cols if k in {"scd", "sensitive", "is scd"}), None)
129129

130-
tier_cols = [cols[c] for c in df.columns.str.lower() if c.startswith("tier")]
130+
tier_cols = [cols[c] for c in df.columns.str.lower() if c.startswith("tier ") or c.startswith("tier")]
131131

132132
def row_path(r) -> List[str]:
133133
if path_col:
@@ -160,11 +160,19 @@ def row_path(r) -> List[str]:
160160

161161

162162
def parse_table(text: str, name: str) -> pd.DataFrame:
163-
# Decide delimiter by filename extension
164-
if name.lower().endswith(".tsv"):
165-
return pd.read_csv(io.StringIO(text), sep="\t")
166-
if name.lower().endswith(".csv"):
167-
return pd.read_csv(io.StringIO(text))
163+
# Decide delimiter by filename extension, and detect header row
164+
lower = name.lower()
165+
lines = text.splitlines()
166+
header_idx = 0
167+
for i, line in enumerate(lines[:10]):
168+
if ("unique id" in line.lower()) and ("name" in line.lower()):
169+
header_idx = i
170+
break
171+
sio = io.StringIO(text)
172+
if lower.endswith(".tsv"):
173+
return pd.read_csv(sio, sep="\t", header=header_idx)
174+
if lower.endswith(".csv"):
175+
return pd.read_csv(sio, header=header_idx)
168176
# Try to parse JSON as table-like
169177
try:
170178
data = json.loads(text)

Comments (0) — this commit has no comments.