3030GITHUB_API = "https://api.github.com"
3131
3232ROOT = os .path .abspath (os .path .join (os .path .dirname (__file__ ), ".." ))
33- DATA_DIR = os .path .abspath (os .path .join (ROOT , ".." , " iab_mapper" , "data" ))
33+ DATA_DIR = os .path .abspath (os .path .join (ROOT , "iab_mapper" , "data" ))
3434RAW_DIR = os .path .join (DATA_DIR , "raw" )
3535
3636
@@ -106,7 +106,7 @@ def to_bool(val: Any) -> bool:
106106
107107def normalize_2x (df : pd .DataFrame ) -> List [Dict [str , Any ]]:
108108 cols = {c .lower (): c for c in df .columns }
109- id_col = next ((cols [k ] for k in cols if k in {"code" , "id" , "node id" , "taxonomy id" }), None )
109+ id_col = next ((cols [k ] for k in cols if k in {"code" , "id" , "node id" , "taxonomy id" , "unique id" }), None )
110110 label_col = next ((cols [k ] for k in cols if k in {"label" , "name" , "node" , "node name" }), None )
111111 if not id_col or not label_col :
112112 raise ValueError ("Could not infer 2.x id/label columns" )
@@ -122,12 +122,12 @@ def normalize_2x(df: pd.DataFrame) -> List[Dict[str, Any]]:
122122
123123def normalize_3x (df : pd .DataFrame ) -> List [Dict [str , Any ]]:
124124 cols = {c .lower (): c for c in df .columns }
125- id_col = next ((cols [k ] for k in cols if k in {"id" , "node id" , "taxonomy id" }), None )
125+ id_col = next ((cols [k ] for k in cols if k in {"id" , "node id" , "taxonomy id" , "unique id" }), None )
126126 label_col = next ((cols [k ] for k in cols if k in {"label" , "name" , "node" , "node name" }), None )
127127 path_col = next ((cols [k ] for k in cols if k in {"path" , "full path" , "taxonomy path" }), None )
128128 scd_col = next ((cols [k ] for k in cols if k in {"scd" , "sensitive" , "is scd" }), None )
129129
130- tier_cols = [cols [c ] for c in df .columns .str .lower () if c .startswith ("tier" )]
130+ tier_cols = [cols [c ] for c in df .columns .str .lower () if c .startswith ("tier " ) or c . startswith ( "tier " )]
131131
132132 def row_path (r ) -> List [str ]:
133133 if path_col :
@@ -160,11 +160,19 @@ def row_path(r) -> List[str]:
160160
161161
162162def parse_table (text : str , name : str ) -> pd .DataFrame :
163- # Decide delimiter by filename extension
164- if name .lower ().endswith (".tsv" ):
165- return pd .read_csv (io .StringIO (text ), sep = "\t " )
166- if name .lower ().endswith (".csv" ):
167- return pd .read_csv (io .StringIO (text ))
163+ # Decide delimiter by filename extension, and detect header row
164+ lower = name .lower ()
165+ lines = text .splitlines ()
166+ header_idx = 0
167+ for i , line in enumerate (lines [:10 ]):
168+ if ("unique id" in line .lower ()) and ("name" in line .lower ()):
169+ header_idx = i
170+ break
171+ sio = io .StringIO (text )
172+ if lower .endswith (".tsv" ):
173+ return pd .read_csv (sio , sep = "\t " , header = header_idx )
174+ if lower .endswith (".csv" ):
175+ return pd .read_csv (sio , header = header_idx )
168176 # Try to parse JSON as table-like
169177 try :
170178 data = json .loads (text )
0 commit comments