22Dataset Processing Module - Handles loading and processing datasets
33"""
44
5+ from calendar import c
56from pathlib import Path
67import os
8+ import re
79import pandas as pd
810
911
@@ -20,33 +22,37 @@ def read_csv(abs_path: str) -> pd.DataFrame:
2022 """Read CSV format file"""
2123 return pd .read_csv (abs_path )
2224
23-
def load_litroacp_datasets() -> dict[str, pd.DataFrame]:
    """
    Read every ``.jsonl`` and ``.csv`` file in the "litroacp" folder that
    sits next to this module.

    Files are visited in sorted name order; files with any other extension
    are ignored. A file that fails to load is reported on stdout and does
    not stop the remaining files from being processed.

    Returns:
        Mapping of file name to the DataFrame loaded from that file
    """
    base_dir = Path(__file__).parent / "litroacp"
    loaded = {}

    for entry in sorted(os.listdir(base_dir)):
        target = str(base_dir / entry)

        try:
            # Pick the reader from the file extension; anything else is skipped.
            if entry.endswith(".jsonl"):
                frame = DatasetReader.read_jsonl(target)
            elif entry.endswith(".csv"):
                frame = DatasetReader.read_csv(target)
            else:
                continue
            loaded[entry] = frame
            print(f"Loaded {entry} with shape: {frame.shape} ")
        except Exception as err:
            # Best-effort loading: report the failure and move on.
            print(f"Error loading {entry} : {err} ")

    return loaded
50-
51-
52- __all__ = ["DatasetReader" , "load_litroacp_datasets" ]
def load_datasets(self, datasets_dir_path: Path) -> dict[str, pd.DataFrame]:
    """
    Load every supported dataset file found directly in a directory.

    Files are visited in sorted name order. ``.jsonl`` files are read with
    ``DatasetReader.read_jsonl``. ``.xml`` files are scanned for
    ``<Policy ...>...</Policy>`` elements, and each match becomes one row of
    a DataFrame with a single ``"policy"`` column. Files with any other
    extension are skipped silently. A file that fails to load is reported on
    stdout and left out of the result; it does not abort the remaining files.

    Args:
        datasets_dir_path: Directory to scan (a ``Path`` or a plain string).

    Returns:
        Dictionary with file names as keys and DataFrames as values
    """
    # os.listdir accepts plain strings, but the "/" join below needs a Path.
    datasets_dir = Path(datasets_dir_path)
    # Compiled once for all files. The \b stops the opening tag from matching
    # longer element names such as <PolicySet> or <PolicyIssuer>.
    policy_pattern = re.compile(r"<Policy\b[\s\S]*?<\/Policy>")
    datasets = {}

    for file_name in sorted(os.listdir(datasets_dir)):
        file_path = datasets_dir / file_name
        try:
            if file_name.endswith(".jsonl"):
                df = DatasetReader.read_jsonl(str(file_path))
                datasets[file_name] = df
                print(f"Loaded {file_name} with shape: {df.shape} ")
            elif file_name.endswith(".xml"):
                with open(file_path, "r", encoding="utf-8") as file:
                    xacml_content = file.read()
                policies = policy_pattern.findall(xacml_content)
                datasets[file_name] = pd.DataFrame({"policy": policies})
                print(f"Loaded {file_name} with {len(policies)} policies.")
        except Exception as e:
            # Best-effort loading: report the failure and keep going.
            print(f"Error loading {file_name} : {e} ")

    return datasets
56+
57+
58+ __all__ = ["DatasetReader" ]
0 commit comments