Skip to content

Commit 800bb34

Browse files
committed
Add dataset loading functionality for Litroacp and XACBench
- Introduced DatasetReader to load datasets from specified directories.
- Implemented loading for Litroacp datasets and printed the loaded dataset keys.
- Added loading for XACBench datasets with corresponding output of loaded dataset keys.
1 parent a27ba0c commit 800bb34

File tree

11 files changed

+50
-47885
lines changed

11 files changed

+50
-47885
lines changed

datasets/process.py

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Dataset Processing Module - Handles loading and processing datasets
33
"""
44

5+
from calendar import c
56
from pathlib import Path
67
import os
8+
import re
79
import pandas as pd
810

911

@@ -20,33 +22,37 @@ def read_csv(abs_path: str) -> pd.DataFrame:
2022
"""Read CSV format file"""
2123
return pd.read_csv(abs_path)
2224

23-
24-
def load_litroacp_datasets() -> dict[str, pd.DataFrame]:
    """
    Read every ``.jsonl`` and ``.csv`` file in the sibling ``litroacp`` folder.

    Files are visited in sorted name order; anything with another extension
    is ignored. A file that fails to load is reported on stdout and skipped.

    Returns:
        Dictionary with dataset file names as keys and DataFrames as values
    """
    source_dir = Path(__file__).parent / "litroacp"
    loaded = {}

    for entry in sorted(os.listdir(source_dir)):
        entry_path = source_dir / entry

        try:
            # Pick the reader by extension; both branches share the
            # store-and-report step below.
            if entry.endswith(".jsonl"):
                frame = DatasetReader.read_jsonl(str(entry_path))
            elif entry.endswith(".csv"):
                frame = DatasetReader.read_csv(str(entry_path))
            else:
                continue
            loaded[entry] = frame
            print(f"Loaded {entry} with shape: {frame.shape}")
        except Exception as e:
            print(f"Error loading {entry}: {e}")

    return loaded
50-
51-
52-
# Names exported by a star-import of this module.
__all__ = ["DatasetReader", "load_litroacp_datasets"]
25+
def load_datasets(self, datasets_dir_path: Path) -> dict[str, pd.DataFrame]:
    """
    Load every supported dataset file found directly in a directory.

    Files are visited in sorted name order. ``.jsonl`` files are read with
    ``DatasetReader.read_jsonl``. ``.xml`` files are scanned for
    ``<Policy ...>...</Policy>`` elements, and each element becomes one row
    of a single-column (``policy``) DataFrame. Files with any other
    extension are skipped silently. A file that fails to load is reported
    on stdout and left out of the result instead of aborting the run.

    Args:
        datasets_dir_path: Directory containing the dataset files. A plain
            string path is accepted as well.

    Returns:
        Dictionary with file names as keys and DataFrames as values
    """
    # Accept str as well as Path so the "/" join below always works.
    datasets_dir_path = Path(datasets_dir_path)
    datasets = {}

    # Compiled once, outside the loop. The word boundary stops "<Policy"
    # from also matching "<PolicySet ...>" or "<PolicyIdReference>", which
    # would otherwise glue the enclosing tag onto the first extracted policy.
    policy_pattern = re.compile(r"<Policy\b[\s\S]*?</Policy>")

    for file_name in sorted(os.listdir(datasets_dir_path)):
        file_path = datasets_dir_path / file_name
        try:
            if file_name.endswith(".jsonl"):
                df = DatasetReader.read_jsonl(str(file_path))
                datasets[file_name] = df
                print(f"Loaded {file_name} with shape: {df.shape}")
            elif file_name.endswith(".xml"):
                with open(file_path, "r", encoding="utf-8") as file:
                    xacml_content = file.read()
                policies = policy_pattern.findall(xacml_content)
                datasets[file_name] = pd.DataFrame({"policy": policies})
                print(f"Loaded {file_name} with {len(policies)} policies.")
            # Any other extension is intentionally ignored.
        except Exception as e:
            # Best-effort loading: report the failure and move on to the
            # next file.
            print(f"Error loading {file_name}: {e}")

    return datasets
56+
57+
58+
# Names exported by a star-import of this module.
__all__ = ["DatasetReader"]

datasets/xacbench/__init__.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

datasets/xacbench/converter.py

Lines changed: 0 additions & 90 deletions
This file was deleted.

datasets/xacbench/eval.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)