Skip to content

Commit 779af00

Browse files
author
Patrick Bareiss
committed
Better dataset format for data yml files
1 parent 269c983 commit 779af00

File tree

407 files changed

+407
-358
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

407 files changed

+407
-358
lines changed

bin/dataset_schema.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:97085370d23378475c243e900bfeb0b462b849ff3e2b4f38fec5547177c91a3b
3+
size 2274

bin/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ splunk-sdk
55
gitpython
66
tqdm
77
colorama
8-
pyyaml
8+
jsonschema

bin/validate.py

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
#!/usr/bin/env python3
2+
"""
3+
YAML Dataset Validation Script
4+
5+
This script validates YAML files in the specified directory against a
6+
predefined JSON schema.
7+
All dataset YAML files must conform to the specified structure with mandatory fields.
8+
"""
9+
10+
import argparse
11+
import json
12+
import sys
13+
import uuid
14+
from datetime import datetime
15+
from pathlib import Path
16+
from typing import Dict, List, Any
17+
18+
import yaml
19+
from jsonschema import validate, ValidationError, draft7_format_checker
20+
21+
22+
def load_yaml_schema() -> Dict[str, Any]:
    """
    Load and return the JSON schema for validating YAML dataset files.

    The schema is read from ``dataset_schema.json`` located in the same
    directory as this script.

    Returns:
        Dict containing the JSON schema definition

    Raises:
        FileNotFoundError: If schema file doesn't exist
        json.JSONDecodeError: If schema file is invalid JSON
    """
    # Get the schema file path relative to the script location
    schema_path = Path(__file__).parent / 'dataset_schema.json'

    try:
        with open(schema_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Schema file not found: {schema_path}") from e
    except json.JSONDecodeError as e:
        # JSONDecodeError requires (msg, doc, pos). Building it from a message
        # alone raises TypeError, which would mask the real problem and bypass
        # callers that catch json.JSONDecodeError.
        raise json.JSONDecodeError(
            f"Invalid JSON in schema file {schema_path}: {e.msg}",
            e.doc,
            e.pos,
        ) from e
44+
45+
46+
def validate_uuid(uuid_string: str) -> bool:
    """
    Validate that a string is a properly formatted UUID.

    Args:
        uuid_string: String to validate as UUID

    Returns:
        True if valid UUID, False otherwise (including for non-string input)
    """
    # Values loaded from YAML are not guaranteed to be strings. uuid.UUID
    # raises AttributeError/TypeError (not ValueError) for ints or None, so
    # reject non-strings up front instead of letting those escape.
    if not isinstance(uuid_string, str):
        return False

    try:
        uuid.UUID(uuid_string)
    except ValueError:
        return False
    return True
61+
62+
63+
def validate_date(date_string: str) -> bool:
    """
    Validate that a string is a properly formatted date (YYYY-MM-DD).

    Args:
        date_string: String to validate as date

    Returns:
        True if valid date, False otherwise (including for non-string input)
    """
    # strptime raises TypeError (not ValueError) for non-string input, e.g. a
    # date object or an integer coming out of a YAML document, so reject
    # anything that is not a string before parsing.
    if not isinstance(date_string, str):
        return False

    try:
        datetime.strptime(date_string, '%Y-%m-%d')
    except ValueError:
        return False
    return True
78+
79+
80+
def load_yaml_file(file_path: Path) -> Dict[str, Any]:
    """
    Load and parse a YAML file.

    Args:
        file_path: Path to the YAML file

    Returns:
        Parsed YAML content. This is a dictionary for a document whose root
        is a mapping; NOTE(review): yaml.safe_load returns whatever the
        document root is (presumably None for an empty file), so callers
        should not assume a dict without checking.

    Raises:
        yaml.YAMLError: If YAML parsing fails
        FileNotFoundError: If file doesn't exist
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return yaml.safe_load(file)
    except FileNotFoundError as e:
        # Chain the original exception so the underlying OS error is kept.
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except yaml.YAMLError as e:
        # Chain the parser error so its line/column context stays available.
        raise yaml.YAMLError(f"YAML parsing error in {file_path}: {e}") from e
101+
102+
103+
def validate_yaml_file(file_path: Path, schema: Dict[str, Any]) -> List[str]:
    """
    Validate a single YAML file against the schema.

    The file is loaded, checked against the JSON schema, and then the 'id'
    and 'date' fields (when present) are checked with validate_uuid and
    validate_date. Every failure is reported as a message instead of being
    raised, so one bad file does not stop a batch run.

    Args:
        file_path: Path to the YAML file to validate
        schema: JSON schema to validate against

    Returns:
        List of validation errors (empty if valid)
    """
    errors = []

    try:
        # Load YAML content
        yaml_content = load_yaml_file(file_path)

        # Validate against JSON schema (raises on the first violation)
        validate(yaml_content, schema, format_checker=draft7_format_checker)

        # The custom checks below look fields up by key, which only makes
        # sense for a mapping. Report any other root type explicitly rather
        # than failing with an opaque TypeError or silently passing.
        if not isinstance(yaml_content, dict):
            errors.append(
                f"Top-level YAML content must be a mapping, "
                f"got {type(yaml_content).__name__}"
            )
            return errors

        # Additional custom validations
        if 'id' in yaml_content and not validate_uuid(yaml_content['id']):
            errors.append(f"Invalid UUID format for 'id': {yaml_content['id']}")

        if 'date' in yaml_content and not validate_date(yaml_content['date']):
            errors.append(
                f"Invalid date format for 'date': {yaml_content['date']} "
                f"(expected YYYY-MM-DD)"
            )

    except ValidationError as e:
        errors.append(f"Schema validation error: {e.message}")
        if e.absolute_path:
            # Show where in the document the offending value lives.
            errors.append(f" Path: {' -> '.join(str(p) for p in e.absolute_path)}")
    except yaml.YAMLError as e:
        errors.append(f"YAML parsing error: {e}")
    except FileNotFoundError as e:
        errors.append(f"File error: {e}")
    except Exception as e:
        # Last-resort boundary: record the problem so the remaining files in
        # the batch are still validated.
        errors.append(f"Unexpected error: {e}")

    return errors
145+
146+
147+
def _is_excluded_yaml(name: str) -> bool:
    """Return True for template files and files whose name is marked 'old'."""
    if name.startswith('TEMPLATE'):
        return True
    # Match 'old' only as a separate name token (e.g. 'powerview_old.yml',
    # 'old-data.yaml'). A plain substring test would also skip unrelated
    # files such as 'golden_ticket.yml' or 'threshold.yml' without notice.
    tokens = name.lower().replace('-', '_').replace('.', '_').split('_')
    return 'old' in tokens


def find_yaml_files(input_dir: Path) -> List[Path]:
    """
    Find all YAML files in the specified directory.

    The search is recursive and covers both '.yml' and '.yaml' extensions.
    Files whose name starts with 'TEMPLATE' and files whose name contains
    'old' as a separate token (delimited by '_', '-' or '.') are skipped.

    Args:
        input_dir: Path to the directory to search for YAML files

    Returns:
        Sorted list of paths to YAML files
    """
    yaml_files = []

    # Look for .yml and .yaml files recursively
    for pattern in ('**/*.yml', '**/*.yaml'):
        yaml_files.extend(input_dir.glob(pattern))

    # Keep regular files only (a directory may also match the glob) and drop
    # templates and files marked as old.
    return sorted(
        f for f in yaml_files
        if f.is_file() and not _is_excluded_yaml(f.name)
    )
170+
171+
172+
def parse_arguments(argv=None):
    """
    Parse command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the arguments from sys.argv, which is
            the behavior existing callers rely on. Passing a list allows the
            parser to be driven programmatically.

    Returns:
        argparse.Namespace: Parsed arguments, with 'input_folder' holding the
        directory to scan (default: 'datasets')
    """
    parser = argparse.ArgumentParser(
        description="Validate YAML files against a predefined JSON schema.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                    # Validate files in the default 'datasets' directory
  %(prog)s /path/to/data      # Validate files in a specific directory
  %(prog)s ../other_datasets  # Validate files in a relative path
        """
    )

    # The positional is optional (nargs='?') so the script can run bare.
    parser.add_argument(
        'input_folder',
        nargs='?',
        default='datasets',
        help='Directory to search for YAML files (default: datasets)'
    )

    return parser.parse_args(argv)
198+
199+
200+
def main():
    """
    Validate every YAML file under the requested directory and print a report.

    Exits with status 1 when the input directory is missing or is not a
    directory, when the schema cannot be loaded, or when at least one file
    fails validation. Returns normally when no YAML files are found or when
    all files pass.
    """
    args = parse_arguments()

    # Relative input folders are resolved against the project root (the
    # parent of this script's directory); absolute paths are used as given.
    project_root = Path(__file__).parent.parent
    requested = Path(args.input_folder)
    input_dir = requested if requested.is_absolute() else project_root / args.input_folder

    if not input_dir.exists():
        print(f"Error: Input directory not found: {input_dir}")
        sys.exit(1)

    if not input_dir.is_dir():
        print(f"Error: Input path is not a directory: {input_dir}")
        sys.exit(1)

    print(f"Validating YAML files in: {input_dir}")

    # Load the JSON schema once and reuse it for every file.
    try:
        schema = load_yaml_schema()
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading schema: {e}")
        sys.exit(1)

    yaml_files = find_yaml_files(input_dir)
    if not yaml_files:
        print(f"No YAML files found in the input directory: {input_dir}")
        return

    total_files = len(yaml_files)
    print(f"Found {total_files} YAML files to validate...")
    print("-" * 60)

    # (display path, error list) for each file that failed validation.
    failed_validations = []

    for yaml_file in yaml_files:
        # Prefer a path relative to the project root for display; fall back
        # to the input directory when the file lives outside the project.
        try:
            relative_path = yaml_file.relative_to(project_root)
        except ValueError:
            relative_path = yaml_file.relative_to(input_dir)

        print(f"\nValidating: {relative_path}")

        errors = validate_yaml_file(yaml_file, schema)
        if not errors:
            print("✅ VALID")
            continue

        print(f"❌ INVALID - {len(errors)} error(s):")
        for error in errors:
            print(f" • {error}")
        failed_validations.append((relative_path, errors))

    invalid_files = len(failed_validations)
    valid_files = total_files - invalid_files

    # Print summary
    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Total files processed: {total_files}")
    print(f"Valid files: {valid_files}")
    print(f"Invalid files: {invalid_files}")

    if invalid_files == 0:
        print("\n✅ All files passed validation!")
        return

    print(f"\n{invalid_files} file(s) failed validation!")

    # Repeat the failures at the end so they are easy to find in long output.
    print("\n" + "=" * 60)
    print("FAILED VALIDATIONS")
    print("=" * 60)
    for file_path, errors in failed_validations:
        print(f"\n📁 {file_path}")
        print("-" * 40)
        for i, error in enumerate(errors, 1):
            print(f"{i}. {error}")

    sys.exit(1)
296+
297+
298+
# Run the validator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()

datasets/attack_techniques/T1003.002/hivenightmare/atomic_red_team.yml

Lines changed: 0 additions & 14 deletions
This file was deleted.

datasets/attack_techniques/T1018/constrained/powerview.yml renamed to datasets/attack_techniques/T1018/constrained/powerview_old.yml

File renamed without changes.

datasets/attack_techniques/T1018/unconstrained/powerview.yml renamed to datasets/attack_techniques/T1018/unconstrained/powerview_old.yml

File renamed without changes.

datasets/attack_techniques/T1018/unconstrained2/getadcomputer.yml renamed to datasets/attack_techniques/T1018/unconstrained2/getadcomputer_old.yml

File renamed without changes.

datasets/attack_techniques/T1021.002/atomic_red_team/atomic_red_team.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ datasets:
2828
source: XmlWinEventLog:Security
2929
- name: firewall-powershell
3030
path: /datasets/attack_techniques/T1021.002/atomic_red_team/firewall-powershell.log
31-
sourcetype: firewall
31+
sourcetype: XmlWinEventLog
32+
source: XmlWinEventLog:Microsoft-Windows-PowerShell/Operational
3233
- name: 4688_wmiexec_windows-security
3334
path: /datasets/attack_techniques/T1021.002/atomic_red_team/4688_wmiexec_windows-security.log
3435
sourcetype: XmlWinEventLog

datasets/attack_techniques/T1021.004/atomic_red_team/atomic_red_team.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@ id: cc9b260c-efc9-11eb-916b-150bf0941fbb
33
date: '2022-07-24'
44
description: 'Simulated lateral movement with SSH'
55
environment: attack_range
6-
dataset:
7-
- https://media.githubusercontent.com/media/splunk/attack_data/master/datasets/attack_techniques/T1021.004/atomic_red_team/linux-sysmon.log
8-
sourcetypes:
9-
- sysmon_linux
10-
references:
11-
- https://attack.mitre.org/techniques/T1021/004/
6+
directory: atomic_red_team
7+
mitre_technique:
8+
- T1021.004
9+
datasets:
10+
- name: linux-sysmon
11+
path: /datasets/attack_techniques/T1021.004/atomic_red_team/linux-sysmon.log
12+
sourcetype: sysmon:linux
13+
source: Syslog:Linux-Sysmon/Operational

datasets/attack_techniques/T1021.006/compmgtm_access/compmgtm_access.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@ id: 2eba8f04-033e-11f0-bf1c-629be3538069
33
date: '2025-03-17'
44
description: Generated datasets for compmgtm access in attack range.
55
environment: attack_range
6-
dataset:
7-
- https://media.githubusercontent.com/media/splunk/attack_data/master/datasets/attack_techniques/T1021.006/compmgtm_access/compmgmt_load.log
8-
sourcetypes:
9-
- 'XmlWinEventLog:Microsoft-Windows-Sysmon/Operational'
10-
references:
11-
- https://www.cisa.gov/news-events/cybersecurity-advisories/aa25-071a
6+
directory: compmgtm_access
7+
mitre_technique:
8+
- T1021.006
9+
datasets:
10+
- name: compmgmt_load
11+
path: /datasets/attack_techniques/T1021.006/compmgtm_access/compmgmt_load.log
12+
sourcetype: XmlWinEventLog
13+
source: XmlWinEventLog:Microsoft-Windows-Sysmon/Operational

0 commit comments

Comments
 (0)