#!/usr/bin/env python3
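"""Recursively find data.yml files under a directory and replay the datasets
they reference to Splunk via the HTTP Event Collector (HEC).

See the --help output of this script for required environment variables and
command-line options."""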

import argparse
import glob
import os
import sys
import urllib.parse
import uuid
from pathlib import Path

import requests
import yaml
from urllib3 import disable_warnings


def load_environment_variables():
    """Load required environment variables for Splunk connection."""
    required_vars = ['SPLUNK_HOST', 'SPLUNK_HEC_TOKEN']
    env_vars = {}
    for var in required_vars:
        value = os.environ.get(var)
        if not value:
            raise ValueError(f"Environment variable {var} is required but not set")
        env_vars[var.lower().replace('splunk_', '')] = value
    return env_vars


def find_data_yml_files(folder_path):
    """Find all data.yml files recursively in folder and subfolders."""
    data_yml_files = []
    folder_path = Path(folder_path)

    # Use pathlib to recursively find all data.yml files
    for yml_file in folder_path.rglob("data.yml"):
        data_yml_files.append(str(yml_file))

    if not data_yml_files:
        print(f"Warning: No data.yml files found in {folder_path}")
    else:
        print(f"Found {len(data_yml_files)} data.yml files")

    return data_yml_files


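# For reference, a data.yml file is expected to look roughly like the sketch
# below. The field names (id, datasets, name, path, source, sourcetype) are the
# ones consumed by parse_data_yml() and main(); the values are illustrative only.
#
#   id: 9b1deb4d-3b7d-4bad-9bdd-2b0d7b3dcb6d
#   datasets:
#     - name: example_dataset
#       path: /datasets/attack_techniques/T1003.001/example_dataset.log
#       source: example_source
#       sourcetype: example_sourcetype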
def parse_data_yml(yml_file_path):
    """Parse a data.yml file and extract dataset information."""
    try:
        with open(yml_file_path, 'r') as file:
            data = yaml.safe_load(file)

        # Extract the fields we need; fall back to a random UUID if no id is set
        file_id = data.get('id', str(uuid.uuid4()))
        datasets = data.get('datasets', [])

        # Return tuple of (id, datasets_list)
        return file_id, datasets

    except Exception as e:
        print(f"Error parsing {yml_file_path}: {e}")
        return None, []


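# Note: this helper is not currently called by main(); it scans a single folder
# (non-recursively) for raw data files.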
def find_data_files(folder_path):
    """Find all data files in the specified folder (supports .log, .json, .txt)."""
    files = []
    for ext in ("*.log", "*.json", "*.txt"):
        files.extend(glob.glob(os.path.join(folder_path, ext)))
    if not files:
        print(f"Warning: No data files found in {folder_path}")
    return files


def send_data_to_splunk(file_path, splunk_host, hec_token, event_host_uuid,
                         index="test", source="test", sourcetype="test"):
    """Send a data file to Splunk HEC."""
    disable_warnings()
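    # Each call generates a fresh uuid4 and sends it as the
    # X-Splunk-Request-Channel header below, so every replayed file is
    # submitted on its own HEC channel.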
    hec_channel = str(uuid.uuid4())
    headers = {
        "Authorization": f"Splunk {hec_token}",
        "X-Splunk-Request-Channel": hec_channel,
    }
    url_params = {
        "index": index,
        "source": source,
        "sourcetype": sourcetype,
        "host": event_host_uuid,
    }
    url = urllib.parse.urljoin(
        f"https://{splunk_host}:8088",
        "services/collector/raw"
    )
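    # Events are POSTed to the raw HEC endpoint, e.g.
    #   https://<SPLUNK_HOST>:8088/services/collector/raw?index=...&sourcetype=...
    # with index/source/sourcetype/host passed as query parameters.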
    with open(file_path, "rb") as datafile:
        try:
            res = requests.post(
                url,
                params=url_params,
                data=datafile.read(),
                allow_redirects=True,
                headers=headers,
                verify=False,  # self-signed certs are common on test Splunk instances
            )
            res.raise_for_status()
            print(f"✅ Sent {file_path} to Splunk HEC")
        except Exception as e:
            print(f"❌ Error sending {file_path} to Splunk HEC: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="Recursively find and replay datasets from data.yml files "
        "to Splunk via HTTP Event Collector (HEC)",
        epilog="""
Environment Variables Required:
  SPLUNK_HOST - Splunk server hostname/IP
  SPLUNK_HEC_TOKEN - Splunk HEC token

Example usage:
  export SPLUNK_HOST="192.168.1.100"
  export SPLUNK_HEC_TOKEN="your-hec-token"
  python replay_all.py /path/to/datasets/folder
  python replay_all.py datasets/attack_techniques --host-uuid 12345678-abcd-efgh

This script will:
1. Recursively find all data.yml files in the specified directory
2. Parse each data.yml file to extract dataset information
3. Replay each dataset using the source and sourcetype from the yml file
4. Use the id field from data.yml as the host field for Splunk events
        """,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        'path',
        help='Path to a directory containing data.yml files '
             '(searches recursively)'
    )
    parser.add_argument(
        '--source',
        default='test',
        help='Source field for Splunk events (default: test)'
    )
    parser.add_argument(
        '--sourcetype',
        default='test',
        help='Sourcetype field for Splunk events (default: test)'
    )
    parser.add_argument(
        '--index',
        default='test',
        help='Splunk index to send events to (default: test)'
    )
    parser.add_argument(
        '--host-uuid',
        help='UUID to use as the host field for Splunk events '
             '(generates random UUID if not provided)'
    )
    args = parser.parse_args()

    try:
        env_vars = load_environment_variables()
        splunk_host = env_vars['host']
        hec_token = env_vars['hec_token']

        if not os.path.isdir(args.path):
            print(f"Error: {args.path} is not a valid directory")
            sys.exit(1)

        # Find all data.yml files recursively
        data_yml_files = find_data_yml_files(args.path)

        if not data_yml_files:
            print(f"No data.yml files found in {args.path}")
            sys.exit(1)

        # Process each data.yml file
        for yml_file in data_yml_files:
            print(f"\nProcessing {yml_file}...")
            file_id, datasets = parse_data_yml(yml_file)

            if not file_id or not datasets:
                print(f"Skipping {yml_file} - no valid data found")
                continue

            # Use the id from data.yml as host field (unless user provided one)
            event_host_uuid = args.host_uuid or file_id
            print(f"Using host UUID: {event_host_uuid}")

            # Process each dataset in the data.yml file
            for dataset in datasets:
                dataset_name = dataset.get('name', 'unknown')
                dataset_path = dataset.get('path', '')
                dataset_source = dataset.get('source', args.source)
                dataset_sourcetype = dataset.get('sourcetype', args.sourcetype)

                if not dataset_path:
                    print(f"Warning: No path specified for dataset "
                          f"'{dataset_name}', skipping")
                    continue

                # Handle relative paths - relative to attack_data root
                if dataset_path.startswith('/datasets/'):
                    # Convert to absolute path based on project structure
                    if Path(args.path).name == 'datasets':
                        base_dir = Path(args.path).parent
                    else:
                        base_dir = Path(args.path)
                        while (base_dir.name != 'attack_data' and
                               base_dir.parent != base_dir):
                            base_dir = base_dir.parent

                    if base_dir.name == 'attack_data':
                        full_path = base_dir / dataset_path.lstrip('/')
                    else:
                        # Fallback: assume current working directory structure
                        full_path = Path.cwd() / dataset_path.lstrip('/')
                else:
                    # Assume relative to yml file location
                    yml_dir = Path(yml_file).parent
                    full_path = yml_dir / dataset_path

                if not full_path.exists():
                    print(f"Warning: Dataset file not found: {full_path}")
                    continue

                print(f"  Sending dataset '{dataset_name}' from {full_path}")
                print(f"    source: {dataset_source}")
                print(f"    sourcetype: {dataset_sourcetype}")

                send_data_to_splunk(
                    file_path=str(full_path),
                    splunk_host=splunk_host,
                    hec_token=hec_token,
                    event_host_uuid=event_host_uuid,
                    index=args.index,
                    source=dataset_source,
                    sourcetype=dataset_sourcetype,
                )

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()