Skip to content

Commit 4735013

Browse files
author
Patrick Bareiss
committed
replay improvements
1 parent 8907e32 commit 4735013

File tree

3 files changed

+421
-50
lines changed

3 files changed

+421
-50
lines changed
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Replays dataset YAML files to Splunk when files under datasets/ change on
# main/master (push or pull request), or on demand for a manually supplied
# file or directory.
#
# Security note: values from the `github` context and step outputs are passed
# to the shell scripts through `env:` instead of being interpolated with
# `${{ }}` inside `run:`. Interpolation pastes the raw text into the script
# before bash parses it, so a crafted input could inject shell commands.
name: Replay Changed Datasets to Splunk

on:
  push:
    branches: [ main, master ]
    paths:
      - 'datasets/**'
  pull_request:
    branches: [ main, master ]
    paths:
      - 'datasets/**'
  workflow_dispatch:
    inputs:
      dataset_path:
        description: 'Specific dataset path to replay (optional, defaults to all changed files)'
        required: false
        type: string

jobs:
  replay-datasets:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Fetch full history for file change detection

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          cd bin
          pip install -r requirements.txt

      - name: Find changed YAML files
        id: changed-files
        if: github.event_name != 'workflow_dispatch' || github.event.inputs.dataset_path == ''
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PUSH_BASE_SHA: ${{ github.event.before }}
          PUSH_HEAD_SHA: ${{ github.sha }}
        run: |
          # Get list of changed YAML files in datasets directory
          if [ "$EVENT_NAME" = "pull_request" ]; then
            # For PR, compare against base branch
            BASE_SHA="$PR_BASE_SHA"
            HEAD_SHA="$PR_HEAD_SHA"
            echo "Comparing PR: $BASE_SHA...$HEAD_SHA"
          else
            # For push, compare against previous commit
            BASE_SHA="$PUSH_BASE_SHA"
            HEAD_SHA="$PUSH_HEAD_SHA"
            echo "Comparing push: $BASE_SHA...$HEAD_SHA"
          fi

          # Keep only lines that are YAML paths, so that any diagnostic text
          # the helper writes to stdout is never mistaken for a file name.
          # stderr is left attached to the job log to aid debugging.
          YAML_FILES=$(python bin/find_changed_datasets.py --base-sha "$BASE_SHA" --head-sha "$HEAD_SHA" --output files | grep -E '\.ya?ml$' || true)

          if [ -z "$YAML_FILES" ]; then
            echo "No YAML dataset files changed"
            echo "yaml_files=" >> "$GITHUB_OUTPUT"
          else
            echo "Changed YAML files:"
            echo "$YAML_FILES"
            # Convert newlines to spaces for easier handling
            YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ')
            echo "yaml_files=$YAML_FILES_SPACE" >> "$GITHUB_OUTPUT"
          fi

      - name: Set manual dataset path
        id: manual-path
        if: github.event_name == 'workflow_dispatch' && github.event.inputs.dataset_path != ''
        env:
          DATASET_PATH: ${{ github.event.inputs.dataset_path }}
        run: |
          # For manual dispatch, find YAML files in the specified path
          if [ -f "$DATASET_PATH" ]; then
            # Single file provided
            echo "yaml_files=$DATASET_PATH" >> "$GITHUB_OUTPUT"
          else
            # Directory provided - find YAML files (YAML paths only, see above)
            YAML_FILES=$(python bin/find_changed_datasets.py --directory "$DATASET_PATH" --output files | grep -E '\.ya?ml$' || true)
            if [ -n "$YAML_FILES" ]; then
              YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ')
              echo "yaml_files=$YAML_FILES_SPACE" >> "$GITHUB_OUTPUT"
            else
              echo "yaml_files=" >> "$GITHUB_OUTPUT"
            fi
          fi

      - name: Replay datasets to Splunk
        if: steps.changed-files.outputs.yaml_files != '' || steps.manual-path.outputs.yaml_files != ''
        env:
          SPLUNK_HOST: ${{ secrets.SPLUNK_HOST }}
          SPLUNK_HEC_TOKEN: ${{ secrets.SPLUNK_HEC_TOKEN }}
          # Prefer the manually selected files; otherwise fall back to the
          # changed-file detection (a skipped step yields an empty output).
          YAML_FILES: ${{ steps.manual-path.outputs.yaml_files || steps.changed-files.outputs.yaml_files }}
        run: |
          if [ -z "$YAML_FILES" ]; then
            echo "No YAML files to process"
            exit 0
          fi

          echo "Processing YAML files: $YAML_FILES"

          # Run replay script with all YAML files
          # The replay script now reads all metadata from the YAML files themselves
          # $YAML_FILES is intentionally unquoted: it is a space-separated list.
          # Replay stays best-effort, but a failure is surfaced as a warning
          # annotation instead of a plain log line.
          python bin/replay.py $YAML_FILES || echo "::warning::Failed to replay some datasets"

      - name: Summary
        if: always()
        env:
          EVENT_NAME: ${{ github.event_name }}
          DATASET_PATH: ${{ github.event.inputs.dataset_path }}
          MANUAL_YAML_FILES: ${{ steps.manual-path.outputs.yaml_files }}
          CHANGED_YAML_FILES: ${{ steps.changed-files.outputs.yaml_files }}
        run: |
          # Only report on the manual path when one was actually supplied
          if [ "$EVENT_NAME" = "workflow_dispatch" ] && [ -n "$DATASET_PATH" ]; then
            if [ -n "$MANUAL_YAML_FILES" ]; then
              echo "Manual replay completed for YAML files: $MANUAL_YAML_FILES"
            else
              echo "No YAML files found in specified path: $DATASET_PATH"
            fi
          else
            if [ -n "$CHANGED_YAML_FILES" ]; then
              echo "Automated replay completed for changed YAML files: $CHANGED_YAML_FILES"
            else
              echo "No YAML dataset changes detected, no replay needed"
            fi
          fi

bin/find_changed_datasets.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Script to identify changed or added YAML dataset files for replay.
4+
This script simplifies the bash logic from the GitHub Actions workflow.
5+
"""
6+
7+
import sys
8+
import argparse
9+
import subprocess
10+
from pathlib import Path
11+
12+
13+
def run_git_command(cmd):
    """Run a git command and return its stripped standard output.

    Args:
        cmd: The command to run, either as a single string (split using
            shell-style quoting rules) or as a sequence of arguments.

    Returns:
        The command's stdout with surrounding whitespace removed, or an
        empty string if the command is empty, could not be started, or
        exited with a non-zero status.
    """
    import shlex

    # Split string commands ourselves and run without a shell, so that
    # metacharacters inside an argument (e.g. a caller-supplied ref) are
    # never interpreted by /bin/sh.
    argv = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
    if not argv:
        return ""

    try:
        result = subprocess.run(
            argv, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError as e:
        # Diagnostics go to stderr: callers capture stdout as the result.
        print(f"Git command failed: {cmd}", file=sys.stderr)
        print(f"Error: {e.stderr}", file=sys.stderr)
        return ""
    except OSError as e:
        # Raised when the executable is missing or cannot be started.
        print(f"Git command failed: {cmd}", file=sys.stderr)
        print(f"Error: {e}", file=sys.stderr)
        return ""

    return result.stdout.strip()
24+
25+
26+
def find_changed_files(base_sha, head_sha):
    """Find files under ``datasets/`` that changed between two commits.

    The comparison uses git's three-dot range (``base...head``), i.e. the
    changes on ``head`` since its merge base with ``base``.

    Args:
        base_sha: Commit (or ref) to compare from.
        head_sha: Commit (or ref) to compare to.

    Returns:
        A list of changed paths starting with ``datasets/``. The list is
        empty when either commit is missing, the git command fails, or
        nothing under ``datasets/`` changed.
    """
    import shlex

    if not base_sha or not head_sha:
        # stderr, so the message cannot be mistaken for a result path by
        # callers that capture stdout.
        print("Error: Both base and head SHA are required", file=sys.stderr)
        return []

    # Quote the range so that shell metacharacters in a ref stay inert.
    revision_range = shlex.quote(f"{base_sha}...{head_sha}")
    output = run_git_command(f"git diff --name-only {revision_range}")

    stripped_lines = (line.strip() for line in output.splitlines())
    return [path for path in stripped_lines if path.startswith('datasets/')]
45+
46+
47+
def find_yaml_files_in_directories(changed_files):
    """Map changed file paths to the nearest ancestor directory holding YAML.

    For each path, the search starts at the file's parent directory and moves
    upward, stopping at the first directory that directly contains a ``*.yml``
    or ``*.yaml`` file. The search gives up when it reaches ``datasets`` or
    ``.`` (neither of which is itself inspected).

    Args:
        changed_files: Iterable of file paths, resolved relative to the
            current working directory.

    Returns:
        A sorted list of unique directory paths (as strings).
    """
    stop_at = (Path("datasets"), Path("."))
    patterns = ("*.yml", "*.yaml")
    matches = set()

    for changed in changed_files:
        candidate = Path(changed).parent

        while candidate not in stop_at:
            has_yaml = any(
                True for pattern in patterns for _ in candidate.glob(pattern)
            )
            if has_yaml:
                matches.add(str(candidate))
                break
            candidate = candidate.parent

    return sorted(matches)
68+
69+
70+
def find_all_yaml_files(directories):
    """Collect the YAML files located directly inside the given directories.

    Only ``*.yml`` and ``*.yaml`` entries at the top level of each directory
    are considered; subdirectories are not searched. Paths that are not
    existing directories are skipped.

    Args:
        directories: Iterable of directory paths.

    Returns:
        A list of file paths (as strings), sorted by path.
    """
    collected = []

    for entry in directories:
        folder = Path(entry)
        if not folder.is_dir():
            continue
        for pattern in ("*.yml", "*.yaml"):
            collected.extend(folder.glob(pattern))

    collected.sort()
    return [str(path) for path in collected]
82+
83+
84+
def _print_changed_datasets(base_sha, head_sha, output):
    """Print the YAML files or directories affected between two commits.

    Results go to stdout, one per line; all progress and diagnostic messages
    go to stderr so that stdout stays machine-readable.
    """
    changed_files = find_changed_files(base_sha, head_sha)
    if not changed_files:
        print("No dataset files changed", file=sys.stderr)
        return

    print(f"Changed files: {len(changed_files)}", file=sys.stderr)
    for f in changed_files:
        print(f" {f}", file=sys.stderr)

    yaml_dirs = find_yaml_files_in_directories(changed_files)
    results = find_all_yaml_files(yaml_dirs) if output == 'files' else yaml_dirs
    for item in results:
        print(item)


def main():
    """Command-line entry point.

    Exactly one of ``--base-sha``, ``--compare-branch`` or ``--directory``
    must be given. Matching YAML files (``--output files``) or the
    directories containing them (``--output directories``, the default) are
    printed to stdout, one per line. Diagnostics are written to stderr.

    Exits with status 1 if the given directory does not exist, if the commit
    SHAs for a branch comparison cannot be determined, or on an unexpected
    error.
    """
    parser = argparse.ArgumentParser(
        description="Find changed dataset YAML files for replay",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Find changes between two commits
python find_changed_datasets.py --base-sha abc123 --head-sha def456

# Find changes in current branch vs main
python find_changed_datasets.py --compare-branch main

# List all YAML files in a specific directory
python find_changed_datasets.py --directory datasets/attack_techniques/T1003.003

Output formats:
--output directories : Print directories containing YAML files (default)
--output files : Print individual YAML file paths
"""
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--base-sha',
        help='Base commit SHA to compare from'
    )
    group.add_argument(
        '--compare-branch',
        help='Compare current HEAD against this branch (e.g., main, origin/main)'
    )
    group.add_argument(
        '--directory',
        help='Specific directory to find YAML files in'
    )

    parser.add_argument(
        '--head-sha',
        help='Head commit SHA to compare to (defaults to HEAD if using --base-sha)'
    )
    parser.add_argument(
        '--output',
        choices=['directories', 'files'],
        default='directories',
        help='Output format: directories or individual files'
    )

    args = parser.parse_args()

    try:
        if args.directory:
            # Direct directory mode
            if not Path(args.directory).exists():
                print(
                    f"Error: Directory {args.directory} does not exist",
                    file=sys.stderr,
                )
                sys.exit(1)

            yaml_files = find_all_yaml_files([args.directory])
            if args.output == 'files':
                for f in yaml_files:
                    print(f)
            elif yaml_files:
                print(args.directory)

        elif args.compare_branch:
            # Compare against a branch
            head_sha = run_git_command("git rev-parse HEAD")
            base_sha = run_git_command(f"git merge-base HEAD {args.compare_branch}")

            if not head_sha or not base_sha:
                print("Error: Could not determine commit SHAs", file=sys.stderr)
                sys.exit(1)

            _print_changed_datasets(base_sha, head_sha, args.output)

        else:
            # Base/head SHA mode
            head_sha = args.head_sha or run_git_command("git rev-parse HEAD")
            _print_changed_datasets(args.base_sha, head_sha, args.output)

    except Exception as e:
        # Top-level boundary: report and signal failure. sys.exit() above
        # raises SystemExit, which is not an Exception and passes through.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)