
Commit 9268150

Merge pull request #1043 from splunk/larger_disk_runner

Larger disk runner

2 parents 3b92935 + 01f33cd

File tree: 2 files changed, +139 -137 lines


.github/workflows/mirror_data_archive.yml

34 additions & 33 deletions. Most of the churn in this hunk is an indentation-only re-flow of otherwise identical lines (rendered here as context); the substantive change is the runner selection:

@@ -2,44 +2,45 @@ name: mirror-archive-on-merge-to-default-branch
 
 on:
   push:
     branches:
       - master
 
 jobs:
   mirror-archive:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: attack-data-runners
     env:
       BUCKET: attack-range-attack-data
       ATTACK_DATA_ARCHIVE_FILE: attack_data.tar.zstd
     steps:
       - name: Checkout Repo
         uses: actions/checkout@v4
         # We must EXPLICITLY specificy lfs: true. It defaults to false
         with:
           lfs: true
 
       - name: Setup AWS CLI and Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
           aws-access-key-id: ${{ secrets.ACCESS_KEY}}
           aws-secret-access-key: ${{ secrets.SECRET_ACCESS_KEY }}
           aws-region: us-west-2
 
       - name: Create archive of ONLY the datasets folder
         run: |
           # The structure of the tar + zstd archive should mirror that of checking out the repo directly
           mkdir attack_data
           mv datasets/ attack_data/.
 
           #Build some metadata about the archive for documentation purposes
           git rev-parse HEAD > attack_data/git_hash.txt
           date -u > attack_data/cache_build_date.txt
 
           # Compress with number of threads equal to number of CPU cores.
           # Compression level 10 is a great compromise of speed and file size.
           # File size reductions are diminishing returns after this - determined experimentally.
           tar -c attack_data | zstd --compress -T0 -10 -o $ATTACK_DATA_ARCHIVE_FILE
 
       - name: Upload Attack data archive file to S3 Bucket
         run: |
           aws s3 cp $ATTACK_DATA_ARCHIVE_FILE s3://$BUCKET/
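
The substantive change here moves the job off the GitHub-hosted ubuntu-latest runner and onto the attack-data-runners runner group, which, per the PR title, is presumably provisioned with larger disks for the LFS-heavy checkout and archive build. As a hypothetical consumer-side sketch (not part of this commit: the job name and step name are invented; the bucket, archive file name, and runner group come from the workflow above), a job could target the same group and restore the archive like this:

jobs:
  restore-attack-data:               # invented job name, for illustration only
    runs-on:
      group: attack-data-runners     # same runner group this PR introduces
    steps:
      # Assumes AWS credentials are already configured,
      # e.g. via aws-actions/configure-aws-credentials@v4 as in the upload job
      - name: Download and unpack the mirrored archive
        run: |
          # Bucket and file name match the BUCKET / ATTACK_DATA_ARCHIVE_FILE env vars above
          aws s3 cp s3://attack-range-attack-data/attack_data.tar.zstd .
          # Decompress to stdout and untar; yields attack_data/datasets/... just as
          # checking out the repo directly would, plus the git_hash.txt and
          # cache_build_date.txt metadata files the workflow writes
          zstd -d -c attack_data.tar.zstd | tar -x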

.github/workflows/replay-datasets.yml

105 additions & 104 deletions. Again, most of the churn is an indentation-only re-flow (rendered here as context); the substantive changes are the branches shorthand and the runner selection:

@@ -2,7 +2,7 @@ name: Replay Changed Datasets to Splunk
 on:
   push:
-    branches: [ master ]
+    branches: [master]
     paths:
       - 'datasets/**'
   workflow_dispatch:
@@ -14,115 +14,116 @@ on:
 
 jobs:
   replay-datasets:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: attack-data-runners
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
           fetch-depth: 0 # Fetch full history for file change detection
 
       - name: Pull Git LFS files
         run: git lfs pull
 
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.9'
 
       - name: Install dependencies
         run: |
           cd bin
           pip install -r requirements.txt
 
       - name: Find changed YAML files
         id: changed-files
         if: github.event_name != 'workflow_dispatch' || github.event.inputs.dataset_path == ''
         run: |
           # Get list of changed YAML files in datasets directory
           if [ "${{ github.event_name }}" = "pull_request" ]; then
             # For PR, compare against base branch
             BASE_SHA="${{ github.event.pull_request.base.sha }}"
             HEAD_SHA="${{ github.event.pull_request.head.sha }}"
             echo "Comparing PR: $BASE_SHA...$HEAD_SHA"
             YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "")
           else
             # For push, compare against previous commit
             BASE_SHA="${{ github.event.before }}"
             HEAD_SHA="${{ github.sha }}"
             echo "Comparing push: $BASE_SHA...$HEAD_SHA"
             YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "")
           fi
 
           if [ -z "$YAML_FILES" ]; then
             echo "No YAML dataset files changed"
             echo "yaml_files=" >> $GITHUB_OUTPUT
           else
             echo "Changed YAML files:"
             echo "$YAML_FILES"
             # Convert newlines to spaces for easier handling
             YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ')
             echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT
           fi
 
       - name: Set manual dataset path
         id: manual-path
         if: github.event_name == 'workflow_dispatch' && github.event.inputs.dataset_path != ''
         run: |
           # For manual dispatch, find YAML files in the specified path
           if [ -f "${{ github.event.inputs.dataset_path }}" ]; then
             # Single file provided
             echo "yaml_files=${{ github.event.inputs.dataset_path }}" >> $GITHUB_OUTPUT
           else
             # Directory provided - find YAML files
             YAML_FILES=$(python bin/find_changed_datasets.py --directory "${{ github.event.inputs.dataset_path }}" --output files 2>/dev/null || echo "")
             if [ -n "$YAML_FILES" ]; then
               YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ')
               echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT
             else
               echo "yaml_files=" >> $GITHUB_OUTPUT
             fi
           fi
 
       - name: Replay datasets to Splunk
         if: steps.changed-files.outputs.yaml_files != '' || steps.manual-path.outputs.yaml_files != ''
         env:
           SPLUNK_HOST: ${{ secrets.SPLUNK_HOST }}
           SPLUNK_HEC_TOKEN: ${{ secrets.SPLUNK_HEC_TOKEN }}
         run: |
           # Get the YAML files to process
           if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
             YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}"
           else
             YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}"
           fi
 
           if [ -z "$YAML_FILES" ]; then
             echo "No YAML files to process"
             exit 0
           fi
 
           echo "Processing YAML files: $YAML_FILES"
 
           # Run replay script with all YAML files
           # The replay script now reads all metadata from the YAML files themselves
           python bin/replay.py $YAML_FILES || echo "Failed to replay some datasets"
 
       - name: Summary
         if: always()
         run: |
           if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
             YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}"
             if [ -n "$YAML_FILES" ]; then
               echo "Manual replay completed for YAML files: $YAML_FILES"
             else
               echo "No YAML files found in specified path: ${{ github.event.inputs.dataset_path }}"
             fi
           else
             YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}"
             if [ -n "$YAML_FILES" ]; then
               echo "Automated replay completed for changed YAML files: $YAML_FILES"
             else
               echo "No YAML dataset changes detected, no replay needed"
             fi
           fi
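
This workflow's steps coordinate through step outputs: the detection steps append yaml_files=... to $GITHUB_OUTPUT under their respective ids (changed-files, manual-path), and downstream steps gate on and read those values via the steps context. A minimal standalone sketch of that wiring (the step id and yaml_files output name mirror the workflow above; the job name and the file path in the echo are invented for illustration):

jobs:
  demo:                                # invented job name
    runs-on: ubuntu-latest
    steps:
      - name: Detect files
        id: changed-files              # id that later steps reference
        run: |
          # Appending key=value to $GITHUB_OUTPUT publishes it as a step output
          echo "yaml_files=datasets/example/dataset.yml" >> $GITHUB_OUTPUT

      - name: Use the detected files
        # Gate on the output, exactly as the replay step above does
        if: steps.changed-files.outputs.yaml_files != ''
        run: echo "Would replay: ${{ steps.changed-files.outputs.yaml_files }}"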
