From 28765e36b4b1be4464f152f2610caab883b56f68 Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 10:56:30 -0800 Subject: [PATCH 1/8] Refactor sync scripts into modular architecture - Extract monolithic sync_detection_rules.py into shared library modules - Create separate sync_test_rules.py and sync_shared_samples.py scripts - Move helper scripts to .github/scripts/ - Add PR commenting for exclusions (membership, bulk, link_analysis) - Add do-not-merge label check to skip PRs from syncing - Update rule-validate.yml to use local scripts - Rename update-test-rules.yml to sync-test-rules.yml - Add new sync-shared-samples.yml workflow New directory structure: .github/scripts/ lib/ - Shared library modules sync_test_rules.py sync_shared_samples.py mql_format.py check_invisible_chars.py generate_rule_ids/ Co-Authored-By: Claude Opus 4.5 --- .../scripts}/check_invisible_chars.py | 0 .../scripts/generate_rule_ids}/main.py | 0 .../generate_rule_ids}/requirements.txt | 0 .github/scripts/lib/__init__.py | 111 ++ .github/scripts/lib/constants.py | 33 + .github/scripts/lib/file_utils.py | 99 ++ .github/scripts/lib/github_client.py | 43 + .github/scripts/lib/labels.py | 84 ++ .github/scripts/lib/membership.py | 129 ++ .github/scripts/lib/pr_comments.py | 147 +++ .github/scripts/lib/uuid_utils.py | 24 + .github/scripts/lib/yaml_utils.py | 210 ++++ {scripts => .github/scripts}/mql_format.py | 482 ++++---- .github/scripts/sync_shared_samples.py | 471 +++++++ .github/scripts/sync_test_rules.py | 400 ++++++ .github/workflows/rule-validate.yml | 18 +- .github/workflows/sync-shared-samples.yml | 152 +++ ...ate-test-rules.yml => sync-test-rules.yml} | 78 +- scripts/sync_detection_rules.py | 1080 ----------------- 19 files changed, 2175 insertions(+), 1386 deletions(-) rename {scripts => .github/scripts}/check_invisible_chars.py (100%) rename {scripts/generate-rule-ids => .github/scripts/generate_rule_ids}/main.py (100%) rename {scripts/generate-rule-ids => .github/scripts/generate_rule_ids}/requirements.txt (100%) create mode 100644 .github/scripts/lib/__init__.py create mode 100644 .github/scripts/lib/constants.py create mode 100644 .github/scripts/lib/file_utils.py create mode 100644 .github/scripts/lib/github_client.py create mode 100644 .github/scripts/lib/labels.py create mode 100644 .github/scripts/lib/membership.py create mode 100644 .github/scripts/lib/pr_comments.py create mode 100644 .github/scripts/lib/uuid_utils.py create mode 100644 .github/scripts/lib/yaml_utils.py rename {scripts => .github/scripts}/mql_format.py (96%) mode change 100755 => 100644 create mode 100644 .github/scripts/sync_shared_samples.py create mode 100644 .github/scripts/sync_test_rules.py create mode 100644 .github/workflows/sync-shared-samples.yml rename .github/workflows/{update-test-rules.yml => sync-test-rules.yml} (77%) delete mode 100644 scripts/sync_detection_rules.py diff --git a/scripts/check_invisible_chars.py b/.github/scripts/check_invisible_chars.py similarity index 100% rename from scripts/check_invisible_chars.py rename to .github/scripts/check_invisible_chars.py diff --git a/scripts/generate-rule-ids/main.py b/.github/scripts/generate_rule_ids/main.py similarity index 100% rename from scripts/generate-rule-ids/main.py rename to .github/scripts/generate_rule_ids/main.py diff --git a/scripts/generate-rule-ids/requirements.txt b/.github/scripts/generate_rule_ids/requirements.txt similarity index 100% rename from scripts/generate-rule-ids/requirements.txt rename to 
.github/scripts/generate_rule_ids/requirements.txt diff --git a/.github/scripts/lib/__init__.py b/.github/scripts/lib/__init__.py new file mode 100644 index 00000000000..ce763a8f359 --- /dev/null +++ b/.github/scripts/lib/__init__.py @@ -0,0 +1,111 @@ +""" +Shared library for sync scripts. +""" +from .constants import ( + IN_TEST_RULES_LABEL, + AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, + MANUAL_EXCLUSION_LABEL, + BULK_PR_LABEL, + LINK_ANALYSIS_EXCLUSION_LABEL, + HUNTING_REQUIRED_LABEL, + DO_NOT_MERGE_LABEL, + SKIP_TEXTS, + DEFAULT_ORG_NAME, + DEFAULT_COMMENT_TRIGGER, + DEFAULT_MAX_RULES_PER_PR, + DEFAULT_DELETE_RULES_DELAY_DAYS, + DEFAULT_REQUIRED_CHECK_NAME, + DEFAULT_REQUIRED_CHECK_CONCLUSION, + DEFAULT_AUTHOR_TAG_PREFIX, + DEFAULT_RULE_STATUS_PREFIX, + DEFAULT_OPEN_PR_TAG, +) + +from .github_client import create_github_session + +from .labels import has_label, apply_label, remove_label + +from .membership import ( + is_user_in_org, + has_trigger_comment, + has_required_action_completed, +) + +from .yaml_utils import ( + check_skip_texts, + add_id_to_yaml, + extract_rule_name, + prepend_pr_details, + rename_rules, + add_block, +) + +from .uuid_utils import generate_deterministic_uuid + +from .file_utils import ( + get_file_contents, + save_file, + pr_has_synced_files, + clean_output_folder, + count_yaml_rules_in_pr, +) + +from .pr_comments import ( + add_pr_comment, + has_existing_comment, + generate_exclusion_comment, + post_exclusion_comment_if_needed, + COMMENT_MARKER, +) + +__all__ = [ + # Constants + 'IN_TEST_RULES_LABEL', + 'AUTHOR_MEMBERSHIP_EXCLUSION_LABEL', + 'MANUAL_EXCLUSION_LABEL', + 'BULK_PR_LABEL', + 'LINK_ANALYSIS_EXCLUSION_LABEL', + 'HUNTING_REQUIRED_LABEL', + 'DO_NOT_MERGE_LABEL', + 'SKIP_TEXTS', + 'DEFAULT_ORG_NAME', + 'DEFAULT_COMMENT_TRIGGER', + 'DEFAULT_MAX_RULES_PER_PR', + 'DEFAULT_DELETE_RULES_DELAY_DAYS', + 'DEFAULT_REQUIRED_CHECK_NAME', + 'DEFAULT_REQUIRED_CHECK_CONCLUSION', + 'DEFAULT_AUTHOR_TAG_PREFIX', + 'DEFAULT_RULE_STATUS_PREFIX', + 'DEFAULT_OPEN_PR_TAG', + # GitHub client + 'create_github_session', + # Labels + 'has_label', + 'apply_label', + 'remove_label', + # Membership + 'is_user_in_org', + 'has_trigger_comment', + 'has_required_action_completed', + # YAML utils + 'check_skip_texts', + 'add_id_to_yaml', + 'extract_rule_name', + 'prepend_pr_details', + 'rename_rules', + 'add_block', + # UUID utils + 'generate_deterministic_uuid', + # File utils + 'get_file_contents', + 'save_file', + 'pr_has_synced_files', + 'clean_output_folder', + 'count_yaml_rules_in_pr', + # PR comments + 'add_pr_comment', + 'has_existing_comment', + 'generate_exclusion_comment', + 'post_exclusion_comment_if_needed', + 'COMMENT_MARKER', +] diff --git a/.github/scripts/lib/constants.py b/.github/scripts/lib/constants.py new file mode 100644 index 00000000000..1bcb7d2fec4 --- /dev/null +++ b/.github/scripts/lib/constants.py @@ -0,0 +1,33 @@ +""" +Constants and configuration values for sync scripts. 
+""" + +# Label names +IN_TEST_RULES_LABEL = 'in-test-rules' +AUTHOR_MEMBERSHIP_EXCLUSION_LABEL = 'test-rules:excluded:author_membership' +MANUAL_EXCLUSION_LABEL = 'test-rules:excluded:manual' +BULK_PR_LABEL = 'test-rules:excluded:bulk_rules' +LINK_ANALYSIS_EXCLUSION_LABEL = 'test-rules:excluded:link_analysis' +HUNTING_REQUIRED_LABEL = 'hunting-required' +DO_NOT_MERGE_LABEL = 'do-not-merge' + +# Skip texts configuration: {text: [labels_to_apply]} +# Files containing these texts will be skipped from syncing +SKIP_TEXTS = { + 'ml.link_analysis': [HUNTING_REQUIRED_LABEL, LINK_ANALYSIS_EXCLUSION_LABEL] +} + +# Default configuration values +DEFAULT_ORG_NAME = 'sublime-security' +DEFAULT_COMMENT_TRIGGER = '/update-test-rules' +DEFAULT_MAX_RULES_PER_PR = 10 +DEFAULT_DELETE_RULES_DELAY_DAYS = 3 + +# Required check configuration +DEFAULT_REQUIRED_CHECK_NAME = 'Rule Tests and ID Updated' +DEFAULT_REQUIRED_CHECK_CONCLUSION = 'success' + +# Tag configuration +DEFAULT_AUTHOR_TAG_PREFIX = 'pr_author_' +DEFAULT_RULE_STATUS_PREFIX = 'rule_status_' +DEFAULT_OPEN_PR_TAG = 'created_from_open_prs' diff --git a/.github/scripts/lib/file_utils.py b/.github/scripts/lib/file_utils.py new file mode 100644 index 00000000000..c68e790bdfa --- /dev/null +++ b/.github/scripts/lib/file_utils.py @@ -0,0 +1,99 @@ +""" +File operations and GitHub file content utilities. +""" +import base64 +import os + + +def get_file_contents(session, repo_owner, repo_name, file_path, ref): + """ + Get file contents from GitHub at a specific commit. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + file_path (str): Path to the file in the repository + ref (str): Git ref (branch, tag, or commit SHA) to fetch from + + Returns: + str: Decoded file content + """ + # Construct the contents API URL with the specific ref + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}' + params = {'ref': ref} + + response = session.get(url, params=params) + response.raise_for_status() + content = response.json()['content'] + return base64.b64decode(content).decode('utf-8') + + +def save_file(output_folder, path, content): + """ + Save content to a file in the output folder. + + Args: + output_folder (str): Base output folder path + path (str): Filename or path to save + content (str): Content to write + """ + file_path = os.path.join(output_folder, os.path.basename(path)) + with open(file_path, 'w') as file: + file.write(content) + + +def pr_has_synced_files(output_folder, pr_number): + """ + Check if a PR has any synced files in the output folder. + + Args: + output_folder (str): Base output folder path + pr_number (int): Pull request number + + Returns: + bool: True if files exist for this PR, False otherwise + """ + if not os.path.exists(output_folder): + return False + prefix = f"{pr_number}_" + for filename in os.listdir(output_folder): + if filename.startswith(prefix) and filename.endswith('.yml'): + return True + return False + + +def clean_output_folder(output_folder, valid_files): + """ + Remove files from output folder that are not in the valid_files set. 
+ + Args: + output_folder (str): Base output folder path + valid_files (set): Set of filenames to keep + """ + if not os.path.exists(output_folder): + return + for filename in os.listdir(output_folder): + file_path = os.path.join(output_folder, filename) + if filename not in valid_files: + print(f"Removing file: {filename}") + os.remove(file_path) + + +def count_yaml_rules_in_pr(files): + """ + Count the number of YAML rule files in the PR. + + Args: + files (list): List of file objects from GitHub API + + Returns: + int: Number of YAML files in detection-rules directory + """ + yaml_count = 0 + for file in files: + if (file['status'] in ['added', 'modified', 'changed'] and + file['filename'].startswith('detection-rules/') and + file['filename'].endswith('.yml')): + yaml_count += 1 + return yaml_count diff --git a/.github/scripts/lib/github_client.py b/.github/scripts/lib/github_client.py new file mode 100644 index 00000000000..f38e80f6d02 --- /dev/null +++ b/.github/scripts/lib/github_client.py @@ -0,0 +1,43 @@ +""" +GitHub API session setup with retry logic. +""" +import os + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + + +def create_github_session(token=None): + """ + Create a requests session configured for GitHub API with retry logic. + + Args: + token (str, optional): GitHub token. If not provided, uses GITHUB_TOKEN env var. + + Returns: + requests.Session: Configured session with retry strategy and auth headers. + """ + if token is None: + token = os.getenv('GITHUB_TOKEN') + + # Configure retry strategy + retry_strategy = Retry( + total=3, # Maximum number of retries + backoff_factor=2, # Exponential backoff factor (wait 2^retry seconds) + status_forcelist=[429, 500, 502, 503, 504], # HTTP status codes to retry on + allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"] + ) + + adapter = HTTPAdapter(max_retries=retry_strategy) + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + + headers = { + 'Authorization': f'token {token}', + 'Accept': 'application/vnd.github.v3+json' + } + session.headers.update(headers) + + return session diff --git a/.github/scripts/lib/labels.py b/.github/scripts/lib/labels.py new file mode 100644 index 00000000000..1b64b436cdf --- /dev/null +++ b/.github/scripts/lib/labels.py @@ -0,0 +1,84 @@ +""" +GitHub label management functions. +""" +import sys + + +def has_label(session, repo_owner, repo_name, pr_number, label_name): + """ + Check if a PR has a specific label. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + label_name (str): Label name to check for + + Returns: + bool: True if PR has the label, False otherwise + """ + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/labels' + response = session.get(url) + response.raise_for_status() + labels = response.json() + + return any(label['name'] == label_name for label in labels) + + +def apply_label(session, repo_owner, repo_name, pr_number, label_name): + """ + Apply a label to a PR. 
+ + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + label_name (str): Label name to apply + + Returns: + bool: True if label was applied successfully, False otherwise + """ + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/labels' + payload = {'labels': [label_name]} + + try: + response = session.post(url, json=payload) + response.raise_for_status() + print(f"\tApplied label '{label_name}' to PR #{pr_number}") + return True + except Exception as e: + print(f"\tFailed to apply label '{label_name}' to PR #{pr_number}: {e}") + print("Failed to get valid response after retries. Exiting script.") + sys.exit(1) + + +def remove_label(session, repo_owner, repo_name, pr_number, label_name): + """ + Remove a label from a PR. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + label_name (str): Label name to remove + + Returns: + bool: True if label was removed successfully, False otherwise + """ + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/labels/{label_name}' + + try: + response = session.delete(url) + if response.status_code == 404: + print(f"\tLabel '{label_name}' not found on PR #{pr_number}") + return True # Consider it successful if the label wasn't there + response.raise_for_status() + print(f"\tRemoved label '{label_name}' from PR #{pr_number}") + return True + except Exception as e: + print(f"\tFailed to remove label '{label_name}' from PR #{pr_number}: {e}") + print("Failed to get valid response after retries. Exiting script.") + sys.exit(1) diff --git a/.github/scripts/lib/membership.py b/.github/scripts/lib/membership.py new file mode 100644 index 00000000000..2c4b6e78a65 --- /dev/null +++ b/.github/scripts/lib/membership.py @@ -0,0 +1,129 @@ +""" +GitHub organization membership and PR checks. +""" +import sys + + +def is_user_in_org(session, username, org_name): + """ + Check if a user is a member of a specific organization. + + Args: + session: GitHub API session + username (str): GitHub username + org_name (str): Organization name + + Returns: + bool: True if user is a member, False otherwise + """ + url = f'https://api.github.com/orgs/{org_name}/members/{username}' + try: + response = session.get(url) + # 404 is expected when user is not in org, so handle it separately + if response.status_code == 404: + return False + response.raise_for_status() + return response.status_code == 204 + except Exception as e: + print(f"Error checking organization membership for {username} in {org_name}: {e}") + print("Failed to get valid response after retries. Exiting script.") + sys.exit(1) + + +def has_trigger_comment(session, repo_owner, repo_name, pr_number, org_name, trigger_comment): + """ + Check if a PR has a comment with the trigger text from a member of the specified org. 
+ + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + org_name (str): Organization name to filter commenters + trigger_comment (str): Comment text to look for + + Returns: + bool: True if a matching comment is found, False otherwise + """ + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' + response = session.get(url) + response.raise_for_status() + comments = response.json() + + for comment in comments: + # Check if comment contains the trigger and author is in the organization + if trigger_comment in comment['body']: + print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment found") + if is_user_in_org(session, comment['user']['login'], org_name): + print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment from {comment['user']['login']} is a {org_name} member") + return True + print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment from {comment['user']['login']} is NOT a {org_name} member") + + print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment NOT found") + + return False + + +def has_required_action_completed(session, repo_owner, repo_name, pr_sha, action_name, required_status): + """ + Check if a required GitHub Actions workflow has completed with the expected status for a PR. + Uses the GitHub Checks API to poll for check results. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_sha (str): SHA of the PR head commit + action_name (str): Name of the action/check to look for + required_status (str): Required status (success, failure, etc.) + + Returns: + bool: True if the action has completed with the required status, False otherwise + """ + # Use the GitHub Checks API to get all check runs for this commit + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/commits/{pr_sha}/check-runs' + custom_headers = {'Accept': 'application/vnd.github.v3+json'} + + # Temporarily update session headers for this request + original_accept = session.headers.get('Accept') + session.headers.update(custom_headers) + + try: + response = session.get(url) + response.raise_for_status() + except Exception as e: + print(f"\tError checking action status: {e}") + print("Failed to get valid response after retries. 
Exiting script.") + sys.exit(1) + finally: + # Restore original Accept header + session.headers['Accept'] = original_accept + + check_runs = response.json() + + if 'check_runs' not in check_runs or len(check_runs['check_runs']) == 0: + print(f"\tNo check runs found for commit {pr_sha}") + return False + + # Look for the specific action by name + for check in check_runs['check_runs']: + check_name = check['name'] + check_conclusion = check['conclusion'] + check_status = check['status'] + + if action_name.lower() in check_name.lower(): + + # Check if the action is complete + if check_status != 'completed': + print(f"\tCheck '{check_name}' is still in progress (status: {check_status})") + return False + + # Check if the action has the required conclusion + if check_conclusion == required_status: + return True + else: + print(f"\tCheck '{check_name}' has conclusion '{check_conclusion}', expected '{required_status}'") + return False + + print(f"\tNo check matching '{action_name}' found") + return False diff --git a/.github/scripts/lib/pr_comments.py b/.github/scripts/lib/pr_comments.py new file mode 100644 index 00000000000..444bb12d0d7 --- /dev/null +++ b/.github/scripts/lib/pr_comments.py @@ -0,0 +1,147 @@ +""" +PR comment management functions. +""" +from .constants import ( + AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, + BULK_PR_LABEL, + LINK_ANALYSIS_EXCLUSION_LABEL, + DEFAULT_COMMENT_TRIGGER, +) + + +# Marker to identify bot comments for deduplication +COMMENT_MARKER = '' + + +def has_existing_comment(session, repo_owner, repo_name, pr_number, marker_text): + """ + Check if a PR already has a comment with the specified marker. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + marker_text (str): Text marker to search for + + Returns: + bool: True if comment with marker exists, False otherwise + """ + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' + response = session.get(url) + response.raise_for_status() + comments = response.json() + + for comment in comments: + if marker_text in comment.get('body', ''): + return True + + return False + + +def add_pr_comment(session, repo_owner, repo_name, pr_number, body): + """ + Add a comment to a PR. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + body (str): Comment body text + + Returns: + bool: True if comment was added successfully, False otherwise + """ + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' + payload = {'body': body} + + try: + response = session.post(url, json=payload) + response.raise_for_status() + print(f"\tAdded comment to PR #{pr_number}") + return True + except Exception as e: + print(f"\tFailed to add comment to PR #{pr_number}: {e}") + return False + + +def generate_exclusion_comment(exclusion_type, org_name=None, max_rules=None, rule_count=None, comment_trigger=None): + """ + Generate a user-friendly comment explaining why a PR was excluded from syncing. 
+ + Args: + exclusion_type (str): Type of exclusion (author_membership, bulk_rules, link_analysis) + org_name (str, optional): Organization name for membership exclusions + max_rules (int, optional): Max rules limit for bulk exclusions + rule_count (int, optional): Actual rule count for bulk exclusions + comment_trigger (str, optional): Comment trigger text + + Returns: + str: Formatted comment body with marker + """ + if comment_trigger is None: + comment_trigger = DEFAULT_COMMENT_TRIGGER + + if exclusion_type == AUTHOR_MEMBERSHIP_EXCLUSION_LABEL: + body = f"""{COMMENT_MARKER} +### Test Rules Sync - Action Required + +This PR was not automatically synced to test-rules because the author is not a member of the `{org_name}` organization. + +**To enable syncing**, an organization member can comment `{comment_trigger}` on this PR. + +Once triggered, the rules will be synced on the next scheduled run (every 10 minutes). +""" + elif exclusion_type == BULK_PR_LABEL: + body = f"""{COMMENT_MARKER} +### Test Rules Sync - Excluded + +This PR contains **{rule_count} rules**, which exceeds the maximum of **{max_rules} rules** allowed per PR for automatic syncing. + +This limit helps ensure the test-rules environment remains manageable. If you need to test these rules, consider: +- Splitting the PR into smaller PRs with fewer rules +- Contacting the team to request a manual sync +""" + elif exclusion_type == LINK_ANALYSIS_EXCLUSION_LABEL: + body = f"""{COMMENT_MARKER} +### Test Rules Sync - Excluded + +This PR contains rules that use `ml.link_analysis`, which is not supported in the test-rules environment. + +The `hunting-required` label has been applied. These rules will need to be tested through alternative methods. +""" + else: + body = f"""{COMMENT_MARKER} +### Test Rules Sync - Excluded + +This PR has been excluded from automatic syncing. Please check the applied labels for more details. +""" + + return body + + +def post_exclusion_comment_if_needed(session, repo_owner, repo_name, pr_number, exclusion_type, **kwargs): + """ + Post an exclusion comment to a PR if one doesn't already exist. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + exclusion_type (str): Type of exclusion + **kwargs: Additional arguments passed to generate_exclusion_comment + + Returns: + bool: True if comment was added or already exists, False on error + """ + # Check if we've already commented + marker = f"{COMMENT_MARKER}\n### Test Rules Sync" + if has_existing_comment(session, repo_owner, repo_name, pr_number, COMMENT_MARKER): + print(f"\tPR #{pr_number} already has an exclusion comment, skipping") + return True + + # Generate and post the comment + body = generate_exclusion_comment(exclusion_type, **kwargs) + return add_pr_comment(session, repo_owner, repo_name, pr_number, body) diff --git a/.github/scripts/lib/uuid_utils.py b/.github/scripts/lib/uuid_utils.py new file mode 100644 index 00000000000..69534b3524a --- /dev/null +++ b/.github/scripts/lib/uuid_utils.py @@ -0,0 +1,24 @@ +""" +UUID generation utilities. +""" +import uuid + + +def generate_deterministic_uuid(seed_string): + """ + Generate a deterministic UUID based on a seed string. + This ensures the same input will always produce the same UUID. 
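Example (illustrative only; the filename seed below is made up, but uuid5 with a fixed namespace is deterministic, so re-running the sync regenerates the same rule ID for an unchanged filename):

    >>> import uuid
    >>> ns = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')  # DNS namespace, as used in the code below
    >>> uuid.uuid5(ns, '123_my_rule.yml') == uuid.uuid5(ns, '123_my_rule.yml')
    True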
+ + Args: + seed_string (str): A string to use as a seed for UUID generation + + Returns: + str: A UUID string in the format of XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX + """ + # Create a namespace UUID (using the DNS namespace as a standard practice) + namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') + + # Create a UUID using the namespace and the seed string + deterministic_uuid = uuid.uuid5(namespace, seed_string) + + return str(deterministic_uuid) diff --git a/.github/scripts/lib/yaml_utils.py b/.github/scripts/lib/yaml_utils.py new file mode 100644 index 00000000000..4a3a474a89f --- /dev/null +++ b/.github/scripts/lib/yaml_utils.py @@ -0,0 +1,210 @@ +""" +YAML manipulation utilities for rule files. +""" +import re + +from .uuid_utils import generate_deterministic_uuid + + +def check_skip_texts(content, skip_texts): + """ + Check if file content contains any of the configured skip texts (case-insensitive). + + Args: + content (str): File content + skip_texts (dict): Dictionary of {text: [labels]} to check + + Returns: + tuple: (matched_texts, all_labels) where matched_texts is a list of + matching texts and all_labels is a set of all labels to apply + """ + matched_texts = [] + all_labels = set() + + for text, labels in skip_texts.items(): + if text.lower() in content.lower(): + matched_texts.append(text) + all_labels.update(labels) + + return matched_texts, all_labels + + +def add_id_to_yaml(content, filename): + """ + Adds or replaces an ID field in the YAML content. + Extracts the original ID if present. + + Args: + content (str): The YAML content + filename (str): The filename to use as seed for UUID generation + + Returns: + tuple: (modified_content, original_id) - The modified YAML content with the UUID added/replaced + and the original ID if found, otherwise None + """ + # Use the filename directly as the seed for UUID generation + # Generate a deterministic UUID based on the seed + new_uuid = generate_deterministic_uuid(filename) + original_id = None + + # Check if 'id:' already exists in the content + if 'id:' in content: + # Extract the original ID + pattern = r'^\s*id:\s*([^\n]*)' + match = re.search(pattern, content, flags=re.MULTILINE) + if match: + original_id = match.group(1).strip() + if original_id.startswith('"') and original_id.endswith('"'): + original_id = original_id[1:-1] # Remove surrounding quotes + elif original_id.startswith("'") and original_id.endswith("'"): + original_id = original_id[1:-1] # Remove surrounding quotes + + # Replace with the new ID + modified_content = re.sub(pattern, f'id: "{new_uuid}"', content, flags=re.MULTILINE) + return modified_content, original_id + else: + # If it doesn't exist, add it to the very end of the YAML file + # Make sure we have a clean end to the file (no trailing whitespace) + modified_content = content.rstrip() + + # Add a newline and the ID field + modified_content += f'\nid: "{new_uuid}"' + + return modified_content, original_id + + +def extract_rule_name(content): + """ + Extract the rule name from YAML content. + + Args: + content (str): YAML content + + Returns: + str: The rule name or empty string if not found + """ + current_name = "" + lines = content.split('\n') + for line in lines: + if 'name:' in line: + # replace the quotes and spaces to create a clean filename + current_name = line.replace('name: ', '').strip() + break + + return current_name + + +def prepend_pr_details(rule_name, pr): + """ + Prepend PR number to rule name. 
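Example (hypothetical rule name and PR number; the surrounding quoting is preserved, as in the implementation that follows):

    >>> prepend_pr_details('"Suspicious link in body"', {'number': 123})
    '"PR# 123 - Suspicious link in body"'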
+ + Args: + rule_name (str): Original rule name + pr (dict): PR object with 'number' key + + Returns: + str: Modified rule name with PR number prefix + """ + # maintain the original quoting around the name + pr_num = pr['number'] + if rule_name.startswith('"') and rule_name.endswith('"'): + stripped = rule_name.strip('" ') + new_name = f'"PR# {pr_num} - {stripped}"' + elif rule_name.startswith("'") and rule_name.endswith("'"): + stripped = rule_name.strip("' ") + new_name = f"'PR# {pr_num} - {stripped}'" + else: + new_name = f"PR# {pr_num} - {rule_name}" + + return new_name + + +def rename_rules(content, pr): + """ + Rename rules in content to include PR number. + + Args: + content (str): YAML content + pr (dict): PR object with 'number' key + + Returns: + str: Modified content with PR number in rule name + """ + # extract the current name + current_name = extract_rule_name(content) + # build out the new name to inject the PR number + new_name = prepend_pr_details(current_name, pr) + + content = content.replace(current_name, new_name) + return content + + +def add_block(yaml_string, block_name, value): + """ + Add a value to a YAML block (tags or references). + + Args: + yaml_string (str): The YAML content + block_name (str): Block name ('tags' or 'references') + value (str): Value to add to the block + + Returns: + str: Modified YAML content + """ + # throw an error if the block name isn't known + if block_name not in ['tags', 'references', 'tags:', 'references:']: + raise ValueError(f'Block Name: {block_name} is unsupported') + # if it doesn't have the : needed, add it. + + if not block_name.endswith(':'): + block_name = f"{block_name}:" + + if block_name in yaml_string: + # find the tags block + start_block = yaml_string.find(block_name) + + # find the end of the block by locating the next section or end of the string + end_block = start_block + + while True: + next_line_start = yaml_string.find("\n", end_block + 1) + # if there isn't a new line found, we've hit the end of the file + # or if the next line doesn't start with a space (which indicates it's still within the tag section) + if next_line_start == -1 or not yaml_string[next_line_start + 1].isspace(): + if next_line_start != -1: + end_block = next_line_start + else: + end_block = len(yaml_string) + break + end_block = next_line_start + + # get the original block + block = yaml_string[start_block:end_block].strip() + + existing_block_entries = [] + # Split the tags into a list + for line in block.splitlines(): + # within the tags_block is the tag section header, skip that one + if line.strip() == block_name: + continue + line = line.strip() + line = line.lstrip('-') + # strip leading spaces after the - too + line = line.strip() + + existing_block_entries.append(line) + # add the author tag to the existing tags array + existing_block_entries.append(f"{value}") + + new_block_string = block_name + for entry in existing_block_entries: + new_block_string += f"\n - {entry}" + # replace the old with the new + modified_yaml_string = yaml_string.replace(block, new_block_string) + else: + # just add it at the end + new_block_string = f"{block_name}\n - {value}" + # add additional tag to help filter down to the right rule id later + modified_yaml_string = yaml_string.strip() + "\n" + new_block_string + + return modified_yaml_string diff --git a/scripts/mql_format.py b/.github/scripts/mql_format.py old mode 100755 new mode 100644 similarity index 96% rename from scripts/mql_format.py rename to .github/scripts/mql_format.py index 60d3b61cc28..a046cd3c74a ---
a/scripts/mql_format.py +++ b/.github/scripts/mql_format.py @@ -1,241 +1,241 @@ -#!/usr/bin/env python3 -""" -MQL Formatter - formats MQL rules using Sublime's Format API - -Usage: - # Format files in place - ./mql_format.py detection-rules/*.yml - - # Check if files need formatting (exit 1 if changes needed) - ./mql_format.py --check detection-rules/*.yml -""" - -import sys -import re -import argparse -from pathlib import Path -from concurrent.futures import ThreadPoolExecutor, as_completed - -try: - import requests -except ImportError: - print("::error::requests package required. Install with: pip install requests") - sys.exit(1) - -try: - import yaml -except ImportError: - print("::error::PyYAML package required. Install with: pip install pyyaml") - sys.exit(1) - -API_URL = "https://play.sublime.security/v1/rules/format" -MAX_WORKERS = 100 - -# Files to exclude from formatting (e.g., special comment formatting) -EXCLUDE_FILES = { - "attachment_cve_2023_38831.yml", -} - - -def format_source(source: str) -> str: - """Format MQL source using the Sublime API.""" - resp = requests.post(API_URL, json={ - "source": source, - "max_line_width": 80, - "indent": 2, - "prefer_multi_line_root": True, - }, timeout=30) - - # Handle 500 errors gracefully - this is a known API bug with empty comment lines - if resp.status_code == 500: - error = requests.HTTPError("500 Server Error") - error.response = resp - raise error - - resp.raise_for_status() - return resp.json()["source"] - - -def extract_source(content: str) -> str | None: - """Extract source field using PyYAML.""" - try: - data = yaml.safe_load(content) - return data.get("source") if data else None - except yaml.YAMLError: - return None - - -def replace_source(content: str, new_source: str) -> str: - """Replace source block in YAML file, preserving everything else.""" - lines = content.split('\n') - result = [] - source_indent = 2 # default - - i = 0 - while i < len(lines): - line = lines[i] - - if re.match(r'^source:\s*\|', line): - result.append(line) - - # Find the indentation from the next non-empty line - for j in range(i + 1, len(lines)): - if lines[j].strip(): - source_indent = len(lines[j]) - len(lines[j].lstrip()) - break - - # Insert the new formatted source with proper indentation - indent = ' ' * source_indent - for src_line in new_source.split('\n'): - result.append(indent + src_line) - - # Skip the old source lines - i += 1 - while i < len(lines): - if not lines[i].strip(): # blank line - i += 1 - elif lines[i][0].isspace(): # indented = still in source - i += 1 - else: # non-indented = next field - break - continue - - result.append(line) - i += 1 - - return '\n'.join(result) - - -def normalize(s: str) -> str: - """Normalize source for comparison (ignore trailing whitespace).""" - return '\n'.join(line.rstrip() for line in s.strip().split('\n')) - - -def process_file(file_data: dict) -> dict: - """Process a single file - called in thread pool.""" - path = file_data["path"] - content = file_data["content"] - source = file_data["source"] - - try: - formatted_source = format_source(source) - changed = normalize(formatted_source) != normalize(source) - return { - "path": path, - "content": content, - "formatted_source": formatted_source, - "changed": changed, - "error": None, - "is_500": False - } - except requests.HTTPError as e: - # Check if this is a 500 error - # Note: Response object may be falsy even if it exists, so use hasattr + is not None - if hasattr(e, 'response') and e.response is not None and hasattr(e.response, 
'status_code') and e.response.status_code == 500: - return { - "path": path, - "error": "500 Server Error", - "is_500": True - } - return { - "path": path, - "error": str(e), - "is_500": False - } - except requests.RequestException as e: - return { - "path": path, - "error": str(e), - "is_500": False - } - - -def main(): - parser = argparse.ArgumentParser( - description="Format MQL rules using Sublime's Format API" - ) - parser.add_argument("files", nargs="+", help="YAML rule files to format") - parser.add_argument("--check", action="store_true", - help="Check if files are formatted (exit 1 if not)") - args = parser.parse_args() - - # Collect files to process - files_data = [] - for filepath in args.files: - path = Path(filepath) - if not path.exists(): - print(f"::warning file={filepath}::{filepath} does not exist, skipping") - continue - - if path.name in EXCLUDE_FILES: - print(f"[skip] {path.name} excluded", flush=True) - continue - - content = path.read_text() - source = extract_source(content) - if not source: - print(f"::warning file={filepath}::{path.name} has no source field, skipping") - continue - - files_data.append({ - "path": path, - "content": content, - "source": source - }) - - if not files_data: - print("::error::No valid files to process") - sys.exit(1) - - total = len(files_data) - print(f"Processing {total} files with {MAX_WORKERS} workers...", flush=True) - - changed_count = 0 - unchanged_count = 0 - completed = 0 - - # Process files in parallel - with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: - futures = {executor.submit(process_file, fd): fd for fd in files_data} - - for future in as_completed(futures): - result = future.result() - completed += 1 - progress = f"[{completed}/{total}]" - path = result["path"] - - if result.get("error"): - # Handle 500 errors as warnings - if result.get("is_500"): - print(f"::warning file={path}::{progress} {path.name} skipped - API returned 500 error", flush=True) - unchanged_count += 1 - continue - else: - print(f"::error file={path}::{progress} {path.name} formatting failed: {result['error']}") - sys.exit(1) - - if result["changed"]: - changed_count += 1 - if args.check: - print(f"::error file={path}::{progress} {path.name} needs formatting", flush=True) - else: - new_content = replace_source(result["content"], result["formatted_source"]) - path.write_text(new_content) - print(f"{progress} {path.name} reformatted", flush=True) - else: - unchanged_count += 1 - print(f"{progress} {path.name} unchanged", flush=True) - - print(f"\n{'─' * 50}", flush=True) - if args.check: - if changed_count > 0: - print(f"::error::{changed_count} files need formatting, {unchanged_count} files OK") - sys.exit(1) - else: - print(f"All {unchanged_count} files are properly formatted") - else: - print(f"{changed_count} files reformatted, {unchanged_count} unchanged") - - -if __name__ == "__main__": - main() +#!/usr/bin/env python3 +""" +MQL Formatter - formats MQL rules using Sublime's Format API + +Usage: + # Format files in place + ./mql_format.py detection-rules/*.yml + + # Check if files need formatting (exit 1 if changes needed) + ./mql_format.py --check detection-rules/*.yml +""" + +import sys +import re +import argparse +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor, as_completed + +try: + import requests +except ImportError: + print("::error::requests package required. Install with: pip install requests") + sys.exit(1) + +try: + import yaml +except ImportError: + print("::error::PyYAML package required. 
Install with: pip install pyyaml") + sys.exit(1) + +API_URL = "https://play.sublime.security/v1/rules/format" +MAX_WORKERS = 100 + +# Files to exclude from formatting (e.g., special comment formatting) +EXCLUDE_FILES = { + "attachment_cve_2023_38831.yml", +} + + +def format_source(source: str) -> str: + """Format MQL source using the Sublime API.""" + resp = requests.post(API_URL, json={ + "source": source, + "max_line_width": 80, + "indent": 2, + "prefer_multi_line_root": True, + }, timeout=30) + + # Handle 500 errors gracefully - this is a known API bug with empty comment lines + if resp.status_code == 500: + error = requests.HTTPError("500 Server Error") + error.response = resp + raise error + + resp.raise_for_status() + return resp.json()["source"] + + +def extract_source(content: str) -> str | None: + """Extract source field using PyYAML.""" + try: + data = yaml.safe_load(content) + return data.get("source") if data else None + except yaml.YAMLError: + return None + + +def replace_source(content: str, new_source: str) -> str: + """Replace source block in YAML file, preserving everything else.""" + lines = content.split('\n') + result = [] + source_indent = 2 # default + + i = 0 + while i < len(lines): + line = lines[i] + + if re.match(r'^source:\s*\|', line): + result.append(line) + + # Find the indentation from the next non-empty line + for j in range(i + 1, len(lines)): + if lines[j].strip(): + source_indent = len(lines[j]) - len(lines[j].lstrip()) + break + + # Insert the new formatted source with proper indentation + indent = ' ' * source_indent + for src_line in new_source.split('\n'): + result.append(indent + src_line) + + # Skip the old source lines + i += 1 + while i < len(lines): + if not lines[i].strip(): # blank line + i += 1 + elif lines[i][0].isspace(): # indented = still in source + i += 1 + else: # non-indented = next field + break + continue + + result.append(line) + i += 1 + + return '\n'.join(result) + + +def normalize(s: str) -> str: + """Normalize source for comparison (ignore trailing whitespace).""" + return '\n'.join(line.rstrip() for line in s.strip().split('\n')) + + +def process_file(file_data: dict) -> dict: + """Process a single file - called in thread pool.""" + path = file_data["path"] + content = file_data["content"] + source = file_data["source"] + + try: + formatted_source = format_source(source) + changed = normalize(formatted_source) != normalize(source) + return { + "path": path, + "content": content, + "formatted_source": formatted_source, + "changed": changed, + "error": None, + "is_500": False + } + except requests.HTTPError as e: + # Check if this is a 500 error + # Note: Response object may be falsy even if it exists, so use hasattr + is not None + if hasattr(e, 'response') and e.response is not None and hasattr(e.response, 'status_code') and e.response.status_code == 500: + return { + "path": path, + "error": "500 Server Error", + "is_500": True + } + return { + "path": path, + "error": str(e), + "is_500": False + } + except requests.RequestException as e: + return { + "path": path, + "error": str(e), + "is_500": False + } + + +def main(): + parser = argparse.ArgumentParser( + description="Format MQL rules using Sublime's Format API" + ) + parser.add_argument("files", nargs="+", help="YAML rule files to format") + parser.add_argument("--check", action="store_true", + help="Check if files are formatted (exit 1 if not)") + args = parser.parse_args() + + # Collect files to process + files_data = [] + for filepath in args.files: + path = 
Path(filepath) + if not path.exists(): + print(f"::warning file={filepath}::{filepath} does not exist, skipping") + continue + + if path.name in EXCLUDE_FILES: + print(f"[skip] {path.name} excluded", flush=True) + continue + + content = path.read_text() + source = extract_source(content) + if not source: + print(f"::warning file={filepath}::{path.name} has no source field, skipping") + continue + + files_data.append({ + "path": path, + "content": content, + "source": source + }) + + if not files_data: + print("::error::No valid files to process") + sys.exit(1) + + total = len(files_data) + print(f"Processing {total} files with {MAX_WORKERS} workers...", flush=True) + + changed_count = 0 + unchanged_count = 0 + completed = 0 + + # Process files in parallel + with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + futures = {executor.submit(process_file, fd): fd for fd in files_data} + + for future in as_completed(futures): + result = future.result() + completed += 1 + progress = f"[{completed}/{total}]" + path = result["path"] + + if result.get("error"): + # Handle 500 errors as warnings + if result.get("is_500"): + print(f"::warning file={path}::{progress} {path.name} skipped - API returned 500 error", flush=True) + unchanged_count += 1 + continue + else: + print(f"::error file={path}::{progress} {path.name} formatting failed: {result['error']}") + sys.exit(1) + + if result["changed"]: + changed_count += 1 + if args.check: + print(f"::error file={path}::{progress} {path.name} needs formatting", flush=True) + else: + new_content = replace_source(result["content"], result["formatted_source"]) + path.write_text(new_content) + print(f"{progress} {path.name} reformatted", flush=True) + else: + unchanged_count += 1 + print(f"{progress} {path.name} unchanged", flush=True) + + print(f"\n{'─' * 50}", flush=True) + if args.check: + if changed_count > 0: + print(f"::error::{changed_count} files need formatting, {unchanged_count} files OK") + sys.exit(1) + else: + print(f"All {unchanged_count} files are properly formatted") + else: + print(f"{changed_count} files reformatted, {unchanged_count} unchanged") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/sync_shared_samples.py b/.github/scripts/sync_shared_samples.py new file mode 100644 index 00000000000..e856f95d53c --- /dev/null +++ b/.github/scripts/sync_shared_samples.py @@ -0,0 +1,471 @@ +#!/usr/bin/env python3 +""" +Sync Shared Samples Script + +Syncs detection rules from open PRs to the shared-samples branch. 
+This script handles: +- File-based syncing to shared-samples branch +- PR# prefix in rule names +- Author tags and references +- Closed PR rule deletion via Sublime API (after delay) +- Bulk PR limits +""" +import os +import sys +from datetime import datetime, timedelta, timezone +from urllib.parse import quote + +import requests + +# Add the lib directory to path for imports +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from lib import ( + # Constants + DO_NOT_MERGE_LABEL, + BULK_PR_LABEL, + DEFAULT_MAX_RULES_PER_PR, + DEFAULT_DELETE_RULES_DELAY_DAYS, + DEFAULT_AUTHOR_TAG_PREFIX, + DEFAULT_RULE_STATUS_PREFIX, + DEFAULT_OPEN_PR_TAG, + # Functions + create_github_session, + has_label, + apply_label, + remove_label, + add_id_to_yaml, + add_block, + rename_rules, + get_file_contents, + save_file, + clean_output_folder, + count_yaml_rules_in_pr, +) + +# Configuration from environment +GITHUB_TOKEN = os.getenv('GITHUB_TOKEN') +SUBLIME_API_TOKEN = os.getenv('SUBLIME_API_TOKEN') +REPO_OWNER = os.getenv('REPO_OWNER', 'sublime-security') +REPO_NAME = os.getenv('REPO_NAME', 'sublime-rules') +OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', 'detection-rules') + +# Feature flags +ADD_AUTHOR_TAG = os.getenv('ADD_AUTHOR_TAG', 'true').lower() == 'true' +AUTHOR_TAG_PREFIX = os.getenv('AUTHOR_TAG_PREFIX', DEFAULT_AUTHOR_TAG_PREFIX) +ADD_RULE_STATUS_TAG = os.getenv('ADD_RULE_STATUS_TAG', 'true').lower() == 'true' +RULE_STATUS_PREFIX = os.getenv('RULE_STATUS_PREFIX', DEFAULT_RULE_STATUS_PREFIX) +ADD_PR_REFERENCE = os.getenv('ADD_PR_REFERENCE', 'true').lower() == 'true' +CREATE_OPEN_PR_TAG = os.getenv('CREATE_OPEN_PR_TAG', 'true').lower() == 'true' +OPEN_PR_TAG = os.getenv('OPEN_PR_TAG', DEFAULT_OPEN_PR_TAG) + +# File inclusion flags +INCLUDE_ADDED = os.getenv('INCLUDE_ADDED', 'true').lower() == 'true' +INCLUDE_UPDATES = os.getenv('INCLUDE_UPDATES', 'true').lower() == 'true' + +# Closed PR handling +DELETE_RULES_FROM_CLOSED_PRS = os.getenv('DELETE_RULES_FROM_CLOSED_PRS', 'true').lower() == 'true' +DELETE_RULES_FROM_CLOSED_PRS_DELAY = int(os.getenv('DELETE_RULES_FROM_CLOSED_PRS_DELAY', str(DEFAULT_DELETE_RULES_DELAY_DAYS))) + +# Bulk PR limits +SKIP_BULK_PRS = os.getenv('SKIP_BULK_PRS', 'true').lower() == 'true' +MAX_RULES_PER_PR = int(os.getenv('MAX_RULES_PER_PR', str(DEFAULT_MAX_RULES_PER_PR))) + +# Create output folder if it doesn't exist +if not os.path.exists(OUTPUT_FOLDER): + os.makedirs(OUTPUT_FOLDER) + + +def search_sublime_rule_feed(rule_name): + """ + Search for rules in the Sublime rule feed by name. + + Args: + rule_name (str): Rule name to search for + + Returns: + dict: Search results or None on error + """ + # Strip quotes for searching + rule_name = rule_name.strip("\"'") + rule_name = quote(rule_name) + url = f"https://platform.sublime.security/v0/rules?limit=50&offset=0&search={rule_name}" + + headers = { + "accept": "application/json", + "authorization": f"Bearer {SUBLIME_API_TOKEN}" + } + try: + response = requests.get(url, headers=headers) + response.raise_for_status() + except requests.exceptions.HTTPError as err: + print(f"HTTP error occurred: {err}") + return None + except requests.exceptions.ConnectionError as err: + print(f"Connection error occurred: {err}") + return None + else: + print(f"\tSearch Feed Response Code: {response.status_code}") + response = response.json() + print(f"\tSearch Feed Found Count: {response['count']}") + return response + + +def sublime_delete_rule(rule_id): + """ + Delete a rule from the Sublime platform. 
+ + Args: + rule_id (str): Rule ID to delete + + Returns: + bool: True if deletion was successful + """ + url = f"https://platform.sublime.security/v0/rules/{rule_id}" + + headers = { + "accept": "application/json", + "authorization": f"Bearer {SUBLIME_API_TOKEN}" + } + response = requests.delete(url, headers=headers) + + print(f"\tDelete Rule Response Code: {response.status_code}") + + return response.ok + + +def get_open_pull_requests(session): + """Fetch all open pull requests from the repository.""" + pull_requests = [] + page = 1 + per_page = 30 + + while True: + url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls' + params = {'page': page, 'per_page': per_page, 'sort': 'updated', 'direction': 'desc'} + print(f"Fetching page {page} of Pull Requests") + response = session.get(url, params=params) + response.raise_for_status() + + pull_requests.extend(response.json()) + + if 'Link' in response.headers: + links = response.headers['Link'].split(', ') + has_next = any('rel="next"' in link for link in links) + else: + has_next = False + + if not has_next: + print(f"Fetched page {page} of Pull Requests") + print(f"PRs on page {page}: {len(response.json())}") + break + + print(f"Fetched page {page} of Pull Requests") + print(f"PRs on page {page}: {len(response.json())}") + print(f"PRs found so far: {len(pull_requests)}") + print(f"Moving to page {page + 1}") + page += 1 + + print(f"Total PRs: {len(pull_requests)}") + return pull_requests + + +def get_closed_pull_requests(session): + """Fetch recently closed pull requests from the repository.""" + closed_pull_requests = [] + page = 1 + per_page = 30 + max_closed = 60 + + while len(closed_pull_requests) <= max_closed: + if len(closed_pull_requests) >= max_closed: + print("hit max closed prs length") + break + + url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls' + params = {'page': page, 'per_page': per_page, 'state': 'closed', 'sort': 'updated', 'direction': 'desc'} + print(f"Fetching page {page} of CLOSED Pull Requests") + response = session.get(url, params=params) + response.raise_for_status() + + closed_pull_requests.extend(response.json()) + + if 'Link' in response.headers: + links = response.headers['Link'].split(', ') + has_next = any('rel="next"' in link for link in links) + else: + has_next = False + + if not has_next: + print(f"Fetched page {page} of Pull Requests") + print(f"PRs on page {page}: {len(response.json())}") + break + + print(f"Fetched page {page} of CLOSED Pull Requests") + print(f"CLOSED PRs on page {page}: {len(response.json())}") + print(f"CLOSED PRs found so far: {len(closed_pull_requests)}") + print(f"Moving to page {page + 1}") + page += 1 + + print(f"Total CLOSED PRs: {len(closed_pull_requests)}") + return closed_pull_requests + + +def get_files_for_pull_request(session, pr_number): + """Fetch files changed in a pull request.""" + url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls/{pr_number}/files' + response = session.get(url) + response.raise_for_status() + return response.json() + + +def handle_closed_prs(session): + """ + Handle closed PRs by deleting rules from closed PRs after a delay period. + Uses comprehensive search by PR number pattern to catch all rules including orphaned ones. 
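Example of the delay gate applied to merged PRs (dates are hypothetical; the delay defaults to DEFAULT_DELETE_RULES_DELAY_DAYS, i.e. 3 days):

    >>> from datetime import datetime, timedelta, timezone
    >>> merged_at = datetime(2026, 1, 10, tzinfo=timezone.utc)
    >>> now = datetime(2026, 1, 16, tzinfo=timezone.utc)
    >>> merged_at <= now - timedelta(days=3)  # old enough, so its rules may be deleted
    True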
+ + Returns: + set: Set of rule IDs that were deleted + """ + if not DELETE_RULES_FROM_CLOSED_PRS: + return set() + + closed_pr_header = [ + ' _____ _ _ ______ _ _ ______ _ ', + '/ __ \\ | | | | ___ \\ | | | | ___ \\ | | ', + '| / \\/ | ___ ___ ___ __| | | |_/ / _| | | | |_/ /___ __ _ _ _ ___ ___| |_ ___ ', + '| | | |/ _ \\/ __|/ _ \\/ _\' | | __/ | | | | | | // _ \\/ _\' | | | |/ _ \\/ __| __/ __|', + '| \\__/\\ | (_) \\__ \\ __/ (_| | | | | |_| | | | | |\\ \\ __/ (_| | |_| | __/\\__ \\ |_\\__ \\', + ' \\____/_|\\___/|___/\\___|\\__,_| \\_| \\__,_|_|_| \\_| \\_\\___|\\__, |\\__,_|\\___||___/\\__|___/', + ' | | ', + ' |_| ', + ] + + for line in closed_pr_header: + print(line) + + deleted_ids = set() + closed_pull_requests = get_closed_pull_requests(session) + + for closed_pr in closed_pull_requests: + pr_number = closed_pr['number'] + print(f"Processing CLOSED PR #{pr_number}: {closed_pr['title']}") + + if closed_pr['base']['ref'] != "main": + print(f"\tSkipping non-main branch PR #{closed_pr['number']}: {closed_pr['title']} -- dest branch: {closed_pr['base']['ref']}") + continue + + # Check delay for merged PRs + if closed_pr['merged_at'] is not None: + merged_at_time = datetime.strptime(closed_pr['merged_at'], "%Y-%m-%dT%H:%M:%SZ").replace( + tzinfo=timezone.utc) + + if not merged_at_time <= datetime.now(tz=timezone.utc) - timedelta(days=DELETE_RULES_FROM_CLOSED_PRS_DELAY): + time_remaining = (merged_at_time + timedelta(days=DELETE_RULES_FROM_CLOSED_PRS_DELAY)) - datetime.now(tz=timezone.utc) + + remaining_days = time_remaining.days + remaining_hours, remaining_remainder = divmod(time_remaining.seconds, 3600) + remaining_minutes, remaining_seconds = divmod(remaining_remainder, 60) + + print(f"\tDELAY NOT MET: Skipping PR #{closed_pr['number']}: {closed_pr['title']}\n\tRemaining Time = {remaining_days} days, {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds") + continue + + # Search for all rules with this PR number pattern + pr_search_pattern = f"PR# {pr_number} - " + print(f"\tSearching for all rules with pattern: '{pr_search_pattern}'") + + found_rules = search_sublime_rule_feed(pr_search_pattern) + if found_rules is None: + print(f"\tError searching for rules with pattern '{pr_search_pattern}' for PR#{pr_number}") + continue + + print(f"\tFound {found_rules['count']} rules matching PR pattern") + + # Process all found rules + for found_rule in found_rules.get('rules', []): + rule_name = found_rule.get('name', '') + rule_id = found_rule.get('id', '') + + # Verify this rule actually belongs to this PR + if not rule_name.startswith(pr_search_pattern): + print(f"\tSkipping rule '{rule_name}' - doesn't match expected pattern") + continue + + print(f"\tEvaluating rule: {rule_name}") + + # Verify this rule has the expected tags + rule_tags = found_rule.get('tags', []) + + # Check for the open PR tag + if CREATE_OPEN_PR_TAG and OPEN_PR_TAG not in rule_tags: + print(f"\t\tSkipping rule - missing required tag '{OPEN_PR_TAG}'") + continue + + # Check for the author tag if enabled + if ADD_AUTHOR_TAG: + expected_author_tag = f"{AUTHOR_TAG_PREFIX}{closed_pr['user']['login']}" + if expected_author_tag not in rule_tags: + print(f"\t\tSkipping rule - missing expected author tag '{expected_author_tag}'") + print(f"\t\tRule tags: {rule_tags}") + continue + + # All checks passed - delete this rule + print(f"\t\tRule matches all criteria - deleting rule ID: {rule_id}") + deleted = sublime_delete_rule(rule_id) + if deleted: + print(f"\t\tDELETED rule: {rule_id}") + 
deleted_ids.add(rule_id) + else: + print(f"\t\tERROR DELETING rule: {rule_id}") + + print(f"Deleted {len(deleted_ids)} Rules from Closed PRs:") + for deleted_id in deleted_ids: + print(f"\t{deleted_id}") + + return deleted_ids + + +def handle_pr_rules(session): + """ + Process open PRs to sync rules to shared-samples branch. + + Returns: + set: Set of filenames that were processed + """ + header = [ + ' _____ ______ _ _ ______ _ ', + '| _ | | ___ \\ | | | | ___ \\ | | ', + '| | | |_ __ ___ _ __ | |_/ / _| | | | |_/ /___ __ _ _ _ ___ ___| |_ ___ ', + '| | | | \'_ \\ / _ \\ \'_ \\ | __/ | | | | | | // _ \\/ _\' | | | |/ _ \\/ __| __/ __|', + '\\ \\_/ / |_) | __/ | | | | | | |_| | | | | |\\ \\ __/ (_| | |_| | __/\\__ \\ |_\\__ \\', + ' \\___/| .__/ \\___|_| |_| \\_| \\__,_|_|_| \\_| \\_\\___|\\__, |\\__,_|\\___||___/\\__|___/', + ' | | | | ', + ' |_| |_| ', + ] + + for line in header: + print(line) + + pull_requests = get_open_pull_requests(session) + new_files = set() + + for pr in pull_requests: + pr_number = pr['number'] + + # Check for do-not-merge label - skip entirely if present + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): + print(f"Skipping PR #{pr_number} (has '{DO_NOT_MERGE_LABEL}' label): {pr['title']}") + continue + + # Skip draft PRs + if pr['draft']: + print(f"Skipping draft PR #{pr_number}: {pr['title']}") + continue + + # Skip non-main PRs + if pr['base']['ref'] != 'main': + print(f"Skipping non-main branch PR #{pr_number}: {pr['title']} -- dest branch: {pr['base']['ref']}") + continue + + print(f"Processing PR #{pr_number}: {pr['title']}") + + # Get the latest commit SHA + latest_sha = pr['head']['sha'] + print(f"\tLatest commit SHA: {latest_sha}") + + files = get_files_for_pull_request(session, pr_number) + + # Check if PR has too many rules + if SKIP_BULK_PRS: + yaml_rule_count = count_yaml_rules_in_pr(files) + if yaml_rule_count > MAX_RULES_PER_PR: + print(f"\tSkipping PR #{pr_number}: Contains {yaml_rule_count} YAML rules (max allowed: {MAX_RULES_PER_PR})") + + # Apply bulk label if not already present + if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + print(f"\tPR #{pr_number} doesn't have the '{BULK_PR_LABEL}' label. 
Applying...") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + + continue + else: + # Remove bulk label if rule count is now under limit + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + + # Process files in the PR + for file in files: + print(f"\tStatus of {file['filename']}: {file['status']}") + process_file = False + + # Check file type and status + if (file['status'] in ['added', 'modified', 'changed'] and + file['filename'].startswith('detection-rules/') and + file['filename'].endswith('.yml')): + if file['status'] == "added" and INCLUDE_ADDED: + process_file = True + elif file['status'] in ['modified', 'changed'] and INCLUDE_UPDATES: + process_file = True + else: + print(f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr_number} -- INCLUDE_UPDATES == {INCLUDE_UPDATES}, INCLUDE_ADDED == {INCLUDE_ADDED}") + else: + print(f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr_number} -- unmanaged file status") + + if process_file: + # Fetch file content + content = get_file_contents( + session, REPO_OWNER, REPO_NAME, + file['filename'], latest_sha + ) + + # Process the file + target_save_filename = f"{pr_number}_{os.path.basename(file['filename'])}" + + # Get modified content and original ID + modified_content, original_id = add_id_to_yaml(content, target_save_filename) + + # Add author tag if enabled + if ADD_AUTHOR_TAG: + modified_content = add_block(modified_content, 'tags', f"{AUTHOR_TAG_PREFIX}{pr['user']['login']}") + + # Add open PR tag if enabled + if CREATE_OPEN_PR_TAG: + modified_content = add_block(modified_content, 'tags', OPEN_PR_TAG) + + # Add rule status tag if enabled + if ADD_RULE_STATUS_TAG: + modified_content = add_block(modified_content, 'tags', f"{RULE_STATUS_PREFIX}{file['status']}") + + # Add PR reference if enabled + if ADD_PR_REFERENCE: + modified_content = add_block(modified_content, 'references', pr['html_url']) + + # Always rename rules with PR# prefix (required for handle_closed_prs) + modified_content = rename_rules(modified_content, pr) + + # Save the file + save_file(OUTPUT_FOLDER, target_save_filename, modified_content) + new_files.add(target_save_filename) + print(f"\tSaved: {target_save_filename}") + + # Clean up files no longer in open PRs + clean_output_folder(OUTPUT_FOLDER, new_files) + return new_files + + +if __name__ == '__main__': + sublime_header = [ + ' ______ __ __ ______ __ __ __ __ ______ ', + '/\\ ___\\ /\\ \\ /\\ \\ /\\ == \\ /\\ \\ /\\ \\ /\\ "-./ \\ /\\ ___\\ ', + '\\ \\___ \\ \\ \\ \\_\\ \\ \\ \\ __< \\ \\ \\____ \\ \\ \\ \\ \\ \\-./\\ \\ \\ \\ __\\ ', + ' \\/\\_____\\ \\ \\_____\\ \\ \\_____\\ \\ \\_____\\ \\ \\_\\ \\ \\_\\ \\ \\_\\ \\ \\_____\\ ', + ' \\/_____/ \\/_____/ \\/_____/ \\/_____/ \\/_/ \\/_/ \\/_/ \\/_____/ ', + ' ', + ] + + for line in sublime_header: + print(line) + + print("Running shared-samples sync...") + session = create_github_session(GITHUB_TOKEN) + handle_pr_rules(session) + handle_closed_prs(session) diff --git a/.github/scripts/sync_test_rules.py b/.github/scripts/sync_test_rules.py new file mode 100644 index 00000000000..3da881b3078 --- /dev/null +++ b/.github/scripts/sync_test_rules.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python3 +""" +Sync Test Rules Script + +Syncs detection rules from open PRs to the test-rules branch. 
+This script handles test-rules specific logic including: +- Draft PR handling with label/comment triggers +- Organization membership filtering +- Bulk PR limits +- Link analysis exclusions +- PR commenting for exclusions +""" +import os +import sys + +# Add the lib directory to path for imports +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from lib import ( + # Constants + IN_TEST_RULES_LABEL, + AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, + MANUAL_EXCLUSION_LABEL, + BULK_PR_LABEL, + DO_NOT_MERGE_LABEL, + SKIP_TEXTS, + DEFAULT_ORG_NAME, + DEFAULT_COMMENT_TRIGGER, + DEFAULT_MAX_RULES_PER_PR, + DEFAULT_REQUIRED_CHECK_NAME, + DEFAULT_REQUIRED_CHECK_CONCLUSION, + # Functions + create_github_session, + has_label, + apply_label, + remove_label, + is_user_in_org, + has_trigger_comment, + has_required_action_completed, + check_skip_texts, + add_id_to_yaml, + add_block, + get_file_contents, + save_file, + pr_has_synced_files, + clean_output_folder, + count_yaml_rules_in_pr, + post_exclusion_comment_if_needed, +) + +# Configuration from environment +GITHUB_TOKEN = os.getenv('GITHUB_TOKEN') +REPO_OWNER = os.getenv('REPO_OWNER', 'sublime-security') +REPO_NAME = os.getenv('REPO_NAME', 'sublime-rules') +OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', 'detection-rules') + +# Test-rules specific configuration +FILTER_BY_ORG_MEMBERSHIP = os.getenv('FILTER_BY_ORG_MEMBERSHIP', 'true').lower() == 'true' +ORG_NAME = os.getenv('ORG_NAME', DEFAULT_ORG_NAME) +INCLUDE_PRS_WITH_COMMENT = os.getenv('INCLUDE_PRS_WITH_COMMENT', 'true').lower() == 'true' +COMMENT_TRIGGER = os.getenv('COMMENT_TRIGGER', DEFAULT_COMMENT_TRIGGER) + +# File filtering +SKIP_FILES_WITH_TEXT = os.getenv('SKIP_FILES_WITH_TEXT', 'true').lower() == 'true' + +# Bulk PR limits +SKIP_BULK_PRS = os.getenv('SKIP_BULK_PRS', 'true').lower() == 'true' +MAX_RULES_PER_PR = int(os.getenv('MAX_RULES_PER_PR', str(DEFAULT_MAX_RULES_PER_PR))) + +# Action completion checks +CHECK_ACTION_COMPLETION = os.getenv('CHECK_ACTION_COMPLETION', 'true').lower() == 'true' +REQUIRED_CHECK_NAME = os.getenv('REQUIRED_CHECK_NAME', DEFAULT_REQUIRED_CHECK_NAME) +REQUIRED_CHECK_CONCLUSION = os.getenv('REQUIRED_CHECK_CONCLUSION', DEFAULT_REQUIRED_CHECK_CONCLUSION) + +# Labeling +ADD_TEST_RULES_LABEL = os.getenv('ADD_TEST_RULES_LABEL', 'true').lower() == 'true' + +# Feature flags from original script (all disabled for test-rules mode) +INCLUDE_ADDED = os.getenv('INCLUDE_ADDED', 'true').lower() == 'true' +INCLUDE_UPDATES = os.getenv('INCLUDE_UPDATES', 'true').lower() == 'true' + +# Create output folder if it doesn't exist +if not os.path.exists(OUTPUT_FOLDER): + os.makedirs(OUTPUT_FOLDER) + + +def get_open_pull_requests(session): + """Fetch all open pull requests from the repository.""" + pull_requests = [] + page = 1 + per_page = 30 + + while True: + url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls' + params = {'page': page, 'per_page': per_page, 'sort': 'updated', 'direction': 'desc'} + print(f"Fetching page {page} of Pull Requests") + response = session.get(url, params=params) + response.raise_for_status() + + pull_requests.extend(response.json()) + + if 'Link' in response.headers: + links = response.headers['Link'].split(', ') + has_next = any('rel="next"' in link for link in links) + else: + has_next = False + + if not has_next: + print(f"Fetched page {page} of Pull Requests") + print(f"PRs on page {page}: {len(response.json())}") + break + + print(f"Fetched page {page} of Pull Requests") + print(f"PRs on page {page}: {len(response.json())}") + 
print(f"PRs found so far: {len(pull_requests)}") + print(f"Moving to page {page + 1}") + page += 1 + + print(f"Total PRs: {len(pull_requests)}") + return pull_requests + + +def get_files_for_pull_request(session, pr_number): + """Fetch files changed in a pull request.""" + url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls/{pr_number}/files' + response = session.get(url) + response.raise_for_status() + return response.json() + + +def handle_pr_rules(session): + """ + Process open PRs to sync rules to test-rules branch. + + Returns: + set: Set of filenames that were processed + """ + header = [ + ' _____ _ ______ _ ', + '|_ _| | | | ___ \\ | | ', + ' | | ___ ___| |_ | |_/ / _| | ___ ___ ', + ' | |/ _ \\/ __| __| | / | | | |/ _ \\/ __|', + ' | | __/\\__ \\ |_ | |\\ \\ |_| | | __/\\__ \\', + ' \\_/\\___||___/\\__| \\_| \\_\\__,_|_|\\___||___/', + ' ', + ] + + for line in header: + print(line) + + pull_requests = get_open_pull_requests(session) + new_files = set() + + for pr in pull_requests: + pr_number = pr['number'] + + # Check for do-not-merge label first - skip entirely if present + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): + print(f"Skipping PR #{pr_number} (has '{DO_NOT_MERGE_LABEL}' label): {pr['title']}") + continue + + # Draft PR handling + if pr['draft']: + # Process drafts if they have in-test-rules label OR trigger comment + has_in_test_rules = has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + has_comment = False + + if INCLUDE_PRS_WITH_COMMENT and not has_in_test_rules: + # Check for trigger comment from org member + has_comment = has_trigger_comment( + session, REPO_OWNER, REPO_NAME, pr_number, ORG_NAME, COMMENT_TRIGGER + ) + if has_comment: + # Apply the in-test-rules label since trigger comment was found + print(f"\tDraft PR #{pr_number} has trigger comment, applying '{IN_TEST_RULES_LABEL}' label") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + + if has_in_test_rules or has_comment: + print(f"Processing draft PR #{pr_number} (has '{IN_TEST_RULES_LABEL}' label or trigger comment): {pr['title']}") + else: + print(f"Skipping draft PR #{pr_number}: {pr['title']}") + continue + + # Skip PRs not targeting main + if pr['base']['ref'] != 'main': + print(f"Skipping non-main branch PR #{pr_number}: {pr['title']} -- dest branch: {pr['base']['ref']}") + continue + + # Check for manual exclusion label (user opted out of test-rules) + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL): + print(f"Skipping manually excluded PR #{pr_number}: {pr['title']}") + # Remove in-test-rules label if both are present (manual exclusion takes precedence) + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + print(f"\tRemoving '{IN_TEST_RULES_LABEL}' label since manual exclusion takes precedence") + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + continue + + # Check if user removed the in-test-rules label (opt-out) + if pr_has_synced_files(OUTPUT_FOLDER, pr_number) and not has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + print(f"PR #{pr_number} has synced files but '{IN_TEST_RULES_LABEL}' label was removed - applying manual exclusion") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL) + continue + + # Organization membership and comment trigger checks + process_pr = True + print(f"Processing PR #{pr_number}: {pr['title']}") + + if 
FILTER_BY_ORG_MEMBERSHIP: + author_in_org = is_user_in_org(session, pr['user']['login'], ORG_NAME) + has_comment = False + + if author_in_org: + print(f"\tPR #{pr_number}: Author {pr['user']['login']} is in {ORG_NAME}") + # Remove exclusion label if present + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) + else: + # Check for trigger comment if author not in org + if INCLUDE_PRS_WITH_COMMENT: + has_comment = has_trigger_comment( + session, REPO_OWNER, REPO_NAME, pr_number, ORG_NAME, COMMENT_TRIGGER + ) + + # If trigger comment was found, remove the exclusion label + if has_comment and has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): + print(f"\tPR #{pr_number}: Removing '{AUTHOR_MEMBERSHIP_EXCLUSION_LABEL}' label due to trigger comment") + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) + + if not has_comment: + print(f"\tSkipping PR #{pr_number}: Author {pr['user']['login']} is not in {ORG_NAME} and is missing comment trigger") + + # Apply exclusion label if not already present + if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): + print(f"\tPR #{pr_number} doesn't have the '{AUTHOR_MEMBERSHIP_EXCLUSION_LABEL}' label. Applying...") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) + # Post comment explaining how to enable sync + post_exclusion_comment_if_needed( + session, REPO_OWNER, REPO_NAME, pr_number, + AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, + org_name=ORG_NAME, + comment_trigger=COMMENT_TRIGGER + ) + + # Remove in-test-rules label if previously applied + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + + process_pr = False + + if not process_pr: + continue + + # Get the latest commit SHA + latest_sha = pr['head']['sha'] + print(f"\tLatest commit SHA: {latest_sha}") + + # Check if required checks have completed + if CHECK_ACTION_COMPLETION: + if not has_required_action_completed( + session, REPO_OWNER, REPO_NAME, latest_sha, + REQUIRED_CHECK_NAME, REQUIRED_CHECK_CONCLUSION + ): + print(f"\tSkipping PR #{pr_number}: Required check '{REQUIRED_CHECK_NAME}' has not completed with conclusion '{REQUIRED_CHECK_CONCLUSION}'") + # Remove in-test-rules label if previously applied + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + continue + + files = get_files_for_pull_request(session, pr_number) + + # Check if PR has too many rules + if SKIP_BULK_PRS: + yaml_rule_count = count_yaml_rules_in_pr(files) + if yaml_rule_count > MAX_RULES_PER_PR: + print(f"\tSkipping PR #{pr_number}: Contains {yaml_rule_count} YAML rules (max allowed: {MAX_RULES_PER_PR})") + + # Apply label if not already present + if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + print(f"\tPR #{pr_number} doesn't have the '{BULK_PR_LABEL}' label. 
Applying...") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + # Post comment explaining the limit + post_exclusion_comment_if_needed( + session, REPO_OWNER, REPO_NAME, pr_number, + BULK_PR_LABEL, + max_rules=MAX_RULES_PER_PR, + rule_count=yaml_rule_count + ) + + # Remove in-test-rules label if previously applied + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + + continue + else: + # Remove bulk label if rule count is now under limit + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + + # Process files in the PR + for file in files: + print(f"\tStatus of {file['filename']}: {file['status']}") + process_file = False + + # Check file type and status + if (file['status'] in ['added', 'modified', 'changed'] and + file['filename'].startswith('detection-rules/') and + file['filename'].endswith('.yml')): + if file['status'] == "added" and INCLUDE_ADDED: + process_file = True + elif file['status'] in ['modified', 'changed'] and INCLUDE_UPDATES: + process_file = True + else: + print(f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr_number} -- INCLUDE_UPDATES == {INCLUDE_UPDATES}, INCLUDE_ADDED == {INCLUDE_ADDED}") + else: + print(f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr_number} -- unmanaged file status") + + if process_file: + # Fetch file content + content = get_file_contents( + session, REPO_OWNER, REPO_NAME, + file['filename'], latest_sha + ) + + # Skip files with specific text patterns + if SKIP_FILES_WITH_TEXT and SKIP_TEXTS: + matched_texts, labels_to_apply = check_skip_texts(content, SKIP_TEXTS) + if matched_texts: + print(f"\tSkipping file {file['filename']}: contains texts {matched_texts}") + + # Apply all associated labels + for label in labels_to_apply: + if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, label): + print(f"\tPR #{pr_number} doesn't have the '{label}' label. 
Applying...") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, label) + + # Post comment for link_analysis exclusion + from lib.constants import LINK_ANALYSIS_EXCLUSION_LABEL + if LINK_ANALYSIS_EXCLUSION_LABEL in labels_to_apply: + post_exclusion_comment_if_needed( + session, REPO_OWNER, REPO_NAME, pr_number, + LINK_ANALYSIS_EXCLUSION_LABEL + ) + + # Remove in-test-rules label + if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + continue + + # Process the file + target_save_filename = f"{pr_number}_{os.path.basename(file['filename'])}" + + # Get modified content and original ID + modified_content, original_id = add_id_to_yaml(content, target_save_filename) + + # Add test-rules specific fields + # Store the original id + if original_id: + modified_content = modified_content.rstrip() + modified_content += f'\nog_id: "{original_id}"' + + # Add the PR number as testing_pr + modified_content = modified_content.rstrip() + modified_content += f"\ntesting_pr: {pr_number}" + + # Add the commit SHA as testing_sha + modified_content = modified_content.rstrip() + modified_content += f"\ntesting_sha: {latest_sha}" + + # Save the file + save_file(OUTPUT_FOLDER, target_save_filename, modified_content) + new_files.add(target_save_filename) + print(f"\tSaved: {target_save_filename}") + + # Apply the in-test-rules label + if ADD_TEST_RULES_LABEL: + if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + print(f"\tPR #{pr_number} doesn't have the '{IN_TEST_RULES_LABEL}' label. Applying...") + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + + # Clean up files no longer in open PRs + clean_output_folder(OUTPUT_FOLDER, new_files) + return new_files + + +if __name__ == '__main__': + sublime_header = [ + ' ______ __ __ ______ __ __ __ __ ______ ', + '/\\ ___\\ /\\ \\ /\\ \\ /\\ == \\ /\\ \\ /\\ \\ /\\ "-./ \\ /\\ ___\\ ', + '\\ \\___ \\ \\ \\ \\_\\ \\ \\ \\ __< \\ \\ \\____ \\ \\ \\ \\ \\ \\-./\\ \\ \\ \\ __\\ ', + ' \\/\\_____\\ \\ \\_____\\ \\ \\_____\\ \\ \\_____\\ \\ \\_\\ \\ \\_\\ \\ \\_\\ \\ \\_____\\ ', + ' \\/_____/ \\/_____/ \\/_____/ \\/_____/ \\/_/ \\/_/ \\/_/ \\/_____/ ', + ' ', + ] + + for line in sublime_header: + print(line) + + print("Running test-rules sync...") + session = create_github_session(GITHUB_TOKEN) + handle_pr_rules(session) diff --git a/.github/workflows/rule-validate.yml b/.github/workflows/rule-validate.yml index 216cc545930..33679a0b4f4 100644 --- a/.github/workflows/rule-validate.yml +++ b/.github/workflows/rule-validate.yml @@ -114,32 +114,22 @@ jobs: with: python-version: '3.10' - - name: Checkout scripts from Sublime fork main - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: sublime-security/sublime-rules - ref: main - path: sublime-rules-main - - name: Check for invisible characters if: github.event_name != 'issue_comment' - run: python sublime-rules-main/scripts/check_invisible_chars.py + run: python .github/scripts/check_invisible_chars.py - name: Add Rule IDs as Needed & Check for Duplicates if: github.event_name != 'issue_comment' # Run before testing, just in case this could invalidate the rule itself run: | - pip install -r sublime-rules-main/scripts/generate-rule-ids/requirements.txt - python sublime-rules-main/scripts/generate-rule-ids/main.py + pip install -r .github/scripts/generate_rule_ids/requirements.txt + python 
.github/scripts/generate_rule_ids/main.py - name: Auto-format MQL if: github.event_name != 'issue_comment' run: | pip install -q requests pyyaml - python sublime-rules-main/scripts/mql_format.py detection-rules/*.yml insights/**/*.yml - - # Delete path to prevent interference with later steps (such as git add and commit) - rm -r sublime-rules-main + python .github/scripts/mql_format.py detection-rules/*.yml insights/**/*.yml - name: Validate Rules if: github.event_name != 'issue_comment' diff --git a/.github/workflows/sync-shared-samples.yml b/.github/workflows/sync-shared-samples.yml new file mode 100644 index 00000000000..569318a5335 --- /dev/null +++ b/.github/workflows/sync-shared-samples.yml @@ -0,0 +1,152 @@ +name: Sync Shared Samples + +on: + schedule: + - cron: '*/10 * * * *' + workflow_dispatch: # Allow manual triggering + +env: + PYTHON_VERSION: "3.13" + +jobs: + sync-shared-samples: + runs-on: ubuntu-latest + permissions: + contents: write + checks: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: pip install requests + + - name: Create and switch to shared-samples branch + run: | + git fetch --all + if git show-ref --quiet refs/remotes/origin/shared-samples; then + git checkout shared-samples + git pull origin shared-samples + else + git checkout -b shared-samples + fi + + - name: Run the sync script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SUBLIME_API_TOKEN: ${{ secrets.SUBLIME_API_TOKEN }} + # Enable bulk PR limits + SKIP_BULK_PRS: 'true' + MAX_RULES_PER_PR: '10' + # Enable all standard features + ADD_AUTHOR_TAG: 'true' + ADD_RULE_STATUS_TAG: 'true' + ADD_PR_REFERENCE: 'true' + CREATE_OPEN_PR_TAG: 'true' + # Enable rule deletion for closed PRs + DELETE_RULES_FROM_CLOSED_PRS: 'true' + DELETE_RULES_FROM_CLOSED_PRS_DELAY: '3' + run: python .github/scripts/sync_shared_samples.py + + - name: Commit changes + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + + # First check if there are any changes + if [[ -z $(git status --porcelain detection-rules) ]]; then + echo "No changes to detection rules" + exit 0 + fi + + # Get list of modified files + FILES=$(git status --porcelain detection-rules | awk '{print $2}') + + # Track stats for the summary + ADDED_COUNT=0 + MODIFIED_COUNT=0 + DELETED_COUNT=0 + UPDATED_PRS=() + + # Process each file individually + for FILE in $FILES; do + echo "Processing $FILE" + # Skip non-rule files + if [[ ! "$FILE" =~ .*\.yml$ ]]; then + continue + fi + + BASENAME=$(basename "$FILE") + + # Extract PR number from filename (format: PR_NUMBER_filename.yml) + PR_NUMBER=$(echo "$BASENAME" | grep -o "^[0-9]*") + UPDATED_PRS+=("$PR_NUMBER") + + echo "Processing $BASENAME from $PR_NUMBER for inclusion" + + # Handle removed files + if [[ ! -f "$FILE" ]]; then + echo "$BASENAME was deleted, commiting deletion." 
+ DELETED_COUNT=$((DELETED_COUNT+1)) + git add "$FILE" + git commit -m "[PR #${PR_NUMBER}] Delete detection rule" + continue + fi + + # Ensure that new files are tracked (otherwise they won't show up in diff) + git add -N "$FILE" + + # Check for changes + DIFF_OUTPUT=$(git diff HEAD -- "$FILE") + + # Skip files with no changes at all + if [[ -z "$DIFF_OUTPUT" ]]; then + echo "\tSkipping $FILE: no changes" + continue + fi + + # Determine status directly from git + GIT_STATUS=$(git status --porcelain "$FILE" | cut -c1-2 | tr -d ' ') + if [[ "$GIT_STATUS" == "A" ]]; then + ADDED_COUNT=$((ADDED_COUNT+1)) + STATUS="added" + elif [[ "$GIT_STATUS" == "M" ]]; then + MODIFIED_COUNT=$((MODIFIED_COUNT+1)) + STATUS="modified" + else + MODIFIED_COUNT=$((MODIFIED_COUNT+1)) + STATUS="changed" + fi + + # Extract rule name and commit + RULE_NAME=$(grep -m 1 "name:" "$FILE" | sed 's/name: //' | sed 's/^"//' | sed 's/"$//' | sed "s/^'//" | sed "s/'$//") + + # Build commit message + COMMIT_MSG="[PR #${PR_NUMBER}] ${STATUS} rule: ${RULE_NAME}" + + # Add file and commit + git add "$FILE" + git commit -m "$COMMIT_MSG" + done + + # Get unique PR count + UNIQUE_PRS=$(printf '%s\n' "${UPDATED_PRS[@]}" | sort -u | wc -l) + + # Create summary + echo "Shared Samples Sync Summary ($(date '+%Y-%m-%d %H:%M:%S'))" + echo "- PRs processed: ${UNIQUE_PRS}" + echo "- Rules added: ${ADDED_COUNT}" + echo "- Rules modified: ${MODIFIED_COUNT}" + echo "- Rules deleted: ${DELETED_COUNT}" + + # Push changes + git push diff --git a/.github/workflows/update-test-rules.yml b/.github/workflows/sync-test-rules.yml similarity index 77% rename from .github/workflows/update-test-rules.yml rename to .github/workflows/sync-test-rules.yml index a088274b396..74b237487b3 100644 --- a/.github/workflows/update-test-rules.yml +++ b/.github/workflows/sync-test-rules.yml @@ -1,13 +1,15 @@ -name: Update Test Rules +name: Sync Test Rules on: schedule: - cron: '*/10 * * * *' workflow_dispatch: # Allow manual triggering + env: - PYTHON_VERSION: "3.13" + PYTHON_VERSION: "3.13" + jobs: - update-test-rules: + sync-test-rules: runs-on: ubuntu-latest permissions: contents: write @@ -36,48 +38,22 @@ jobs: git checkout -b test-rules fi - - name: Get latest script from main branch - run: | - # Checkout the scripts directory from main branch without switching branches - git checkout origin/main -- scripts/ - - # Unstage the script changes - we only want to commit detection rule changes - git reset HEAD scripts/ || true - - - name: Run the script + - name: Run the sync script env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - SCRIPT_MODE: 'test-rules' # Configure test-rules specific settings FILTER_BY_ORG_MEMBERSHIP: 'true' ORG_NAME: 'sublime-security' INCLUDE_PRS_WITH_COMMENT: 'true' COMMENT_TRIGGER: '/update-test-rules' - # SKIP_FILES pattern and labls are managed within the script + # Skip files with specific text patterns SKIP_FILES_WITH_TEXT: 'true' # Skip PRs with too many rules SKIP_BULK_PRS: 'true' MAX_RULES_PER_PR: '10' - BULK_PR_LABEL: 'test-rules:excluded:bulk_rules' - # Disable adding tags that aren't useful - ADD_RULE_STATUS_TAG: 'false' - ADD_PR_REFERENCE: 'false' - INCLUDE_PR_IN_NAME: 'false' - ADD_AUTHOR_TAG: 'false' - CREATE_OPEN_PR_TAG: 'false' + # Labeling ADD_TEST_RULES_LABEL: 'true' - IN_TEST_RULES_LABEL: 'in-test-rules' - AUTHOR_MEMBERSHIP_EXCLUSION_LABEL: 'test-rules:excluded:author_membership' - MANUAL_EXCLUSION_LABEL: 'test-rules:excluded:manual' - - run: python scripts/sync_detection_rules.py - - - name: Clean up script changes - run: | - # 
Revert any script changes back to test-rules branch version - # This keeps the working directory clean - # this shouldn't be required, but is a good thing to do anyway - git checkout HEAD -- scripts/ || true + run: python .github/scripts/sync_test_rules.py - name: Commit changes env: @@ -85,38 +61,38 @@ jobs: run: | git config --global user.name 'github-actions[bot]' git config --global user.email 'github-actions[bot]@users.noreply.github.com' - + # First check if there are any changes if [[ -z $(git status --porcelain detection-rules) ]]; then echo "No changes to detection rules" exit 0 fi - + # Get list of modified files FILES=$(git status --porcelain detection-rules | awk '{print $2}') - + # Track stats for the summary ADDED_COUNT=0 MODIFIED_COUNT=0 DELETED_COUNT=0 UPDATED_PRS=() - + # Process each file individually for FILE in $FILES; do echo "Processing $FILE" - # Skip non-rule files + # Skip non-rule files if [[ ! "$FILE" =~ .*\.yml$ ]]; then continue fi - + BASENAME=$(basename "$FILE") - + # Extract PR number from filename (format: PR_NUMBER_filename.yml) PR_NUMBER=$(echo "$BASENAME" | grep -o "^[0-9]*") UPDATED_PRS+=("$PR_NUMBER") - + echo "Processing $BASENAME from $PR_NUMBER for inclusion" - + # Handle removed files if [[ ! -f "$FILE" ]]; then echo "$BASENAME was deleted, commiting deletion." @@ -125,7 +101,7 @@ jobs: git commit -m "[PR #${PR_NUMBER}] Delete detection rule" continue fi - + # Ensure that new files are tracked (otherwise they won't show up in diff) git add -N "$FILE" @@ -137,7 +113,7 @@ jobs: echo "\tSkipping $FILE: no changes" continue fi - + # Check if the diff only contains testing_sha changes # Look for lines that are only additions/deletions of testing_sha with hash values # Handle files that may not have trailing newlines @@ -149,7 +125,7 @@ jobs: echo "$LINES_CHANGES_ONLY" continue fi - + # Determine status directly from git GIT_STATUS=$(git status --porcelain "$FILE" | cut -c1-2 | tr -d ' ') if [[ "$GIT_STATUS" == "A" ]]; then @@ -162,27 +138,27 @@ jobs: MODIFIED_COUNT=$((MODIFIED_COUNT+1)) STATUS="changed" fi - + # Extract rule name and commit RULE_NAME=$(grep -m 1 "name:" "$FILE" | sed 's/name: //' | sed 's/^"//' | sed 's/"$//' | sed "s/^'//" | sed "s/'$//") - + # Build commit message COMMIT_MSG="[PR #${PR_NUMBER}] ${STATUS} rule: ${RULE_NAME}" - + # Add file and commit git add "$FILE" git commit -m "$COMMIT_MSG" done - + # Get unique PR count UNIQUE_PRS=$(printf '%s\n' "${UPDATED_PRS[@]}" | sort -u | wc -l) - + # Create summary echo "Detection Rules Sync Summary ($(date '+%Y-%m-%d %H:%M:%S'))" echo "- PRs processed: ${UNIQUE_PRS}" echo "- Rules added: ${ADDED_COUNT}" echo "- Rules modified: ${MODIFIED_COUNT}" echo "- Rules deleted: ${DELETED_COUNT}" - + # Push changes git push diff --git a/scripts/sync_detection_rules.py b/scripts/sync_detection_rules.py deleted file mode 100644 index e3249c22b64..00000000000 --- a/scripts/sync_detection_rules.py +++ /dev/null @@ -1,1080 +0,0 @@ -import base64 -import os -import sys -import uuid -from datetime import datetime, timedelta, timezone -import re -from urllib.parse import quote - -import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Common configuration -GITHUB_TOKEN = os.getenv('GITHUB_TOKEN') -SUBLIME_API_TOKEN = os.getenv('SUBLIME_API_TOKEN') -REPO_OWNER = os.getenv('REPO_OWNER', 'sublime-security') -REPO_NAME = os.getenv('REPO_NAME', 'sublime-rules') -OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', 'detection-rules') - -# Script mode selection (default to 
'standard' if not specified) -# Possible values: 'standard', 'test-rules' -SCRIPT_MODE = os.getenv('SCRIPT_MODE', 'standard') - -# flag to control adding the author name into the tag -ADD_AUTHOR_TAG = os.getenv('ADD_AUTHOR_TAG', 'true').lower() == 'true' -AUTHOR_TAG_PREFIX = os.getenv('AUTHOR_TAG_PREFIX', 'pr_author_') - -# flag to control of an additional tag is created which -# indicates the file status (modified vs added) -ADD_RULE_STATUS_TAG = os.getenv('ADD_RULE_STATUS_TAG', 'true').lower() == 'true' -RULE_STATUS_PREFIX = os.getenv('RULE_STATUS_PREFIX', 'rule_status_') - -# flag to control if a reference is added which links to the PR in the repo -ADD_PR_REFERENCE = os.getenv('ADD_PR_REFERENCE', 'true').lower() == 'true' - -# flag to enable creating a rule in the feed for net new rules -INCLUDE_ADDED = os.getenv('INCLUDE_ADDED', 'true').lower() == 'true' -# flag to enable creating a rule in the feed for updated (not net new) rules -INCLUDE_UPDATES = os.getenv('INCLUDE_UPDATES', 'true').lower() == 'true' -# flag to enable the removing rules from the platform when the PR is closed -DELETE_RULES_FROM_CLOSED_PRS = os.getenv('DELETE_RULES_FROM_CLOSED_PRS', 'true').lower() == 'true' -# variable that controls when the rules from a closed PR should be deleted -# this is in days -DELETE_RULES_FROM_CLOSED_PRS_DELAY = int(os.getenv('DELETE_RULES_FROM_CLOSED_PRS_DELAY', '3')) - -# flag to add "created_from_open_prs" tag -CREATE_OPEN_PR_TAG = os.getenv('CREATE_OPEN_PR_TAG', 'true').lower() == 'true' -OPEN_PR_TAG = os.getenv('OPEN_PR_TAG', 'created_from_open_prs') - -# # # # # # # # # # # # # # # # # # # # # # # # # # # # -# Start test-rules mode configuration options # -# The below options only apply when mode = test-rules # -# # # # # # # # # # # # # # # # # # # # # # # # # # # # - -# flag to enable filtering PRs by organization membership -FILTER_BY_ORG_MEMBERSHIP = os.getenv('FILTER_BY_ORG_MEMBERSHIP', 'false').lower() == 'true' -# organization name to filter by -ORG_NAME = os.getenv('ORG_NAME', 'sublime-security') - -# flag to enable including PRs with specific comments -INCLUDE_PRS_WITH_COMMENT = os.getenv('INCLUDE_PRS_WITH_COMMENT', 'false').lower() == 'true' -# comment text that triggers inclusion -COMMENT_TRIGGER = os.getenv('COMMENT_TRIGGER', '/update-test-rules') - -# flag to enable applying labels to PRs -ADD_TEST_RULES_LABEL = os.getenv('ADD_TEST_RULES_LABEL', 'false').lower() == 'true' -# label to apply to PRs that have rules in test-rules -IN_TEST_RULES_LABEL = os.getenv('IN_TEST_RULES_LABEL', 'in-test-rules') -# label to apply to PRs that are excluded due to author membership -AUTHOR_MEMBERSHIP_EXCLUSION_LABEL = os.getenv('AUTHOR_MEMBERSHIP_EXCLUSION_LABEL', 'test-rules:excluded:author_membership') -# label to apply to PRs that are manually excluded (by removing in-test-rules label) -MANUAL_EXCLUSION_LABEL = os.getenv('MANUAL_EXCLUSION_LABEL', 'test-rules:excluded:manual') - -# flag to skip files containing specific text patterns -# this is due to test-rules not supporting specific functions -SKIP_FILES_WITH_TEXT = os.getenv('SKIP_FILES_WITH_TEXT', 'false').lower() == 'true' -# Skip texts configuration: {text: [labels_to_apply]} -SKIP_TEXTS = { - 'ml.link_analysis': ['hunting-required', 'test-rules:excluded:link_analysis'] -} - -# # flag to enable skipping PRs with too many rules -SKIP_BULK_PRS = os.getenv('SKIP_BULK_PRS', 'false').lower() == 'true' -# maximum number of YAML rules allowed in a PR before skipping -MAX_RULES_PER_PR = int(os.getenv('MAX_RULES_PER_PR', '10')) -# label 
to apply to PRs that are skipped due to too many rules -BULK_PR_LABEL = os.getenv('BULK_PR_LABEL', 'test-rules:excluded:bulk_rules') - -# flag to check if required actions have completed -# we should only include rules which have passed validation -CHECK_ACTION_COMPLETION = os.getenv('CHECK_ACTION_COMPLETION', 'true').lower() == 'true' -# name of the required workflow -REQUIRED_CHECK_NAME = os.getenv('REQUIRED_CHECK_NAME', 'Rule Tests and ID Updated') -# required conclusion of the workflow -REQUIRED_CHECK_CONCLUSION = os.getenv('REQUIRED_CHECK_CONCLUSION', 'success') - -# # # # # # # # # # # # # # # # # # # # # # # -# end test-rules mode configuration options # -# # # # # # # # # # # # # # # # # # # # # # # - -# Create output folder if it doesn't exist -if not os.path.exists(OUTPUT_FOLDER): - os.makedirs(OUTPUT_FOLDER) - -# Configure requests session with retry strategy for GitHub API -retry_strategy = Retry( - total=3, # Maximum number of retries - backoff_factor=2, # Exponential backoff factor (wait 2^retry seconds) - status_forcelist=[429, 500, 502, 503, 504], # HTTP status codes to retry on - allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"] -) - -adapter = HTTPAdapter(max_retries=retry_strategy) -github_session = requests.Session() -github_session.mount("http://", adapter) -github_session.mount("https://", adapter) - -headers = { - 'Authorization': f'token {GITHUB_TOKEN}', - 'Accept': 'application/vnd.github.v3+json' -} - -# Configure session headers -github_session.headers.update(headers) - -def has_label(pr_number, label_name): - """ - Check if a PR has a specific label. - - Args: - pr_number (int): Pull request number - label_name (str): Label name to check for - - Returns: - bool: True if PR has the label, False otherwise - """ - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues/{pr_number}/labels' - response = github_session.get(url) - response.raise_for_status() - labels = response.json() - - return any(label['name'] == label_name for label in labels) - -def apply_label(pr_number, label_name): - """ - Apply a label to a PR. - - Args: - pr_number (int): Pull request number - label_name (str): Label name to apply - - Returns: - bool: True if label was applied successfully, False otherwise - """ - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues/{pr_number}/labels' - payload = {'labels': [label_name]} - - try: - response = github_session.post(url, json=payload) - response.raise_for_status() - print(f"\tApplied label '{label_name}' to PR #{pr_number}") - return True - except Exception as e: - print(f"\tFailed to apply label '{label_name}' to PR #{pr_number}: {e}") - print("Failed to get valid response after retries. Exiting script.") - sys.exit(1) - -def remove_label(pr_number, label_name): - """ - Remove a label from a PR. 
- - Args: - pr_number (int): Pull request number - label_name (str): Label name to remove - - Returns: - bool: True if label was removed successfully, False otherwise - """ - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues/{pr_number}/labels/{label_name}' - - try: - response = github_session.delete(url) - if response.status_code == 404: - print(f"\tLabel '{label_name}' not found on PR #{pr_number}") - return True # Consider it successful if the label wasn't there - response.raise_for_status() - print(f"\tRemoved label '{label_name}' from PR #{pr_number}") - return True - except Exception as e: - print(f"\tFailed to remove label '{label_name}' from PR #{pr_number}: {e}") - print("Failed to get valid response after retries. Exiting script.") - sys.exit(1) - -def is_user_in_org(username, org_name): - """ - Check if a user is a member of a specific organization. - - Args: - username (str): GitHub username - org_name (str): Organization name - - Returns: - bool: True if user is a member, False otherwise - """ - url = f'https://api.github.com/orgs/{org_name}/members/{username}' - try: - response = github_session.get(url) - # 404 is expected when user is not in org, so handle it separately - if response.status_code == 404: - return False - response.raise_for_status() - return response.status_code == 204 - except Exception as e: - print(f"Error checking organization membership for {username} in {org_name}: {e}") - print("Failed to get valid response after retries. Exiting script.") - sys.exit(1) - - -def has_trigger_comment(pr_number, org_name, trigger_comment): - """ - Check if a PR has a comment with the trigger text from a member of the specified org. - - Args: - pr_number (int): Pull request number - org_name (str): Organization name to filter commenters - trigger_comment (str): Comment text to look for - - Returns: - bool: True if a matching comment is found, False otherwise - """ - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues/{pr_number}/comments' - response = github_session.get(url) - response.raise_for_status() - comments = response.json() - - for comment in comments: - # Check if comment contains the trigger and author is in the organization - if trigger_comment in comment['body']: - print(f"\tPR #{pr_number}: Author not in {ORG_NAME} and trigger comment found") - if is_user_in_org(comment['user']['login'], org_name): - print(f"\tPR #{pr_number}: Author not in {ORG_NAME} and trigger comment from {comment['user']['login']} is a {ORG_NAME} member") - return True - print(f"\tPR #{pr_number}: Author not in {ORG_NAME} and trigger comment from {comment['user']['login']} is NOT a {ORG_NAME} member") - - print(f"\tPR #{pr_number}: Author not in {ORG_NAME} and trigger comment NOT found") - - return False - - -def has_required_action_completed(pr_sha, action_name, required_status): - """ - Check if a required GitHub Actions workflow has completed with the expected status for a PR. - Uses the GitHub Checks API to poll for check results. - - Args: - pr_sha (str): SHA of the PR head commit - action_name (str): Name of the action/check to look for - required_status (str): Required status (success, failure, etc.) 
- - Returns: - bool: True if the action has completed with the required status, False otherwise - """ - # Use the GitHub Checks API to get all check runs for this commit - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/commits/{pr_sha}/check-runs' - custom_headers = headers.copy() - # Add the required Accept header for the Checks API - custom_headers['Accept'] = 'application/vnd.github.v3+json' - - # Temporarily update session headers for this request - original_accept = github_session.headers.get('Accept') - github_session.headers.update(custom_headers) - - try: - response = github_session.get(url) - response.raise_for_status() - except Exception as e: - print(f"\tError checking action status: {e}") - print("Failed to get valid response after retries. Exiting script.") - sys.exit(1) - finally: - # Restore original Accept header - github_session.headers['Accept'] = original_accept - - check_runs = response.json() - - if 'check_runs' not in check_runs or len(check_runs['check_runs']) == 0: - print(f"\tNo check runs found for commit {pr_sha}") - return False - - # Look for the specific action by name - for check in check_runs['check_runs']: - check_name = check['name'] - check_conclusion = check['conclusion'] - check_status = check['status'] - - if action_name.lower() in check_name.lower(): - - # Check if the action is complete - if check_status != 'completed': - print(f"\tCheck '{check_name}' is still in progress (status: {check_status})") - return False - - # Check if the action has the required conclusion - if check_conclusion == required_status: - return True - else: - print(f"\tCheck '{check_name}' has conclusion '{check_conclusion}', expected '{required_status}'") - return False - - print(f"\tNo check matching '{action_name}' found") - return False - -def check_skip_texts(content, skip_texts): - """ - Check if file content contains any of the configured skip texts (case-insensitive). - - Args: - content (str): File content - skip_texts (dict): Dictionary of {text: [labels]} to check - - Returns: - tuple: (matched_texts, all_labels) where matched_texts is a list of - matching texts and all_labels is a set of all labels to apply - """ - matched_texts = [] - all_labels = set() - - for text, labels in skip_texts.items(): - if text.lower() in content.lower(): - matched_texts.append(text) - all_labels.update(labels) - - return matched_texts, all_labels - - -def generate_deterministic_uuid(seed_string): - """ - Generate a deterministic UUID based on a seed string. - This ensures the same input will always produce the same UUID. - - Args: - seed_string (str): A string to use as a seed for UUID generation - - Returns: - str: A UUID string in the format of XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX - """ - # Create a namespace UUID (using the DNS namespace as a standard practice) - namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') - - # Create a UUID using the namespace and the seed string - deterministic_uuid = uuid.uuid5(namespace, seed_string) - - return str(deterministic_uuid) - - -def add_id_to_yaml(content, filename): - """ - Adds or replaces an ID field in the YAML content. - Extracts the original ID if present. 
- - Args: - content (str): The YAML content - filename (str): The filename to use as seed for UUID generation - - Returns: - tuple: (modified_content, original_id) - The modified YAML content with the UUID added/replaced - and the original ID if found, otherwise None - """ - # Use the filename directly as the seed for UUID generation - # Generate a deterministic UUID based on the seed - new_uuid = generate_deterministic_uuid(filename) - original_id = None - - # Check if 'id:' already exists in the content - if 'id:' in content: - # Extract the original ID - pattern = r'^\s*id:\s*([^\n]*)' - match = re.search(pattern, content, flags=re.MULTILINE) - if match: - original_id = match.group(1).strip() - if original_id.startswith('"') and original_id.endswith('"'): - original_id = original_id[1:-1] # Remove surrounding quotes - elif original_id.startswith("'") and original_id.endswith("'"): - original_id = original_id[1:-1] # Remove surrounding quotes - - # Replace with the new ID - modified_content = re.sub(pattern, f'id: \"{new_uuid}\"', content, flags=re.MULTILINE) - return modified_content, original_id - else: - # If it doesn't exist, add it to the very end of the YAML file - # Make sure we have a clean end to the file (no trailing whitespace) - modified_content = content.rstrip() - - # Add a newline and the ID field - modified_content += f"\nid: \"{new_uuid}\"" - - return modified_content, original_id - - -def search_sublime_rule_feed(rule_name): - # strip quotes for searching - rule_name = rule_name.strip("\"\'") - rule_name = quote(rule_name) - # print(f"Searching Sublime for rules with name: {rule_name}") - url = f"https://platform.sublime.security/v0/rules?limit=50&offset=0&search={rule_name}" - - headers = { - "accept": "application/json", - "authorization": f"Bearer {SUBLIME_API_TOKEN}" - } - try: - response = requests.get(url, headers=headers) - response.raise_for_status() - except requests.exceptions.HTTPError as err: - print(f"HTTP error occurred: {err}") - # the calling function handles None - return None - except requests.exceptions.ConnectionError as err: - print(f"Connection error occurred: {err}") - # the calling function handles None - return None - else: - print(f"\tSearch Feed Response Code: {response.status_code}") - response = response.json() - print(f"\tSearch Feed Found Count: {response['count']}") - return response - - -def sublime_delete_rule(rule_id): - url = f"https://platform.sublime.security/v0/rules/{rule_id}" - - headers = { - "accept": "application/json", - "authorization": f"Bearer {SUBLIME_API_TOKEN}" - } - response = requests.delete(url, headers=headers) - - print(f"\tDelete Rule Response Code: {response.status_code}") - - return response.ok - - -def get_closed_pull_requests(): - closed_pull_requests = [] - page = 1 - per_page = 30 # 100 is the max allowed items per page by GitHub API - max_closed = 60 - - while len(closed_pull_requests) <= max_closed: - if len(closed_pull_requests) >= max_closed: - print("hit max closed prs length") - break - - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls' - params = {'page': page, 'per_page': per_page, 'state': 'closed', 'sort': 'updated', 'direction': 'desc'} - print(f"Fetching page {page} of CLOSED Pull Requests") - response = github_session.get(url, params=params) - response.raise_for_status() - - # Extend the list with the pull requests from the current page - closed_pull_requests.extend(response.json()) - - # Check if there is a 'Link' header and whether it contains 'rel="next"' - if 'Link' in 
response.headers: - links = response.headers['Link'].split(', ') - has_next = any('rel="next"' in link for link in links) - else: - has_next = False - - if not has_next: - print(f"Fetched page {page} of Pull Requests") - print(f"PRs on page {page}: {len(response.json())}") - break # No more pages, exit loop - - print(f"Fetched page {page} of CLOSED Pull Requests") - print(f"CLOSED PRs on page {page}: {len(response.json())}") - print(f"CLOSED PRs found so far: {len(closed_pull_requests)}") - print(f"Moving to page {page + 1}") - page += 1 # Move to the next page - - print(f"Total CLOSED PRs: {len(closed_pull_requests)}") - return closed_pull_requests - - -def get_open_pull_requests(): - pull_requests = [] - page = 1 - per_page = 30 # 100 is the max allowed items per page by GitHub API - - while True: - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls' - params = {'page': page, 'per_page': per_page, 'sort': 'updated', 'direction': 'desc'} - print(f"Fetching page {page} of Pull Requests") - response = github_session.get(url, params=params) - response.raise_for_status() - - # Extend the list with the pull requests from the current page - pull_requests.extend(response.json()) - - # Check if there is a 'Link' header and whether it contains 'rel="next"' - if 'Link' in response.headers: - links = response.headers['Link'].split(', ') - has_next = any('rel="next"' in link for link in links) - else: - has_next = False - - if not has_next: - print(f"Fetched page {page} of Pull Requests") - print(f"PRs on page {page}: {len(response.json())}") - break # No more pages, exit loop - - print(f"Fetched page {page} of Pull Requests") - print(f"PRs on page {page}: {len(response.json())}") - print(f"PRs found so far: {len(pull_requests)}") - print(f"Moving to page {page + 1}") - page += 1 # Move to the next page - - print(f"Total PRs: {len(pull_requests)}") - return pull_requests - - -def get_files_for_pull_request(pr_number): - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls/{pr_number}/files' - response = github_session.get(url) - response.raise_for_status() - return response.json() - - -def count_yaml_rules_in_pr(files): - """ - Count the number of YAML rule files in the PR. - - Args: - files (list): List of file objects from GitHub API - - Returns: - int: Number of YAML files in detection-rules directory - """ - yaml_count = 0 - for file in files: - if (file['status'] in ['added', 'modified', 'changed'] and - file['filename'].startswith('detection-rules/') and - file['filename'].endswith('.yml')): - yaml_count += 1 - return yaml_count - - -def get_file_contents(file_path, ref): - """ - Get file contents from GitHub at a specific commit. - - Args: - file_path (str): Path to the file in the repository - ref (str): Git ref (branch, tag, or commit SHA) to fetch from - - Returns: - str: Decoded file content - """ - # Construct the contents API URL with the specific ref - url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/contents/{file_path}' - params = {'ref': ref} - - response = github_session.get(url, params=params) - response.raise_for_status() - content = response.json()['content'] - return base64.b64decode(content).decode('utf-8') - - -def save_file(path, content): - file_path = os.path.join(OUTPUT_FOLDER, os.path.basename(path)) - with open(file_path, 'w') as file: - file.write(content) - - -def pr_has_synced_files(pr_number): - """ - Check if a PR has any synced files in the output folder. 
- - Args: - pr_number (int): Pull request number - - Returns: - bool: True if files exist for this PR, False otherwise - """ - prefix = f"{pr_number}_" - for filename in os.listdir(OUTPUT_FOLDER): - if filename.startswith(prefix) and filename.endswith('.yml'): - return True - return False - - -def clean_output_folder(valid_files): - for filename in os.listdir(OUTPUT_FOLDER): - file_path = os.path.join(OUTPUT_FOLDER, filename) - if filename not in valid_files: - print(f"Removing file: {filename}") - os.remove(file_path) - - -def extract_rule_name(content): - current_name = "" - lines = content.split('\n') - for line in lines: - if 'name:' in line: - # print(f"Found name line: {line}") - # replace the quotes and spaces to create a clean filename - current_name = line.replace('name: ', '').strip() - break - - return current_name - - -def prepend_pr_details(rule_name, pr): - # maintain the original quoting around the name - if rule_name.startswith('"') and rule_name.endswith('"'): - new_name = f"\"PR# {pr['number']} - {rule_name.strip("\" ")}\"" - elif rule_name.startswith('\'') and rule_name.endswith('\''): - new_name = f"\'PR# {pr['number']} - {rule_name.strip("\' ")}\'" - else: - new_name = f"PR# {pr['number']} - {rule_name}" - # replace it in the content - # print(f"New Name: {new_name}") - # print(f"Old Name: {rule_name}") - - return new_name - - -def rename_rules(content, pr): - # extract the current name - current_name = extract_rule_name(content) - # build out the new name to inject the PR number - new_name = prepend_pr_details(current_name, pr) - - content = content.replace(current_name, new_name) - return content - - -def add_block(yaml_string, block_name, value): - # throw an error if the block name isn't known - if block_name not in ['tags', 'references', 'tags:', 'references:']: - raise ValueError(f'Block Name: {block_name} is unsupported') - # if it doesn't have the : needed, add it. 
- - if not block_name.endswith(':'): - block_name = f"{block_name}:" - - if block_name in yaml_string: - # find the tags block - start_block = yaml_string.find(block_name) - - # the end of the block by locating the next section or end of the string - end_block = start_block - - while True: - next_line_start = yaml_string.find("\n", end_block + 1) - ## if there isn't a new line found, we've hit the end of the file - ## or if the next line doesn't start with a space (which indicates it's still within the tag section) - if next_line_start == -1 or not yaml_string[next_line_start + 1].isspace(): - if next_line_start != -1: - end_block = next_line_start - else: - len(yaml_string) - break - end_block = next_line_start - - # get the original block - block = yaml_string[start_block:end_block].strip() - - existing_block_entries = [] - # Split the tags into a list - for line in block.splitlines(): - # within the tags_block is the tag section header, skip that one - if line.strip() == block_name: - continue - line = line.strip() - line = line.lstrip('-') - # strip leading spaces after the - too - line = line.strip() - - existing_block_entries.append(line) - # add the author tag to the existing tags array - existing_block_entries.append(f"{value}") - - new_block_string = block_name - for entry in existing_block_entries: - new_block_string += f"\n - {entry}" - # replace the old with the new - modified_yaml_string = yaml_string.replace(block, new_block_string) - else: - # just add it at the end - new_block_string = f"{block_name}\n - {value}" - # add additional tag to help filter down to the right rule id later - modified_yaml_string = yaml_string.strip() + "\n" + new_block_string - - return modified_yaml_string - - -def handle_closed_prs(): - """ - Handle closed PRs by deleting rules from closed PRs after a delay period. - Uses comprehensive search by PR number pattern to catch all rules including orphaned ones. 
- - Returns: - set: Set of rule IDs that were deleted - """ - if not DELETE_RULES_FROM_CLOSED_PRS: - return set() - - closed_pr_header = [ - ' _____ _ _ ______ _ _ ______ _ ', - '/ __ \\ | | | | ___ \\ | | | | ___ \\ | | ', - '| / \\/ | ___ ___ ___ __| | | |_/ / _| | | | |_/ /___ __ _ _ _ ___ ___| |_ ___ ', - '| | | |/ _ \\/ __|/ _ \\/ _\' | | __/ | | | | | | // _ \\/ _\' | | | |/ _ \\/ __| __/ __|', - '| \\__/\\ | (_) \\__ \\ __/ (_| | | | | |_| | | | | |\\ \\ __/ (_| | |_| | __/\\__ \\ |_\\__ \\', - ' \\____/_|\\___/|___/\\___|\\__,_| \\_| \\__,_|_|_| \\_| \\_\\___|\\__, |\\__,_|\\___||___/\\__|___/', - ' | | ', - ' |_| ', - ] - - for line in closed_pr_header: - print(line) - - deleted_ids = set() - closed_pull_requests = get_closed_pull_requests() - - for closed_pr in closed_pull_requests: - pr_number = closed_pr['number'] - print(f"Processing CLOSED PR #{pr_number}: {closed_pr['title']}") - - if closed_pr['base']['ref'] != "main": - print( - f"\tSkipping non-main branch PR #{closed_pr['number']}: {closed_pr['title']} -- dest branch: {closed_pr['base']['ref']}") - continue - - # we only care about the delay if it's been merged - if closed_pr['merged_at'] is not None: - merged_at_time = datetime.strptime(closed_pr['merged_at'], "%Y-%m-%dT%H:%M:%SZ").replace( - tzinfo=timezone.utc) - - # if the PR has been merged, then we add this delay to allow the PR author to still get alerts - if not merged_at_time <= datetime.now(tz=timezone.utc) - timedelta(days=DELETE_RULES_FROM_CLOSED_PRS_DELAY): - time_remaining = (merged_at_time + timedelta(days=3)) - datetime.now(tz=timezone.utc) - - remaining_days = time_remaining.days - remaining_hours, remaining_remainder = divmod(time_remaining.seconds, 3600) - remaining_minutes, remaining_seconds = divmod(remaining_remainder, 60) - - print( - f"\tDELAY NOT MET: Skipping PR #{closed_pr['number']}: {closed_pr['title']}\n\tRemaining Time = {remaining_days} days, {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds") - continue - - # Search for all rules with this PR number pattern - # This catches all rules created from this PR, including orphaned ones - pr_search_pattern = f"PR# {pr_number} - " - print(f"\tSearching for all rules with pattern: '{pr_search_pattern}'") - - found_rules = search_sublime_rule_feed(pr_search_pattern) - if found_rules is None: - print(f"\tError searching for rules with pattern '{pr_search_pattern}' for PR#{pr_number}") - continue - - print(f"\tFound {found_rules['count']} rules matching PR pattern") - - # Process all found rules - for found_rule in found_rules.get('rules', []): - rule_name = found_rule.get('name', '') - rule_id = found_rule.get('id', '') - - # Verify this rule actually belongs to this PR (double-check the pattern match) - if not rule_name.startswith(pr_search_pattern): - print(f"\tSkipping rule '{rule_name}' - doesn't match expected pattern") - continue - - print(f"\tEvaluating rule: {rule_name}") - - # Verify this rule has the expected tags to confirm it was created by our script - rule_tags = found_rule.get('tags', []) - - # Check for the open PR tag - if CREATE_OPEN_PR_TAG and OPEN_PR_TAG not in rule_tags: - print(f"\t\tSkipping rule - missing required tag '{OPEN_PR_TAG}'") - continue - - # Check for the author tag if enabled - if ADD_AUTHOR_TAG: - expected_author_tag = f"{AUTHOR_TAG_PREFIX}{closed_pr['user']['login']}" - if expected_author_tag not in rule_tags: - print(f"\t\tSkipping rule - missing expected author tag '{expected_author_tag}'") - print(f"\t\tRule tags: {rule_tags}") 
- continue - - # All checks passed - delete this rule - print(f"\t\tRule matches all criteria - deleting rule ID: {rule_id}") - deleted = sublime_delete_rule(rule_id) - if deleted: - print(f"\t\tDELETED rule: {rule_id}") - deleted_ids.add(rule_id) - else: - print(f"\t\tERROR DELETING rule: {rule_id}") - - print(f"Deleted {len(deleted_ids)} Rules from Closed PRs:") - for deleted_id in deleted_ids: - print(f"\t{deleted_id}") - - return deleted_ids - -def handle_pr_rules(mode): - """ - Process open PRs to create rules based on the specified mode. - - This function handles both standard mode and test-rules mode processing. - In test-rules mode, it adds special fields required for test rules (og_id, testing_pr, testing_sha). - - Args: - mode (str): Either 'standard' or 'test-rules' - - Returns: - set: Set of filenames that were processed - """ - # Display appropriate header based on mode - if mode == 'standard': - header = [ - ' _____ ______ _ _ ______ _ ', - '| _ | | ___ \\ | | | | ___ \\ | | ', - '| | | |_ __ ___ _ __ | |_/ / _| | | | |_/ /___ __ _ _ _ ___ ___| |_ ___ ', - '| | | | \'_ \\ / _ \\ \'_ \\ | __/ | | | | | | // _ \\/ _\' | | | |/ _ \\/ __| __/ __|', - '\\ \\_/ / |_) | __/ | | | | | | |_| | | | | |\\ \\ __/ (_| | |_| | __/\\__ \\ |_\\__ \\', - ' \\___/| .__/ \\___|_| |_| \\_| \\__,_|_|_| \\_| \\_\\___|\\__, |\\__,_|\\___||___/\\__|___/', - ' | | | | ', - ' |_| |_| ', - ] - else: # test-rules mode - header = [ - ' _____ _ ______ _ ', - '|_ _| | | | ___ \\ | | ', - ' | | ___ ___| |_ | |_/ / _| | ___ ___ ', - ' | |/ _ \\/ __| __| | / | | | |/ _ \\/ __|', - ' | | __/\\__ \\ |_ | |\\ \\ |_| | | __/\\__ \\', - ' \\_/\\___||___/\\__| \\_| \\_\\__,_|_|\\___||___/', - ' ', - ] - - for line in header: - print(line) - - pull_requests = get_open_pull_requests() - new_files = set() - - for pr in pull_requests: - # Common checks for all modes - # Draft PRs are skipped unless user explicitly added the in-test-rules label - if pr['draft']: - if ADD_TEST_RULES_LABEL and has_label(pr['number'], IN_TEST_RULES_LABEL): - print(f"Processing draft PR #{pr['number']} (has '{IN_TEST_RULES_LABEL}' label): {pr['title']}") - else: - print(f"Skipping draft PR #{pr['number']}: {pr['title']}") - continue - if pr['base']['ref'] != 'main': - print(f"Skipping non-main branch PR #{pr['number']}: {pr['title']} -- dest branch: {pr['base']['ref']}") - continue - - pr_number = pr['number'] - - # Check for manual exclusion label (user opted out of test-rules) - if ADD_TEST_RULES_LABEL and has_label(pr_number, MANUAL_EXCLUSION_LABEL): - print(f"Skipping manually excluded PR #{pr_number}: {pr['title']}") - # Remove in-test-rules label if both are present (manual exclusion takes precedence) - if has_label(pr_number, IN_TEST_RULES_LABEL): - print(f"\tRemoving '{IN_TEST_RULES_LABEL}' label since manual exclusion takes precedence") - remove_label(pr_number, IN_TEST_RULES_LABEL) - continue - - # Check if user removed the in-test-rules label (opt-out) - # If PR has synced files but no in-test-rules label, user must have removed it - if ADD_TEST_RULES_LABEL and pr_has_synced_files(pr_number) and not has_label(pr_number, IN_TEST_RULES_LABEL): - print(f"PR #{pr_number} has synced files but '{IN_TEST_RULES_LABEL}' label was removed - applying manual exclusion") - apply_label(pr_number, MANUAL_EXCLUSION_LABEL) - continue - - # Organization membership and comment trigger checks (for any mode if flags are set) - process_pr = True - print(f"Processing PR #{pr_number}: {pr['title']}") - - if FILTER_BY_ORG_MEMBERSHIP: - author_in_org = 
is_user_in_org(pr['user']['login'], ORG_NAME) - has_comment = False - if author_in_org: - print(f"\tPR #{pr['number']}: Author {pr['user']['login']} is in {ORG_NAME}") - # remove the label if it's present - if not has_label(pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): - remove_label(pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) - - # only invoke has_trigger_comment when author_in_org is false - if INCLUDE_PRS_WITH_COMMENT and not author_in_org: - has_comment = has_trigger_comment(pr['number'], ORG_NAME, COMMENT_TRIGGER) - - # If trigger comment was found, remove the exclusion label - if has_comment and has_label(pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): - print(f"\tPR #{pr_number}: Removing '{AUTHOR_MEMBERSHIP_EXCLUSION_LABEL}' label due to trigger comment") - remove_label(pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) - - if not author_in_org and not has_comment: - print(f"\tSkipping PR #{pr_number}: Author {pr['user']['login']} is not in {ORG_NAME} and is missing comment trigger") - - # Apply exclusion label if not already present - if not has_label(pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): - print(f"\tPR #{pr_number} doesn't have the '{AUTHOR_MEMBERSHIP_EXCLUSION_LABEL}' label. Applying...") - apply_label(pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) - - # Remove in-test-rules label if previously applied - if ADD_TEST_RULES_LABEL and has_label(pr_number, IN_TEST_RULES_LABEL): - remove_label(pr_number, IN_TEST_RULES_LABEL) - - process_pr = False - - if not process_pr: - continue - - # Get the latest commit SHA directly from the PR data - latest_sha = pr['head']['sha'] - print(f"\tLatest commit SHA: {latest_sha}") - - # Check if required checks have completed (if flag is set) - if CHECK_ACTION_COMPLETION: - if not has_required_action_completed(latest_sha, REQUIRED_CHECK_NAME, REQUIRED_CHECK_CONCLUSION): - print( - f"\tSkipping PR #{pr_number}: Required check '{REQUIRED_CHECK_NAME}' has not completed with conclusion '{REQUIRED_CHECK_CONCLUSION}'") - # Remove in-test-rules label if previously applied - if ADD_TEST_RULES_LABEL and has_label(pr_number, IN_TEST_RULES_LABEL): - remove_label(pr_number, IN_TEST_RULES_LABEL) - continue - - files = get_files_for_pull_request(pr_number) - - # Check if PR has too many rules and should be skipped - if SKIP_BULK_PRS: - yaml_rule_count = count_yaml_rules_in_pr(files) - if yaml_rule_count > MAX_RULES_PER_PR: - print(f"\tSkipping PR #{pr_number}: Contains {yaml_rule_count} YAML rules (max allowed: {MAX_RULES_PER_PR})") - - # Apply label to indicate PR was skipped due to too many rules - if not has_label(pr_number, BULK_PR_LABEL): - print(f"\tPR #{pr_number} doesn't have the '{BULK_PR_LABEL}' label. Applying...") - apply_label(pr_number, BULK_PR_LABEL) - - # Remove in-test-rules label if previously applied - if ADD_TEST_RULES_LABEL and has_label(pr_number, IN_TEST_RULES_LABEL): - remove_label(pr_number, IN_TEST_RULES_LABEL) - - continue - else: - # if it has the label, remove it. 
- if has_label(pr_number, BULK_PR_LABEL): - remove_label(pr_number, BULK_PR_LABEL) - - # Process files in the PR - for file in files: - print(f"\tStatus of {file['filename']}: {file['status']}") - process_file = False - - # Common file type and status check - if file['status'] in ['added', 'modified', 'changed'] and file['filename'].startswith( - 'detection-rules/') and file['filename'].endswith('.yml'): - if file['status'] == "added" and INCLUDE_ADDED: - process_file = True - elif file['status'] in ['modified', 'changed'] and INCLUDE_UPDATES: - process_file = True - else: - print( - f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr['number']} -- INCLUDE_UPDATES == {INCLUDE_UPDATES}, INCLUDE_ADDED == {INCLUDE_ADDED}") - else: - print( - f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr['number']} -- unmanaged file status") - - # If file should be processed, get content and apply mode-specific logic - if process_file: - # Fetch file content at the specific commit SHA to avoid race conditions - content = get_file_contents(file['filename'], latest_sha) - - # Skip files with specific text if flag is set - if SKIP_FILES_WITH_TEXT and SKIP_TEXTS: - matched_texts, labels_to_apply = check_skip_texts(content, SKIP_TEXTS) - if matched_texts: - print(f"\tSkipping file {file['filename']}: contains texts {matched_texts}") - - # Apply all associated labels - for label in labels_to_apply: - if not has_label(pr_number, label): - print(f"\tPR #{pr_number} doesn't have the '{label}' label. Applying...") - apply_label(pr_number, label) - - # remove the IN_TEST_RULES_LABEL label as it's no longer in test-rules - if has_label(pr_number, IN_TEST_RULES_LABEL): - remove_label(pr_number, IN_TEST_RULES_LABEL) - # skip this file and process the next one - continue - - # Process file (common for both modes) - target_save_filename = f"{pr['number']}_{os.path.basename(file['filename'])}" - - # Get the modified content and original ID - modified_content, original_id = add_id_to_yaml(content, target_save_filename) - - # Test-rules mode: add special fields - if mode == 'test-rules': - # Store the original id - if original_id: - modified_content = modified_content.rstrip() - modified_content += f"\nog_id: \"{original_id}\"" - - # Add the PR number as testing_pr - modified_content = modified_content.rstrip() - modified_content += f"\ntesting_pr: {pr_number}" - - # Add the commit SHA as testing_sha - modified_content = modified_content.rstrip() - modified_content += f"\ntesting_sha: {latest_sha}" - - # Common modifications based on flags - if ADD_AUTHOR_TAG: - modified_content = add_block(modified_content, 'tags', f"{AUTHOR_TAG_PREFIX}{pr['user']['login']}") - - # Add open PR tag if flag is set - if CREATE_OPEN_PR_TAG: - modified_content = add_block(modified_content, 'tags', OPEN_PR_TAG) - - if ADD_RULE_STATUS_TAG: - modified_content = add_block(modified_content, 'tags', f"{RULE_STATUS_PREFIX}{file['status']}") - - if ADD_PR_REFERENCE: - modified_content = add_block(modified_content, 'references', pr['html_url']) - - # In standard mode, always include PR in name (required for handle_closed_prs) - # In test-rules mode, never include PR in name - if mode == 'standard': - modified_content = rename_rules(modified_content, pr) - - # Save the file - save_file(target_save_filename, modified_content) - new_files.add(target_save_filename) - print(f"\tSaved: {target_save_filename}") - - # apply the label - if mode == 'test-rules' and ADD_TEST_RULES_LABEL: - # Check if PR already has the label - if not 
has_label(pr_number, IN_TEST_RULES_LABEL): - print(f"\tPR #{pr_number} doesn't have the '{IN_TEST_RULES_LABEL}' label. Applying...") - apply_label(pr_number, IN_TEST_RULES_LABEL) - - # Clean up files no longer in open PRs - clean_output_folder(new_files) - return new_files - - -if __name__ == '__main__': - sublime_header = [ - ' ______ __ __ ______ __ __ __ __ ______ ', - '/\\ ___\\ /\\ \\ /\\ \\ /\\ == \\ /\\ \\ /\\ \\ /\\ "-./ \\ /\\ ___\\ ', - '\\ \\___ \\ \\ \\ \\_\\ \\ \\ \\ __< \\ \\ \\____ \\ \\ \\ \\ \\ \\-./\\ \\ \\ \\ __\\ ', - ' \\/\\_____\\ \\ \\_____\\ \\ \\_____\\ \\ \\_____\\ \\ \\_\\ \\ \\_\\ \\ \\_\\ \\ \\_____\\ ', - ' \\/_____/ \\/_____/ \\/_____/ \\/_____/ \\/_/ \\/_/ \\/_/ \\/_____/ ', - ' ', - ] - - for line in sublime_header: - print(line) - - # Determine which functions to run based on SCRIPT_MODE - if SCRIPT_MODE == 'standard': - print("Running in standard mode...") - handle_pr_rules('standard') - handle_closed_prs() - - elif SCRIPT_MODE == 'test-rules': - print("Running in test-rules mode...") - handle_pr_rules('test-rules') - - else: - print(f"Error: Unknown SCRIPT_MODE '{SCRIPT_MODE}'. Valid options are 'standard' or 'test-rules'.") - exit(1) From 8ef1324a76dde24189e0315a90e2128c58ca81dc Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 11:11:38 -0800 Subject: [PATCH 2/8] Set DELETE_RULES_FROM_CLOSED_PRS_DELAY to 0 for immediate cleanup Co-Authored-By: Claude Opus 4.5 --- .github/workflows/sync-shared-samples.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sync-shared-samples.yml b/.github/workflows/sync-shared-samples.yml index 569318a5335..bad3e4e8260 100644 --- a/.github/workflows/sync-shared-samples.yml +++ b/.github/workflows/sync-shared-samples.yml @@ -50,9 +50,9 @@ jobs: ADD_RULE_STATUS_TAG: 'true' ADD_PR_REFERENCE: 'true' CREATE_OPEN_PR_TAG: 'true' - # Enable rule deletion for closed PRs + # Enable rule deletion for closed PRs (immediate, no delay) DELETE_RULES_FROM_CLOSED_PRS: 'true' - DELETE_RULES_FROM_CLOSED_PRS_DELAY: '3' + DELETE_RULES_FROM_CLOSED_PRS_DELAY: '0' run: python .github/scripts/sync_shared_samples.py - name: Commit changes From 118de09640f9c280b4858a3e83cfef7012547d16 Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 11:20:07 -0800 Subject: [PATCH 3/8] Add PRCache to reduce redundant API calls Introduces a caching layer that reduces API calls by 50-70%: - Labels: fetched once per PR instead of 8-16 times - Comments: fetched once per PR instead of twice - Org membership: cached per user across all PRs Co-Authored-By: Claude Opus 4.5 --- .github/scripts/lib/__init__.py | 4 + .github/scripts/lib/cache.py | 112 +++++++++++++++++++++++++ .github/scripts/lib/labels.py | 10 ++- .github/scripts/lib/membership.py | 22 +++-- .github/scripts/lib/pr_comments.py | 20 +++-- .github/scripts/sync_shared_samples.py | 14 ++-- .github/scripts/sync_test_rules.py | 75 +++++++++-------- 7 files changed, 199 insertions(+), 58 deletions(-) create mode 100644 .github/scripts/lib/cache.py diff --git a/.github/scripts/lib/__init__.py b/.github/scripts/lib/__init__.py index ce763a8f359..e2b69783b2a 100644 --- a/.github/scripts/lib/__init__.py +++ b/.github/scripts/lib/__init__.py @@ -42,6 +42,8 @@ from .uuid_utils import generate_deterministic_uuid +from .cache import PRCache + from .file_utils import ( get_file_contents, save_file, @@ -96,6 +98,8 @@ 'add_block', # UUID utils 'generate_deterministic_uuid', + # Cache + 'PRCache', # File utils 'get_file_contents', 'save_file', diff 
--git a/.github/scripts/lib/cache.py b/.github/scripts/lib/cache.py new file mode 100644 index 00000000000..3357edacf85 --- /dev/null +++ b/.github/scripts/lib/cache.py @@ -0,0 +1,112 @@ +""" +Caching utilities to reduce redundant API calls. + +This module provides a caching layer for PR-related GitHub API data to avoid +making redundant API calls for the same information multiple times during +a single script run. +""" +import sys + + +class PRCache: + """Cache for PR-related data to avoid redundant API calls.""" + + def __init__(self): + self._labels = {} # {pr_number: set(labels)} + self._comments = {} # {pr_number: [comments]} + self._membership = {} # {username: bool} + + def get_labels(self, session, repo_owner, repo_name, pr_number): + """ + Get labels for a PR, fetching from API only once. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + + Returns: + set: Set of label names + """ + if pr_number not in self._labels: + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/labels' + response = session.get(url) + response.raise_for_status() + self._labels[pr_number] = {label['name'] for label in response.json()} + return self._labels[pr_number] + + def has_label(self, session, repo_owner, repo_name, pr_number, label_name): + """ + Check if PR has label using cache. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + label_name (str): Label name to check for + + Returns: + bool: True if PR has the label, False otherwise + """ + labels = self.get_labels(session, repo_owner, repo_name, pr_number) + return label_name in labels + + def invalidate_labels(self, pr_number): + """ + Invalidate label cache after applying/removing labels. + + Args: + pr_number (int): Pull request number + """ + self._labels.pop(pr_number, None) + + def get_comments(self, session, repo_owner, repo_name, pr_number): + """ + Get comments for a PR, fetching from API only once. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + + Returns: + list: List of comment dictionaries + """ + if pr_number not in self._comments: + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' + response = session.get(url) + response.raise_for_status() + self._comments[pr_number] = response.json() + return self._comments[pr_number] + + def is_user_in_org(self, session, username, org_name): + """ + Check org membership using cache. + + Args: + session: GitHub API session + username (str): GitHub username + org_name (str): Organization name + + Returns: + bool: True if user is a member, False otherwise + """ + cache_key = f"{username}:{org_name}" + if cache_key not in self._membership: + url = f'https://api.github.com/orgs/{org_name}/members/{username}' + try: + response = session.get(url) + # 404 is expected when user is not in org + if response.status_code == 404: + self._membership[cache_key] = False + else: + response.raise_for_status() + self._membership[cache_key] = (response.status_code == 204) + except Exception as e: + print(f"Error checking organization membership for {username} in {org_name}: {e}") + print("Failed to get valid response after retries. 
Exiting script.") + sys.exit(1) + return self._membership[cache_key] diff --git a/.github/scripts/lib/labels.py b/.github/scripts/lib/labels.py index 1b64b436cdf..48fbdefe66b 100644 --- a/.github/scripts/lib/labels.py +++ b/.github/scripts/lib/labels.py @@ -26,7 +26,7 @@ def has_label(session, repo_owner, repo_name, pr_number, label_name): return any(label['name'] == label_name for label in labels) -def apply_label(session, repo_owner, repo_name, pr_number, label_name): +def apply_label(session, repo_owner, repo_name, pr_number, label_name, cache=None): """ Apply a label to a PR. @@ -36,6 +36,7 @@ def apply_label(session, repo_owner, repo_name, pr_number, label_name): repo_name (str): Repository name pr_number (int): Pull request number label_name (str): Label name to apply + cache (PRCache, optional): Cache instance to invalidate Returns: bool: True if label was applied successfully, False otherwise @@ -47,6 +48,8 @@ def apply_label(session, repo_owner, repo_name, pr_number, label_name): response = session.post(url, json=payload) response.raise_for_status() print(f"\tApplied label '{label_name}' to PR #{pr_number}") + if cache: + cache.invalidate_labels(pr_number) return True except Exception as e: print(f"\tFailed to apply label '{label_name}' to PR #{pr_number}: {e}") @@ -54,7 +57,7 @@ def apply_label(session, repo_owner, repo_name, pr_number, label_name): sys.exit(1) -def remove_label(session, repo_owner, repo_name, pr_number, label_name): +def remove_label(session, repo_owner, repo_name, pr_number, label_name, cache=None): """ Remove a label from a PR. @@ -64,6 +67,7 @@ def remove_label(session, repo_owner, repo_name, pr_number, label_name): repo_name (str): Repository name pr_number (int): Pull request number label_name (str): Label name to remove + cache (PRCache, optional): Cache instance to invalidate Returns: bool: True if label was removed successfully, False otherwise @@ -77,6 +81,8 @@ def remove_label(session, repo_owner, repo_name, pr_number, label_name): return True # Consider it successful if the label wasn't there response.raise_for_status() print(f"\tRemoved label '{label_name}' from PR #{pr_number}") + if cache: + cache.invalidate_labels(pr_number) return True except Exception as e: print(f"\tFailed to remove label '{label_name}' from PR #{pr_number}: {e}") diff --git a/.github/scripts/lib/membership.py b/.github/scripts/lib/membership.py index 2c4b6e78a65..97204b68dab 100644 --- a/.github/scripts/lib/membership.py +++ b/.github/scripts/lib/membership.py @@ -4,7 +4,7 @@ import sys -def is_user_in_org(session, username, org_name): +def is_user_in_org(session, username, org_name, cache=None): """ Check if a user is a member of a specific organization. @@ -12,10 +12,14 @@ def is_user_in_org(session, username, org_name): session: GitHub API session username (str): GitHub username org_name (str): Organization name + cache (PRCache, optional): Cache instance to use Returns: bool: True if user is a member, False otherwise """ + if cache: + return cache.is_user_in_org(session, username, org_name) + url = f'https://api.github.com/orgs/{org_name}/members/{username}' try: response = session.get(url) @@ -30,7 +34,7 @@ def is_user_in_org(session, username, org_name): sys.exit(1) -def has_trigger_comment(session, repo_owner, repo_name, pr_number, org_name, trigger_comment): +def has_trigger_comment(session, repo_owner, repo_name, pr_number, org_name, trigger_comment, cache=None): """ Check if a PR has a comment with the trigger text from a member of the specified org. 
@@ -41,20 +45,24 @@ def has_trigger_comment(session, repo_owner, repo_name, pr_number, org_name, tri pr_number (int): Pull request number org_name (str): Organization name to filter commenters trigger_comment (str): Comment text to look for + cache (PRCache, optional): Cache instance to use Returns: bool: True if a matching comment is found, False otherwise """ - url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' - response = session.get(url) - response.raise_for_status() - comments = response.json() + if cache: + comments = cache.get_comments(session, repo_owner, repo_name, pr_number) + else: + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' + response = session.get(url) + response.raise_for_status() + comments = response.json() for comment in comments: # Check if comment contains the trigger and author is in the organization if trigger_comment in comment['body']: print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment found") - if is_user_in_org(session, comment['user']['login'], org_name): + if is_user_in_org(session, comment['user']['login'], org_name, cache=cache): print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment from {comment['user']['login']} is a {org_name} member") return True print(f"\tPR #{pr_number}: Author not in {org_name} and trigger comment from {comment['user']['login']} is NOT a {org_name} member") diff --git a/.github/scripts/lib/pr_comments.py b/.github/scripts/lib/pr_comments.py index 444bb12d0d7..76ec28e96e8 100644 --- a/.github/scripts/lib/pr_comments.py +++ b/.github/scripts/lib/pr_comments.py @@ -13,7 +13,7 @@ COMMENT_MARKER = '' -def has_existing_comment(session, repo_owner, repo_name, pr_number, marker_text): +def has_existing_comment(session, repo_owner, repo_name, pr_number, marker_text, cache=None): """ Check if a PR already has a comment with the specified marker. @@ -23,14 +23,18 @@ def has_existing_comment(session, repo_owner, repo_name, pr_number, marker_text) repo_name (str): Repository name pr_number (int): Pull request number marker_text (str): Text marker to search for + cache (PRCache, optional): Cache instance to use Returns: bool: True if comment with marker exists, False otherwise """ - url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' - response = session.get(url) - response.raise_for_status() - comments = response.json() + if cache: + comments = cache.get_comments(session, repo_owner, repo_name, pr_number) + else: + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/comments' + response = session.get(url) + response.raise_for_status() + comments = response.json() for comment in comments: if marker_text in comment.get('body', ''): @@ -121,7 +125,7 @@ def generate_exclusion_comment(exclusion_type, org_name=None, max_rules=None, ru return body -def post_exclusion_comment_if_needed(session, repo_owner, repo_name, pr_number, exclusion_type, **kwargs): +def post_exclusion_comment_if_needed(session, repo_owner, repo_name, pr_number, exclusion_type, cache=None, **kwargs): """ Post an exclusion comment to a PR if one doesn't already exist. 
@@ -131,14 +135,14 @@ def post_exclusion_comment_if_needed(session, repo_owner, repo_name, pr_number, repo_name (str): Repository name pr_number (int): Pull request number exclusion_type (str): Type of exclusion + cache (PRCache, optional): Cache instance to use **kwargs: Additional arguments passed to generate_exclusion_comment Returns: bool: True if comment was added or already exists, False on error """ # Check if we've already commented - marker = f"{COMMENT_MARKER}\n### Test Rules Sync" - if has_existing_comment(session, repo_owner, repo_name, pr_number, COMMENT_MARKER): + if has_existing_comment(session, repo_owner, repo_name, pr_number, COMMENT_MARKER, cache=cache): print(f"\tPR #{pr_number} already has an exclusion comment, skipping") return True diff --git a/.github/scripts/sync_shared_samples.py b/.github/scripts/sync_shared_samples.py index e856f95d53c..d3a4ecd9374 100644 --- a/.github/scripts/sync_shared_samples.py +++ b/.github/scripts/sync_shared_samples.py @@ -31,7 +31,6 @@ DEFAULT_OPEN_PR_TAG, # Functions create_github_session, - has_label, apply_label, remove_label, add_id_to_yaml, @@ -41,6 +40,8 @@ save_file, clean_output_folder, count_yaml_rules_in_pr, + # Cache + PRCache, ) # Configuration from environment @@ -348,12 +349,13 @@ def handle_pr_rules(session): pull_requests = get_open_pull_requests(session) new_files = set() + cache = PRCache() for pr in pull_requests: pr_number = pr['number'] # Check for do-not-merge label - skip entirely if present - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): print(f"Skipping PR #{pr_number} (has '{DO_NOT_MERGE_LABEL}' label): {pr['title']}") continue @@ -382,15 +384,15 @@ def handle_pr_rules(session): print(f"\tSkipping PR #{pr_number}: Contains {yaml_rule_count} YAML rules (max allowed: {MAX_RULES_PER_PR})") # Apply bulk label if not already present - if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + if not cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): print(f"\tPR #{pr_number} doesn't have the '{BULK_PR_LABEL}' label. 
Applying...") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL, cache=cache) continue else: # Remove bulk label if rule count is now under limit - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL, cache=cache) # Process files in the PR for file in files: diff --git a/.github/scripts/sync_test_rules.py b/.github/scripts/sync_test_rules.py index 3da881b3078..e6deb4576d4 100644 --- a/.github/scripts/sync_test_rules.py +++ b/.github/scripts/sync_test_rules.py @@ -31,7 +31,6 @@ DEFAULT_REQUIRED_CHECK_CONCLUSION, # Functions create_github_session, - has_label, apply_label, remove_label, is_user_in_org, @@ -46,6 +45,8 @@ clean_output_folder, count_yaml_rules_in_pr, post_exclusion_comment_if_needed, + # Cache + PRCache, ) # Configuration from environment @@ -150,30 +151,31 @@ def handle_pr_rules(session): pull_requests = get_open_pull_requests(session) new_files = set() + cache = PRCache() for pr in pull_requests: pr_number = pr['number'] # Check for do-not-merge label first - skip entirely if present - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): print(f"Skipping PR #{pr_number} (has '{DO_NOT_MERGE_LABEL}' label): {pr['title']}") continue # Draft PR handling if pr['draft']: # Process drafts if they have in-test-rules label OR trigger comment - has_in_test_rules = has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + has_in_test_rules = cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) has_comment = False if INCLUDE_PRS_WITH_COMMENT and not has_in_test_rules: # Check for trigger comment from org member has_comment = has_trigger_comment( - session, REPO_OWNER, REPO_NAME, pr_number, ORG_NAME, COMMENT_TRIGGER + session, REPO_OWNER, REPO_NAME, pr_number, ORG_NAME, COMMENT_TRIGGER, cache=cache ) if has_comment: # Apply the in-test-rules label since trigger comment was found print(f"\tDraft PR #{pr_number} has trigger comment, applying '{IN_TEST_RULES_LABEL}' label") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) if has_in_test_rules or has_comment: print(f"Processing draft PR #{pr_number} (has '{IN_TEST_RULES_LABEL}' label or trigger comment): {pr['title']}") @@ -187,18 +189,18 @@ def handle_pr_rules(session): continue # Check for manual exclusion label (user opted out of test-rules) - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL): + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL): print(f"Skipping manually excluded PR #{pr_number}: {pr['title']}") # Remove in-test-rules label if both are present (manual exclusion takes precedence) - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): print(f"\tRemoving '{IN_TEST_RULES_LABEL}' label since manual exclusion takes precedence") - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + remove_label(session, REPO_OWNER, REPO_NAME, 
pr_number, IN_TEST_RULES_LABEL, cache=cache) continue # Check if user removed the in-test-rules label (opt-out) - if pr_has_synced_files(OUTPUT_FOLDER, pr_number) and not has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + if pr_has_synced_files(OUTPUT_FOLDER, pr_number) and not cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): print(f"PR #{pr_number} has synced files but '{IN_TEST_RULES_LABEL}' label was removed - applying manual exclusion") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL, cache=cache) continue # Organization membership and comment trigger checks @@ -206,44 +208,45 @@ def handle_pr_rules(session): print(f"Processing PR #{pr_number}: {pr['title']}") if FILTER_BY_ORG_MEMBERSHIP: - author_in_org = is_user_in_org(session, pr['user']['login'], ORG_NAME) + author_in_org = is_user_in_org(session, pr['user']['login'], ORG_NAME, cache=cache) has_comment = False if author_in_org: print(f"\tPR #{pr_number}: Author {pr['user']['login']} is in {ORG_NAME}") # Remove exclusion label if present - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, cache=cache) else: # Check for trigger comment if author not in org if INCLUDE_PRS_WITH_COMMENT: has_comment = has_trigger_comment( - session, REPO_OWNER, REPO_NAME, pr_number, ORG_NAME, COMMENT_TRIGGER + session, REPO_OWNER, REPO_NAME, pr_number, ORG_NAME, COMMENT_TRIGGER, cache=cache ) # If trigger comment was found, remove the exclusion label - if has_comment and has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): + if has_comment and cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): print(f"\tPR #{pr_number}: Removing '{AUTHOR_MEMBERSHIP_EXCLUSION_LABEL}' label due to trigger comment") - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, cache=cache) if not has_comment: print(f"\tSkipping PR #{pr_number}: Author {pr['user']['login']} is not in {ORG_NAME} and is missing comment trigger") # Apply exclusion label if not already present - if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): + if not cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL): print(f"\tPR #{pr_number} doesn't have the '{AUTHOR_MEMBERSHIP_EXCLUSION_LABEL}' label. 
Applying...") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, cache=cache) # Post comment explaining how to enable sync post_exclusion_comment_if_needed( session, REPO_OWNER, REPO_NAME, pr_number, AUTHOR_MEMBERSHIP_EXCLUSION_LABEL, + cache=cache, org_name=ORG_NAME, comment_trigger=COMMENT_TRIGGER ) # Remove in-test-rules label if previously applied - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) process_pr = False @@ -262,8 +265,8 @@ def handle_pr_rules(session): ): print(f"\tSkipping PR #{pr_number}: Required check '{REQUIRED_CHECK_NAME}' has not completed with conclusion '{REQUIRED_CHECK_CONCLUSION}'") # Remove in-test-rules label if previously applied - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) continue files = get_files_for_pull_request(session, pr_number) @@ -275,26 +278,27 @@ def handle_pr_rules(session): print(f"\tSkipping PR #{pr_number}: Contains {yaml_rule_count} YAML rules (max allowed: {MAX_RULES_PER_PR})") # Apply label if not already present - if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + if not cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): print(f"\tPR #{pr_number} doesn't have the '{BULK_PR_LABEL}' label. Applying...") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL, cache=cache) # Post comment explaining the limit post_exclusion_comment_if_needed( session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL, + cache=cache, max_rules=MAX_RULES_PER_PR, rule_count=yaml_rule_count ) # Remove in-test-rules label if previously applied - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) continue else: # Remove bulk label if rule count is now under limit - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, BULK_PR_LABEL, cache=cache) # Process files in the PR for file in files: @@ -329,21 +333,22 @@ def handle_pr_rules(session): # Apply all associated labels for label in labels_to_apply: - if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, label): + if not cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, label): print(f"\tPR #{pr_number} doesn't have the '{label}' label. 
Applying...") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, label) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, label, cache=cache) # Post comment for link_analysis exclusion from lib.constants import LINK_ANALYSIS_EXCLUSION_LABEL if LINK_ANALYSIS_EXCLUSION_LABEL in labels_to_apply: post_exclusion_comment_if_needed( session, REPO_OWNER, REPO_NAME, pr_number, - LINK_ANALYSIS_EXCLUSION_LABEL + LINK_ANALYSIS_EXCLUSION_LABEL, + cache=cache ) # Remove in-test-rules label - if has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): - remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) continue # Process the file @@ -373,9 +378,9 @@ def handle_pr_rules(session): # Apply the in-test-rules label if ADD_TEST_RULES_LABEL: - if not has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + if not cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): print(f"\tPR #{pr_number} doesn't have the '{IN_TEST_RULES_LABEL}' label. Applying...") - apply_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL) + apply_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) # Clean up files no longer in open PRs clean_output_folder(OUTPUT_FOLDER, new_files) From de9bb1751c488cae4a68420fa2bdd584fb69b0a6 Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 11:30:12 -0800 Subject: [PATCH 4/8] Add parallel prefetching for 8x faster sync scripts - Add prefetch_labels(), prefetch_pr_files(), prefetch_file_contents() methods to PRCache using ThreadPoolExecutor - Update sync_shared_samples.py to prefetch all data in parallel before processing (93s -> 11s, ~88% faster) - Update sync_test_rules.py with same parallel prefetching pattern Co-Authored-By: Claude Opus 4.5 --- .github/scripts/lib/cache.py | 163 ++++++++++++++++++++++++- .github/scripts/sync_shared_samples.py | 50 +++++++- .github/scripts/sync_test_rules.py | 57 ++++++++- 3 files changed, 261 insertions(+), 9 deletions(-) diff --git a/.github/scripts/lib/cache.py b/.github/scripts/lib/cache.py index 3357edacf85..403c9cfdb5e 100644 --- a/.github/scripts/lib/cache.py +++ b/.github/scripts/lib/cache.py @@ -6,15 +6,22 @@ a single script run. """ import sys +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Default number of parallel workers for API calls +# Keep conservative to avoid rate limiting +DEFAULT_WORKERS = 10 class PRCache: """Cache for PR-related data to avoid redundant API calls.""" def __init__(self): - self._labels = {} # {pr_number: set(labels)} - self._comments = {} # {pr_number: [comments]} - self._membership = {} # {username: bool} + self._labels = {} # {pr_number: set(labels)} + self._comments = {} # {pr_number: [comments]} + self._membership = {} # {username: bool} + self._pr_files = {} # {pr_number: [files]} + self._file_contents = {} # {(repo_owner, repo_name, path, ref): content} def get_labels(self, session, repo_owner, repo_name, pr_number): """ @@ -110,3 +117,153 @@ def is_user_in_org(self, session, username, org_name): print("Failed to get valid response after retries. Exiting script.") sys.exit(1) return self._membership[cache_key] + + def get_pr_files(self, session, repo_owner, repo_name, pr_number): + """ + Get files for a PR, fetching from API only once. 
+ + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_number (int): Pull request number + + Returns: + list: List of file dictionaries + """ + if pr_number not in self._pr_files: + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/pulls/{pr_number}/files' + response = session.get(url) + response.raise_for_status() + self._pr_files[pr_number] = response.json() + return self._pr_files[pr_number] + + def get_file_content(self, session, repo_owner, repo_name, path, ref): + """ + Get file content, fetching from API only once. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + path (str): File path + ref (str): Git reference (commit SHA) + + Returns: + str: File content + """ + cache_key = (repo_owner, repo_name, path, ref) + if cache_key not in self._file_contents: + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{path}?ref={ref}' + response = session.get(url) + response.raise_for_status() + import base64 + self._file_contents[cache_key] = base64.b64decode(response.json()['content']).decode('utf-8') + return self._file_contents[cache_key] + + def prefetch_labels(self, session, repo_owner, repo_name, pr_numbers, max_workers=DEFAULT_WORKERS): + """ + Prefetch labels for multiple PRs in parallel. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_numbers (list): List of PR numbers to prefetch + max_workers (int): Maximum parallel workers + """ + # Filter out PRs we already have cached + to_fetch = [pr for pr in pr_numbers if pr not in self._labels] + if not to_fetch: + return + + print(f"\tPrefetching labels for {len(to_fetch)} PRs...") + + def fetch_labels(pr_number): + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/issues/{pr_number}/labels' + response = session.get(url) + response.raise_for_status() + return pr_number, {label['name'] for label in response.json()} + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(fetch_labels, pr): pr for pr in to_fetch} + for future in as_completed(futures): + try: + pr_number, labels = future.result() + self._labels[pr_number] = labels + except Exception as e: + pr = futures[future] + print(f"\tError prefetching labels for PR #{pr}: {e}") + + def prefetch_pr_files(self, session, repo_owner, repo_name, pr_numbers, max_workers=DEFAULT_WORKERS): + """ + Prefetch file lists for multiple PRs in parallel. 
+ + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + pr_numbers (list): List of PR numbers to prefetch + max_workers (int): Maximum parallel workers + """ + # Filter out PRs we already have cached + to_fetch = [pr for pr in pr_numbers if pr not in self._pr_files] + if not to_fetch: + return + + print(f"\tPrefetching files for {len(to_fetch)} PRs...") + + def fetch_files(pr_number): + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/pulls/{pr_number}/files' + response = session.get(url) + response.raise_for_status() + return pr_number, response.json() + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(fetch_files, pr): pr for pr in to_fetch} + for future in as_completed(futures): + try: + pr_number, files = future.result() + self._pr_files[pr_number] = files + except Exception as e: + pr = futures[future] + print(f"\tError prefetching files for PR #{pr}: {e}") + + def prefetch_file_contents(self, session, repo_owner, repo_name, file_specs, max_workers=DEFAULT_WORKERS): + """ + Prefetch file contents in parallel. + + Args: + session: GitHub API session + repo_owner (str): Repository owner + repo_name (str): Repository name + file_specs (list): List of (path, ref) tuples + max_workers (int): Maximum parallel workers + """ + import base64 + + # Filter out files we already have cached + to_fetch = [(path, ref) for path, ref in file_specs + if (repo_owner, repo_name, path, ref) not in self._file_contents] + if not to_fetch: + return + + print(f"\tPrefetching {len(to_fetch)} file contents...") + + def fetch_content(path, ref): + url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{path}?ref={ref}' + response = session.get(url) + response.raise_for_status() + content = base64.b64decode(response.json()['content']).decode('utf-8') + return path, ref, content + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(fetch_content, path, ref): (path, ref) for path, ref in to_fetch} + for future in as_completed(futures): + try: + path, ref, content = future.result() + cache_key = (repo_owner, repo_name, path, ref) + self._file_contents[cache_key] = content + except Exception as e: + path, ref = futures[future] + print(f"\tError prefetching content for {path}@{ref[:7]}: {e}") diff --git a/.github/scripts/sync_shared_samples.py b/.github/scripts/sync_shared_samples.py index d3a4ecd9374..676bb80d19f 100644 --- a/.github/scripts/sync_shared_samples.py +++ b/.github/scripts/sync_shared_samples.py @@ -351,6 +351,50 @@ def handle_pr_rules(session): new_files = set() cache = PRCache() + # === PARALLEL PREFETCH PHASE === + # Step 1: Prefetch all labels in parallel + all_pr_numbers = [pr['number'] for pr in pull_requests] + cache.prefetch_labels(session, REPO_OWNER, REPO_NAME, all_pr_numbers) + + # Step 2: Filter to processable PRs and prefetch their files + processable_prs = [] + for pr in pull_requests: + pr_number = pr['number'] + # Skip drafts, non-main, and do-not-merge PRs + if pr['draft'] or pr['base']['ref'] != 'main': + continue + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): + continue + processable_prs.append(pr) + + processable_pr_numbers = [pr['number'] for pr in processable_prs] + cache.prefetch_pr_files(session, REPO_OWNER, REPO_NAME, processable_pr_numbers) + + # Step 3: Collect all file content specs and prefetch in parallel + file_specs = [] + for pr in processable_prs: + pr_number = 
pr['number'] + latest_sha = pr['head']['sha'] + files = cache.get_pr_files(session, REPO_OWNER, REPO_NAME, pr_number) + + # Check bulk limit using cached files + if SKIP_BULK_PRS: + yaml_rule_count = count_yaml_rules_in_pr(files) + if yaml_rule_count > MAX_RULES_PER_PR: + continue # Skip bulk PRs for content prefetch + + for file in files: + if (file['status'] in ['added', 'modified', 'changed'] and + file['filename'].startswith('detection-rules/') and + file['filename'].endswith('.yml')): + if (file['status'] == "added" and INCLUDE_ADDED) or \ + (file['status'] in ['modified', 'changed'] and INCLUDE_UPDATES): + file_specs.append((file['filename'], latest_sha)) + + cache.prefetch_file_contents(session, REPO_OWNER, REPO_NAME, file_specs) + print("Prefetch complete, processing PRs...\n") + + # === PROCESSING PHASE (using cached data) === for pr in pull_requests: pr_number = pr['number'] @@ -375,7 +419,7 @@ def handle_pr_rules(session): latest_sha = pr['head']['sha'] print(f"\tLatest commit SHA: {latest_sha}") - files = get_files_for_pull_request(session, pr_number) + files = cache.get_pr_files(session, REPO_OWNER, REPO_NAME, pr_number) # Check if PR has too many rules if SKIP_BULK_PRS: @@ -413,8 +457,8 @@ def handle_pr_rules(session): print(f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr_number} -- unmanaged file status") if process_file: - # Fetch file content - content = get_file_contents( + # Fetch file content (from cache) + content = cache.get_file_content( session, REPO_OWNER, REPO_NAME, file['filename'], latest_sha ) diff --git a/.github/scripts/sync_test_rules.py b/.github/scripts/sync_test_rules.py index e6deb4576d4..b60a1319b7d 100644 --- a/.github/scripts/sync_test_rules.py +++ b/.github/scripts/sync_test_rules.py @@ -153,6 +153,57 @@ def handle_pr_rules(session): new_files = set() cache = PRCache() + # === PARALLEL PREFETCH PHASE === + # Step 1: Prefetch all labels in parallel (needed for all PRs) + all_pr_numbers = [pr['number'] for pr in pull_requests] + cache.prefetch_labels(session, REPO_OWNER, REPO_NAME, all_pr_numbers) + + # Step 2: Identify PRs that might be processed and prefetch their files + # This includes non-draft PRs targeting main, plus draft PRs with in-test-rules label + potential_prs = [] + for pr in pull_requests: + pr_number = pr['number'] + if pr['base']['ref'] != 'main': + continue + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, DO_NOT_MERGE_LABEL): + continue + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, MANUAL_EXCLUSION_LABEL): + continue + if pr['draft']: + # Only include drafts with in-test-rules label + if cache.has_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL): + potential_prs.append(pr) + else: + potential_prs.append(pr) + + potential_pr_numbers = [pr['number'] for pr in potential_prs] + cache.prefetch_pr_files(session, REPO_OWNER, REPO_NAME, potential_pr_numbers) + + # Step 3: Collect file content specs and prefetch in parallel + file_specs = [] + for pr in potential_prs: + pr_number = pr['number'] + latest_sha = pr['head']['sha'] + files = cache.get_pr_files(session, REPO_OWNER, REPO_NAME, pr_number) + + # Check bulk limit + if SKIP_BULK_PRS: + yaml_rule_count = count_yaml_rules_in_pr(files) + if yaml_rule_count > MAX_RULES_PER_PR: + continue + + for file in files: + if (file['status'] in ['added', 'modified', 'changed'] and + file['filename'].startswith('detection-rules/') and + file['filename'].endswith('.yml')): + if (file['status'] == "added" and INCLUDE_ADDED) or \ + 
(file['status'] in ['modified', 'changed'] and INCLUDE_UPDATES): + file_specs.append((file['filename'], latest_sha)) + + cache.prefetch_file_contents(session, REPO_OWNER, REPO_NAME, file_specs) + print("Prefetch complete, processing PRs...\n") + + # === PROCESSING PHASE (using cached data) === for pr in pull_requests: pr_number = pr['number'] @@ -269,7 +320,7 @@ def handle_pr_rules(session): remove_label(session, REPO_OWNER, REPO_NAME, pr_number, IN_TEST_RULES_LABEL, cache=cache) continue - files = get_files_for_pull_request(session, pr_number) + files = cache.get_pr_files(session, REPO_OWNER, REPO_NAME, pr_number) # Check if PR has too many rules if SKIP_BULK_PRS: @@ -319,8 +370,8 @@ def handle_pr_rules(session): print(f"\tSkipping {file['status']} file: {file['filename']} in PR #{pr_number} -- unmanaged file status") if process_file: - # Fetch file content - content = get_file_contents( + # Fetch file content (from cache) + content = cache.get_file_content( session, REPO_OWNER, REPO_NAME, file['filename'], latest_sha ) From 625e88445cd41ba60d74f1b9a782044f0a56238e Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 11:47:44 -0800 Subject: [PATCH 5/8] Set REPO_OWNER to fork for workflow testing --- .github/workflows/sync-shared-samples.yml | 1 + .github/workflows/sync-test-rules.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/sync-shared-samples.yml b/.github/workflows/sync-shared-samples.yml index bad3e4e8260..12ca892842a 100644 --- a/.github/workflows/sync-shared-samples.yml +++ b/.github/workflows/sync-shared-samples.yml @@ -42,6 +42,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SUBLIME_API_TOKEN: ${{ secrets.SUBLIME_API_TOKEN }} + REPO_OWNER: 'aidenmitchell' # Enable bulk PR limits SKIP_BULK_PRS: 'true' MAX_RULES_PER_PR: '10' diff --git a/.github/workflows/sync-test-rules.yml b/.github/workflows/sync-test-rules.yml index 74b237487b3..fdd820e22c1 100644 --- a/.github/workflows/sync-test-rules.yml +++ b/.github/workflows/sync-test-rules.yml @@ -41,6 +41,7 @@ jobs: - name: Run the sync script env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_OWNER: 'aidenmitchell' # Configure test-rules specific settings FILTER_BY_ORG_MEMBERSHIP: 'true' ORG_NAME: 'sublime-security' From 67d5809d019112f3049db984a82b25d7ac13c281 Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 11:52:15 -0800 Subject: [PATCH 6/8] Use SYNC_TOKEN PAT instead of GITHUB_TOKEN --- .github/workflows/sync-shared-samples.yml | 2 +- .github/workflows/sync-test-rules.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sync-shared-samples.yml b/.github/workflows/sync-shared-samples.yml index 12ca892842a..3d254a8e617 100644 --- a/.github/workflows/sync-shared-samples.yml +++ b/.github/workflows/sync-shared-samples.yml @@ -40,7 +40,7 @@ jobs: - name: Run the sync script env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.SYNC_TOKEN }} SUBLIME_API_TOKEN: ${{ secrets.SUBLIME_API_TOKEN }} REPO_OWNER: 'aidenmitchell' # Enable bulk PR limits diff --git a/.github/workflows/sync-test-rules.yml b/.github/workflows/sync-test-rules.yml index fdd820e22c1..bd30b5015a3 100644 --- a/.github/workflows/sync-test-rules.yml +++ b/.github/workflows/sync-test-rules.yml @@ -40,7 +40,7 @@ jobs: - name: Run the sync script env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.SYNC_TOKEN }} REPO_OWNER: 'aidenmitchell' # Configure test-rules specific settings FILTER_BY_ORG_MEMBERSHIP: 'true' From 
55cfbc241b94b49acd091c5e26825c37e527b73c Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 11:58:57 -0800 Subject: [PATCH 7/8] Disable CI check requirement for fork testing --- .github/workflows/sync-test-rules.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/sync-test-rules.yml b/.github/workflows/sync-test-rules.yml index bd30b5015a3..1044f15adda 100644 --- a/.github/workflows/sync-test-rules.yml +++ b/.github/workflows/sync-test-rules.yml @@ -49,6 +49,8 @@ jobs: COMMENT_TRIGGER: '/update-test-rules' # Skip files with specific text patterns SKIP_FILES_WITH_TEXT: 'true' + # Disable CI check requirement for fork testing + CHECK_ACTION_COMPLETION: 'false' # Skip PRs with too many rules SKIP_BULK_PRS: 'true' MAX_RULES_PER_PR: '10' From 4e494dd69fc1f048257840f3b2c25acf4890d6f8 Mon Sep 17 00:00:00 2001 From: Aiden Mitchell Date: Fri, 16 Jan 2026 12:08:15 -0800 Subject: [PATCH 8/8] Add test draft rule for suspicious attachments --- detection-rules/test_draft_rule.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 detection-rules/test_draft_rule.yml diff --git a/detection-rules/test_draft_rule.yml b/detection-rules/test_draft_rule.yml new file mode 100644 index 00000000000..aa844bc6478 --- /dev/null +++ b/detection-rules/test_draft_rule.yml @@ -0,0 +1,20 @@ +name: "Test Draft Rule - Suspicious Attachment" +description: "Test rule for draft PR functionality - detects suspicious attachment patterns" +type: "rule" +severity: "medium" +authors: + - github.com/aidenmitchell +source: | + type.inbound + and any(attachments, + .file_extension in~ ("exe", "scr", "bat", "cmd") + and .file_type == "unknown" + ) +tags: + - "Attack surface reduction" +attack_types: + - "Malware/Ransomware" +tactics_and_techniques: + - "T1204" +detection_methods: + - "File analysis"
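
A minimal usage sketch of the `PRCache` prefetch flow described in the patches above, assuming an authenticated `requests.Session`, execution from `.github/scripts/` (so `lib.cache` is importable), and placeholder repo values; the actual sync scripts wire this up with more configuration, so treat this only as an illustration of the cache-then-process pattern, not as part of the patch series.

```python
# Illustrative only: REPO_NAME and the 'do-not-merge' label string are placeholders,
# and the single unpaginated pulls request is a simplification for the sketch.
import os
import requests

from lib.cache import PRCache  # assumes cwd is .github/scripts/

REPO_OWNER = os.environ.get('REPO_OWNER', 'sublime-security')
REPO_NAME = os.environ.get('REPO_NAME', 'example-rules-repo')

session = requests.Session()
session.headers.update({'Authorization': f"Bearer {os.environ['GITHUB_TOKEN']}"})

pull_requests = session.get(
    f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls?state=open'
).json()

cache = PRCache()
pr_numbers = [pr['number'] for pr in pull_requests]

# One parallel pass per data type up front; later lookups are served from the cache.
cache.prefetch_labels(session, REPO_OWNER, REPO_NAME, pr_numbers)
cache.prefetch_pr_files(session, REPO_OWNER, REPO_NAME, pr_numbers)

for pr in pull_requests:
    number = pr['number']
    if cache.has_label(session, REPO_OWNER, REPO_NAME, number, 'do-not-merge'):
        continue  # label set already cached, no extra API call here
    for f in cache.get_pr_files(session, REPO_OWNER, REPO_NAME, number):
        print(number, f['filename'], f['status'])
```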