-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
145 lines (120 loc) · 4.73 KB
/
config.py
File metadata and controls
145 lines (120 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
```python
import os
import sys
import re
import logging
from dotenv import load_dotenv
import json
from typing import Dict, Set, List, Optional
# -----------------------------------------------------------------------------
# sandvox.py
#
# This script searches for potentially sensitive information (like OpenAI API keys)
# within public GitHub repositories using the GitHub API. It loads environment
# variables for configuration and logs its activities.
# -----------------------------------------------------------------------------
# --- Constants and Configuration ---
# Load environment variables from .env file (if it exists). This should be
# done as early as possible to make env vars available.
load_dotenv()
# GitHub API Token - Retrieve from environment variables. This is critical
# for authenticating requests.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
print("❌ GitHub token not found in environment variables. Please set GITHUB_TOKEN.")
sys.exit(1)
# GitHub API Headers - Used for authentication and specifying the
# expected response format. Using a dictionary for better readability.
HEADERS = {
"Authorization": f"token {GITHUB_TOKEN}",
"Accept": "application/vnd.github.v3+json"
}
# Search Keywords - Terms to search for within repository content. Using a
# set for efficient membership checking.
SEARCH_KEYWORDS: Set[str] = {
"openai api key",
"chatgpt key",
"openai secret",
"sk-" # Common prefix for OpenAI API keys.
}
# Regular Expression Patterns - More sophisticated pattern matching to identify
# specific types of sensitive data. Using a dictionary to organize the patterns
# with descriptive keys.
REGEX_PATTERNS: Dict[str, re.Pattern] = {
"OpenAI API Key": re.compile(r"sk-[A-Za-z0-9]{32,}")
}
# GitHub Search Settings - Configure the GitHub API search behavior.
RESULTS_PER_PAGE = 100 # Max allowed by GitHub API is 100
MAX_PAGES = 10 # Reduced to avoid excessive API calls and potential rate limiting. Increase cautiously.
# Storage Paths - File paths for storing results and logs.
RESULTS_FILE = "data/results.json"
LOG_FILE = "logs/activity.log"
# --- Logging Setup ---
# Configure logging to both a file and the console.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler(LOG_FILE),
logging.StreamHandler(sys.stdout) # Log to console as well
]
)
# --- Helper Functions ---
def create_directory_if_not_exists(path: str):
"""
Creates a directory if it does not already exist. Handles potential
race conditions.
"""
directory = os.path.dirname(path)
if directory and not os.path.exists(directory):
try:
os.makedirs(directory)
except OSError as e:
logging.error(f"Failed to create directory {directory}: {e}")
# Consider re-raising the exception or exiting if directory creation fails
# as the script may not function correctly without it.
def save_results(results: List[Dict], filename: str):
"""
Saves search results to a JSON file.
"""
create_directory_if_not_exists(filename) # Ensure the directory exists
try:
with open(filename, "w") as f:
json.dump(results, f, indent=4)
logging.info(f"Results saved to {filename}")
except IOError as e:
logging.error(f"Error writing to {filename}: {e}")
def build_search_query(keywords: Set[str]) -> str:
"""
Constructs the GitHub search query string from a set of keywords.
"""
query_parts = [f'"{keyword}"' for keyword in keywords]
return " OR ".join(query_parts)
# --- Main Script Logic (Placeholder - to be implemented) ---
# This section would contain the main logic for:
# 1. Building the GitHub search query
# 2. Making API requests to GitHub.
# 3. Parsing the API responses.
# 4. Identifying sensitive information within the search results.
# 5. Saving the results and logging activity.
def main():
"""
Main function to orchestrate the search and reporting process.
"""
logging.info("Starting sandvox script...")
# Example: Construct a search query
search_query = build_search_query(SEARCH_KEYWORDS)
logging.info(f"Search query: {search_query}")
# Placeholder: Implement GitHub API interaction here.
# Example: Replace this with actual API calls and result processing.
# results = perform_github_search(search_query)
results = [] # Placeholder - replace with results from GitHub search.
# Example: Simulate some results for demonstration purposes.
if results:
save_results(results, RESULTS_FILE)
else:
logging.info("No results found.")
logging.info("Script finished.")
if __name__ == "__main__":
main()
```