Skip to content

Commit 7a7c0a6

Browse files
committed
wip: Check snippets
1 parent 572c2b7 commit 7a7c0a6

File tree

1 file changed

+317
-0
lines changed

1 file changed

+317
-0
lines changed

.github/scripts/check_snippets.py

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
import json
2+
import re
3+
import os
4+
import requests
5+
import sys
6+
7+
def parse_dependencies_json(json_file_path):
    """
    Load a dependencies JSON file and return only the repository-type entries.

    Args:
        json_file_path: Path to the JSON file containing the dependencies

    Returns:
        List of dependency dicts whose 'category' is 'repositories'; an
        empty list on any read/parse error (errors are printed, not raised).
    """
    try:
        with open(json_file_path, 'r') as fh:
            payload = json.load(fh)
        # Keep only the entries explicitly categorized as repositories.
        return [dep for dep in payload.get('outdated_dependencies', [])
                if dep.get('category') == 'repositories']
    except FileNotFoundError:
        print(f"Error: File {json_file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Error: File {json_file_path} contains invalid JSON.")
        return []
    except Exception as e:
        print(f"Error parsing dependencies: {str(e)}")
        return []
38+
39+
def find_repository_references(root_dir, repo_dependencies):
    """
    Search through the codebase for repository references matching the patterns.

    Scans every Markdown (.md) file under root_dir for templated GitHub /
    raw.githubusercontent.com URLs whose version placeholder names one of
    the given repository dependencies.

    Args:
        root_dir: Root directory of the codebase to search
        repo_dependencies: List of repository dependency dicts (each with a
            'name' key) to check for

    Returns:
        List of dicts with keys 'file', 'line_number', 'match_text',
        'repo_var' and 'full_line' for every matching reference.
    """
    # Set for O(1) membership tests while scanning.
    repo_names = {repo['name'] for repo in repo_dependencies}

    # Compile once, outside the loops — these run against every line of
    # every Markdown file in the tree.
    patterns = [
        # GitHub blob URLs with a trailing #L<line> reference
        re.compile(r'https://github\.com/.*?/blob/\{\{\s*dependencies\.repositories\.([a-zA-Z_]+)\.version\s*\}\}.*?#L\d+'),
        # raw.githubusercontent.com tag URLs with a :start:end line range
        re.compile(r'https://raw\.githubusercontent\.com/.*?/refs/tags/\{\{\s*dependencies\.repositories\.([a-zA-Z_]+)\.version\s*\}\}.*?:\d+:\d+'),
    ]

    results = []

    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith('.md'):  # only Markdown docs are scanned
                continue
            file_path = os.path.join(dirpath, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    for line_num, line in enumerate(file, 1):
                        for pattern in patterns:
                            for match in pattern.finditer(line):
                                repo_var = match.group(1)
                                # Only report placeholders naming a known dependency.
                                if repo_var in repo_names:
                                    results.append({
                                        'file': file_path,
                                        'line_number': line_num,
                                        'match_text': match.group(0),
                                        'repo_var': repo_var,
                                        'full_line': line.strip(),
                                    })
            except (UnicodeDecodeError, IOError) as e:
                print(f"Error reading file {file_path}: {str(e)}")

    return results
90+
91+
def convert_to_raw_url(github_url):
    """
    Convert a GitHub blob URL into its raw.githubusercontent.com equivalent.

    Expected input shape:
        https://github.com/{owner}/{repo}/blob/{branch}/{path}[#L{line}]

    Args:
        github_url: GitHub URL to convert

    Returns:
        Tuple (raw_url, line_number) where line_number is the digit string
        after '#L' or None; (None, None) when the URL does not match.
    """
    blob_re = r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+?)(?:#L(\d+))?$'
    parts = re.match(blob_re, github_url)
    if parts is None:
        return None, None

    owner, repo, branch, path, line_no = parts.groups()
    return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}", line_no
110+
111+
def extract_line_range(url):
    """
    Split a ':start:end' line-range suffix off a raw GitHub URL.

    Args:
        url: URL that may end in ':<start>:<end>'

    Returns:
        Tuple (base_url, start, end) with integer bounds when a range is
        present, otherwise (url, None, None).
    """
    found = re.match(r'(.+):(\d+):(\d+)$', url)
    if not found:
        return url, None, None

    base, lo, hi = found.groups()
    return base, int(lo), int(hi)
130+
131+
def fetch_code_snippet(url, version, line_start=None, line_end=None):
    """
    Download code from a raw GitHub URL, optionally keeping only some lines.

    Args:
        url: Raw GitHub URL to fetch
        version: Version string (kept for interface compatibility; never
            referenced here — the URL is expected to already contain it)
        line_start: First line to keep, 1-based (optional)
        line_end: Last line to keep, inclusive (optional)

    Returns:
        The requested snippet as a single newline-joined string, or None
        when the HTTP request fails (the error is printed).
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching code: {str(e)}")
        return None

    lines = response.text.splitlines()

    if line_start is not None and line_end is not None:
        # 1-based inclusive range -> 0-based half-open slice.
        lines = lines[line_start - 1:line_end]
    elif line_start is not None:
        # Only a single line was requested.
        lines = [lines[line_start - 1]]

    return '\n'.join(lines)
161+
162+
def compare_code_snippets(repo_references, repo_dependencies):
    """
    Compare referenced code snippets between current and latest versions.

    For every reference found in the docs, the version placeholder is
    substituted with both the current and the latest version, both snippets
    are fetched, and the texts are compared.

    Args:
        repo_references: List of repository references found in the codebase
            (as produced by find_repository_references)
        repo_dependencies: List of repository dependency dicts with 'name',
            'current_version' and 'latest_version' keys

    Returns:
        List of dicts describing each comparison, including a boolean
        'match' flag and both fetched snippets.
    """
    # Lookup table: repo name -> dependency record.
    repo_dict = {repo['name']: repo for repo in repo_dependencies}
    # Matches the {{ dependencies.repositories.<name>.version }} placeholder;
    # compiled once instead of per reference.
    placeholder_re = re.compile(
        r'\{\{\s*dependencies\.repositories\.[a-zA-Z_]+\.version\s*\}\}')
    results = []

    for ref in repo_references:
        repo_name = ref['repo_var']
        if repo_name not in repo_dict:
            continue

        repo = repo_dict[repo_name]
        current_version = repo['current_version']
        latest_version = repo['latest_version']

        url = ref['match_text']
        # Substitute the placeholder with each concrete version.
        current_url = placeholder_re.sub(current_version, url)
        latest_url = placeholder_re.sub(latest_version, url)

        # Each branch fully determines raw URLs AND the line range, so the
        # range extracted from a raw URL can never be clobbered afterwards.
        if "raw.githubusercontent.com" in current_url:
            # Already a raw URL; strip a trailing :start:end range if present.
            raw_current_url, start_line, end_line = extract_line_range(current_url)
            raw_latest_url, _, _ = extract_line_range(latest_url)
        else:
            # GitHub blob URL; convert and honour a trailing #L<line> anchor.
            raw_current_url, line_num = convert_to_raw_url(current_url)
            raw_latest_url, _ = convert_to_raw_url(latest_url)
            if line_num:
                start_line, end_line = int(line_num), int(line_num)
            else:
                start_line, end_line = None, None

        print(f"Fetching code snippets for {repo_name}...")

        print(f"Current URL: {current_url}")
        current_code = fetch_code_snippet(raw_current_url, current_version, start_line, end_line)
        print(current_code)

        print(f"Latest: {latest_url}")
        latest_code = fetch_code_snippet(raw_latest_url, latest_version, start_line, end_line)
        print(latest_code)

        # Compare only when both fetches succeeded; a failed fetch (None)
        # counts as "not matching" so the snippet gets flagged for review.
        # Explicit None checks so an empty-but-identical snippet still matches.
        if current_code is not None and latest_code is not None:
            match = (current_code == latest_code)
        else:
            match = False

        results.append({
            'file': ref['file'],
            'line_number': ref['line_number'],
            'repo_name': repo_name,
            'current_version': current_version,
            'latest_version': latest_version,
            'current_url': current_url,
            'latest_url': latest_url,
            'match': match,
            'current_code': current_code,
            'latest_code': latest_code
        })

    return results
241+
242+
def check_outdated_snippets(json_file_path, codebase_root_dir):
    """
    End-to-end check for outdated code snippets in the documentation codebase.

    Parses the dependency manifest, scans the docs for templated repository
    references, compares current vs. latest snippets, then writes a
    per-repository 'outdated_snippets' list back into the JSON file.

    Args:
        json_file_path: Path to the JSON file containing dependencies
        codebase_root_dir: Root directory of the documentation codebase

    Returns:
        The updated dependencies data (also persisted to json_file_path).
    """
    # 1) Repository dependencies from the JSON manifest.
    repo_dependencies = parse_dependencies_json(json_file_path)
    print(f"Found {len(repo_dependencies)} repository dependencies.")

    # 2) Templated references to those repositories in the docs.
    repo_references = find_repository_references(codebase_root_dir, repo_dependencies)
    print(f"Found {len(repo_references)} repository references in the codebase.")

    # 3) Snippet comparison between current and latest versions.
    comparison_results = compare_code_snippets(repo_references, repo_dependencies)

    # Anything that did not match (including failed fetches) is outdated.
    outdated_snippets = [entry for entry in comparison_results if not entry['match']]
    print(f"Found {len(outdated_snippets)} outdated code snippets.")

    # 4) Persist the findings back into the manifest.
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    # Group the outdated snippets by repository name.
    repo_to_outdated = {}
    for snippet in outdated_snippets:
        repo_to_outdated.setdefault(snippet['repo_name'], []).append({
            'file': snippet['file'],
            'line_number': snippet['line_number'],
            'current_url': snippet['current_url'],
            'latest_url': snippet['latest_url']
        })

    # Every repository entry gets an 'outdated_snippets' list (possibly empty).
    for dependency in data['outdated_dependencies']:
        if dependency['category'] == 'repositories':
            dependency['outdated_snippets'] = repo_to_outdated.get(dependency['name'], [])

    with open(json_file_path, 'w') as file:
        json.dump(data, file, indent=2)

    return data
299+
300+
if __name__ == "__main__":
    # Expect exactly two CLI arguments: the manifest and the docs root.
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file_path> <codebase_root_dir>")
        sys.exit(1)

    json_file_path, codebase_root_dir = sys.argv[1], sys.argv[2]

    updated_data = check_outdated_snippets(json_file_path, codebase_root_dir)

    # Summarize outdated snippets per repository.
    for dependency in updated_data['outdated_dependencies']:
        if dependency['category'] != 'repositories':
            continue
        outdated = dependency.get('outdated_snippets', [])
        snippet_count = len(outdated)
        if snippet_count > 0:
            print(f"\n{dependency['name']}: {snippet_count} outdated snippets")
            for i, snippet in enumerate(outdated, 1):
                print(f"  {i}. {snippet['file']}:{snippet['line_number']}")

0 commit comments

Comments
 (0)