1+ import json
2+ import re
3+ import os
4+ import requests
5+ import sys
6+
def parse_dependencies_json(json_file_path):
    """
    Load a dependencies JSON file and return only its "repositories" entries.

    Args:
        json_file_path: Path to the JSON file containing the dependencies.

    Returns:
        List of dependency dicts from 'outdated_dependencies' whose
        category is 'repositories'; an empty list on any read or parse
        error (an error message is printed instead of raising).
    """
    try:
        with open(json_file_path, 'r') as handle:
            parsed = json.load(handle)

        # Collect only the repository-category dependencies.
        repositories = []
        for entry in parsed.get('outdated_dependencies', []):
            if entry.get('category') == 'repositories':
                repositories.append(entry)
        return repositories
    except FileNotFoundError:
        print(f"Error: File {json_file_path} not found.")
        return []
    except json.JSONDecodeError:
        print(f"Error: File {json_file_path} contains invalid JSON.")
        return []
    except Exception as e:
        print(f"Error parsing dependencies: {str(e)}")
        return []
38+
def find_repository_references(root_dir, repo_dependencies):
    """
    Search Markdown files under root_dir for templated repository URLs.

    Two URL shapes are recognized, both carrying a
    ``{{ dependencies.repositories.<name>.version }}`` placeholder:
      * github.com ``/blob/`` links ending in a ``#L<line>`` anchor, and
      * raw.githubusercontent.com ``/refs/tags/`` links ending in a
        ``:<start>:<end>`` line-range suffix.

    Args:
        root_dir: Root directory of the codebase to search.
        repo_dependencies: List of repository dependency dicts; each must
            have a 'name' key.

    Returns:
        List of dicts with keys 'file', 'line_number', 'match_text',
        'repo_var', and 'full_line' for every match whose placeholder
        name belongs to a known dependency.
    """
    # Set membership is O(1); the original list scan was O(n) per match.
    repo_names = {repo['name'] for repo in repo_dependencies}

    # Compile once, outside the file/line loops, instead of re-resolving
    # the pattern for every line scanned.
    patterns = [
        # GitHub blob URL with a templated version and a #L<line> anchor.
        re.compile(r'https://github\.com/.*?/blob/\{\{\s*dependencies\.repositories\.([a-zA-Z_]+)\.version\s*\}\}.*?#L\d+'),
        # raw.githubusercontent.com tag URL with a :start:end line range.
        re.compile(r'https://raw\.githubusercontent\.com/.*?/refs/tags/\{\{\s*dependencies\.repositories\.([a-zA-Z_]+)\.version\s*\}\}.*?:\d+:\d+'),
    ]

    results = []

    # Walk the whole tree; only Markdown files are searched.
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith('.md'):
                continue
            file_path = os.path.join(dirpath, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    for line_num, line in enumerate(file, 1):
                        for pattern in patterns:
                            for match in pattern.finditer(line):
                                repo_var = match.group(1)
                                # Ignore placeholders naming unknown repos.
                                if repo_var in repo_names:
                                    results.append({
                                        'file': file_path,
                                        'line_number': line_num,
                                        'match_text': match.group(0),
                                        'repo_var': repo_var,
                                        'full_line': line.strip()
                                    })
            except (UnicodeDecodeError, IOError) as e:
                print(f"Error reading file {file_path}: {str(e)}")

    return results
90+
def convert_to_raw_url(github_url):
    """
    Convert a github.com "blob" URL into its raw.githubusercontent.com form.

    Expected input shape:
        https://github.com/{owner}/{repo}/blob/{branch}/{path}[#L{line}]

    Args:
        github_url: GitHub URL to convert.

    Returns:
        Tuple (raw_url, line_num) where line_num is the digit string from
        a trailing ``#L<n>`` anchor or None; (None, None) when the URL
        does not match the expected shape.
    """
    pattern = r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+?)(?:#L(\d+))?$'
    match = re.match(pattern, github_url)
    if not match:
        return None, None

    owner, repo, branch, path, line_num = match.groups()
    # The URL must be emitted with no whitespace between path segments —
    # the previous f-string contained stray spaces after each field,
    # producing an unfetchable URL.
    raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
    return raw_url, line_num
110+
def extract_line_range(url):
    """
    Split a trailing ``:<start>:<end>`` line-range suffix off a URL.

    Args:
        url: URL that may end with a ``:start:end`` line-range suffix.

    Returns:
        Tuple (base_url, start_line, end_line) with the line numbers as
        ints; when the URL carries no range suffix, (url, None, None).
    """
    range_match = re.match(r'(.+):(\d+):(\d+)$', url)
    if range_match is None:
        # No trailing line range on this URL.
        return url, None, None

    base_url = range_match.group(1)
    return base_url, int(range_match.group(2)), int(range_match.group(3))
130+
def fetch_code_snippet(url, version, line_start=None, line_end=None):
    """
    Fetch code from a raw GitHub URL, optionally extracting specific lines.

    Args:
        url: Raw GitHub URL to download.
        version: Version associated with this snippet. NOTE(review): not
            used in the body — kept only for interface compatibility with
            existing callers.
        line_start: First line to keep, 1-based (optional).
        line_end: Last line to keep, inclusive (optional).

    Returns:
        The requested code as a single newline-joined string, or None
        when the HTTP request fails.
    """
    try:
        # Timeout so a stalled connection cannot hang the whole scan.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        content = response.text.splitlines()

        if line_start is not None and line_end is not None:
            # 1-based, inclusive on both ends.
            content = content[line_start - 1:line_end]
        elif line_start is not None:
            # Only a single line was referenced.
            content = [content[line_start - 1]]

        # Join with a bare '\n': the previous separator ('\n' plus a
        # space) prepended a stray space to every continuation line and
        # made snippet equality comparisons fail.
        return '\n'.join(content)
    except requests.RequestException as e:
        print(f"Error fetching code: {str(e)}")
        return None
161+
def compare_code_snippets(repo_references, repo_dependencies):
    """
    Compare code snippets between current and latest versions.

    For each reference found in the docs, the version placeholder in the
    URL is substituted with both the current and the latest version, both
    snippets are fetched over HTTP (network I/O via fetch_code_snippet),
    and the result records whether the two snippets are identical.

    Args:
        repo_references: List of repository references found in the
            codebase (dicts produced by find_repository_references).
        repo_dependencies: List of repository dependency dicts; each must
            carry 'name', 'current_version' and 'latest_version' keys.

    Returns:
        List of dictionaries containing the comparison results, one per
        reference whose repository is known.
    """
    # Create a lookup dictionary for repositories, keyed by name.
    repo_dict = {repo['name']: repo for repo in repo_dependencies}
    results = []

    for ref in repo_references:
        repo_name = ref['repo_var']
        # Skip references to repositories we have no version info for.
        if repo_name not in repo_dict:
            continue

        repo = repo_dict[repo_name]
        current_version = repo['current_version']
        latest_version = repo['latest_version']

        url = ref['match_text']
        # Matches the {{ dependencies.repositories.<name>.version }} placeholder.
        url_pattern = re.compile(r'\{\{\s*dependencies\.repositories\.[a-zA-Z_]+\.version\s*\}\}')

        # Same URL rendered with the pinned and with the newest version.
        current_url = url_pattern.sub(current_version, url)
        latest_url = url_pattern.sub(latest_version, url)

        # Check if it's already a raw URL
        if "raw.githubusercontent.com" in current_url:
            raw_current_url, line_num = current_url, None
            raw_latest_url, line_num = latest_url, None

            # Extract trailing :start:end line ranges if present.
            raw_current_url, start_line, end_line = extract_line_range(raw_current_url)
            raw_latest_url, _, _ = extract_line_range(raw_latest_url)
        else:
            # github.com blob URL: convert it, keeping any #L<line> anchor.
            raw_current_url, line_num = convert_to_raw_url(current_url)
            raw_latest_url, _ = convert_to_raw_url(latest_url)

            # A #L anchor references a single line, so start == end.
            # NOTE(review): this must stay inside the else-branch; hoisting
            # it out would clobber the :start:end range extracted above.
            if line_num:
                start_line, end_line = int(line_num), int(line_num)
            else:
                start_line, end_line = None, None

        # Fetch both versions of the code
        print(f"Fetching code snippets for {repo_name}...")

        # Print url with line numbers
        print(f"Current URL: {current_url}")
        current_code = fetch_code_snippet(raw_current_url, current_version, start_line, end_line)
        print(current_code)

        print(f"Latest: {latest_url}")
        latest_code = fetch_code_snippet(raw_latest_url, latest_version, start_line, end_line)
        print(latest_code)

        # Falsy snippets (failed fetch or empty text) count as non-matching.
        match = (current_code == latest_code) if current_code and latest_code else False

        results.append({
            'file': ref['file'],
            'line_number': ref['line_number'],
            'repo_name': repo_name,
            'current_version': current_version,
            'latest_version': latest_version,
            'current_url': current_url,
            'latest_url': latest_url,
            'match': match,
            'current_code': current_code,
            'latest_code': latest_code
        })

    return results
241+
def check_outdated_snippets(json_file_path, codebase_root_dir):
    """
    Run the full outdated-snippet check and persist the results.

    Pipeline: parse the dependency JSON, scan the docs tree for templated
    repository references, fetch and compare current vs. latest snippet
    contents, then write an 'outdated_snippets' list onto every
    repository entry of the JSON file.

    Args:
        json_file_path: Path to the JSON file containing dependencies.
        codebase_root_dir: Root directory of the documentation codebase.

    Returns:
        The updated dependencies data (also written back to the file).
    """
    # Step 1: repository dependencies from the JSON manifest.
    repo_dependencies = parse_dependencies_json(json_file_path)
    print(f"Found {len(repo_dependencies)} repository dependencies.")

    # Step 2: references to those repositories inside the docs tree.
    repo_references = find_repository_references(codebase_root_dir, repo_dependencies)
    print(f"Found {len(repo_references)} repository references in the codebase.")

    # Step 3: fetch and compare the snippet contents for each reference.
    comparison_results = compare_code_snippets(repo_references, repo_dependencies)

    # Keep only the snippets whose current and latest contents differ.
    outdated_snippets = [entry for entry in comparison_results if not entry['match']]
    print(f"Found {len(outdated_snippets)} outdated code snippets.")

    # Step 4: merge the findings back into the original JSON document.
    with open(json_file_path, 'r') as handle:
        data = json.load(handle)

    # Group the outdated snippets by repository name.
    repo_to_outdated = {}
    for snippet in outdated_snippets:
        repo_to_outdated.setdefault(snippet['repo_name'], []).append({
            'file': snippet['file'],
            'line_number': snippet['line_number'],
            'current_url': snippet['current_url'],
            'latest_url': snippet['latest_url']
        })

    # Every repository entry gets an 'outdated_snippets' field,
    # empty when nothing outdated was found for it.
    for dependency in data['outdated_dependencies']:
        if dependency['category'] == 'repositories':
            dependency['outdated_snippets'] = repo_to_outdated.get(dependency['name'], [])

    # Persist the annotated document.
    with open(json_file_path, 'w') as handle:
        json.dump(data, handle, indent=2)

    return data
299+
if __name__ == "__main__":
    # Expect exactly two arguments: the JSON manifest and the docs root.
    if len(sys.argv) != 3:
        print("Usage: python script.py <json_file_path> <codebase_root_dir>")
        sys.exit(1)

    json_file_path, codebase_root_dir = sys.argv[1], sys.argv[2]

    updated_data = check_outdated_snippets(json_file_path, codebase_root_dir)

    # Summarize every repository that has at least one outdated snippet.
    for dependency in updated_data['outdated_dependencies']:
        if dependency['category'] != 'repositories':
            continue
        outdated = dependency.get('outdated_snippets', [])
        snippet_count = len(outdated)
        if snippet_count > 0:
            print(f"\n{dependency['name']}: {snippet_count} outdated snippets")
            for i, snippet in enumerate(outdated, 1):
                print(f"  {i}. {snippet['file']}:{snippet['line_number']}")
# (removed stray "0 commit comments" text — a web-page scrape artifact, not Python code)