1+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+ # SPDX-License-Identifier: Apache-2.0
3+
4+ """
5+ Deep validator module for WRITEME to check for issues in the codebase.
6+ This version performs a more thorough check for duplicate snippet tags by
7+ directly scanning the files in the repository.
8+ """
9+
10+ import logging
11+ import os
12+ import re
13+ import concurrent .futures
14+ from collections import defaultdict
15+ from pathlib import Path
16+ from typing import Dict , List , Set , Tuple , Optional , Any
17+
18+ from aws_doc_sdk_examples_tools .doc_gen import DocGen
19+
20+ logger = logging .getLogger (__name__ )
21+
22+
class ValidationError(Exception):
    """Signals that deep validation failed.

    Raised by ``validate_snippets_deep`` when duplicate snippet tags are
    found and strict mode is enabled.
    """
26+
27+
# Regexes for the supported snippet-tag spellings. They are compiled once at
# import time because every pattern is applied to every line of every file.
_SNIPPET_TAG_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # Standard snippet tag format
        r'snippet-start:\s*\[([^\]]+)\]',
        r'snippet-end:\s*\[([^\]]+)\]',
        # Alternative formats
        r'SNIPPET\s+START\s+\[([^\]]+)\]',
        r'SNIPPET\s+END\s+\[([^\]]+)\]',
        r'//\s*SNIPPET:\s*([^\s]+)',
        r'#\s*SNIPPET:\s*([^\s]+)',
        r'<!--\s*SNIPPET:\s*([^\s]+)\s*-->',
        # Any other potential tag format. The negative lookahead stops the
        # "start"/"end" keyword of a standard "snippet-start:" marker from
        # being reported as if it were a tag name.
        r'snippet[:\-_](?!(?:start|end)\b)([a-zA-Z0-9_\-]+)',
        # Common AWS SDK snippet formats
        r'//\s*snippet-start:\s*([^\s]+)',
        r'#\s*snippet-start:\s*([^\s]+)',
        r'<!--\s*snippet-start:\s*([^\s]+)\s*-->',
        r'//\s*snippet-end:\s*([^\s]+)',
        r'#\s*snippet-end:\s*([^\s]+)',
        r'<!--\s*snippet-end:\s*([^\s]+)\s*-->',
    )
)

# Matches a capture that still carries its surrounding brackets, e.g. "[tag]".
_BRACKETED_TAG = re.compile(r'\[([^\]]+)\]')


def _normalize_snippet_tag(raw_tag: str) -> str:
    """Return the tag name without the square brackets some patterns capture."""
    bracketed = _BRACKETED_TAG.match(raw_tag)
    return bracketed.group(1) if bracketed else raw_tag


def find_snippet_tags_in_file(file_path: Path) -> List[Tuple[str, int]]:
    """
    Find all snippet tags in a file by directly parsing the file content.

    Several of the recognised patterns overlap (for example
    ``# snippet-start:[tag]`` is matched both by the bracketed form and by the
    comment-prefixed form). Captures are therefore normalized (brackets
    removed) and each ``(tag, line_number)`` pair is reported only once.
    Start and end markers are both reported, each on its own line.

    Args:
        file_path: Path to the file to check

    Returns:
        List of tuples containing (tag, line_number), ordered by line number;
        line numbers are 1-based. An empty list is returned when the file does
        not exist or cannot be read.
    """
    if not file_path.exists():
        return []

    try:
        # errors='replace' keeps undecodable bytes from aborting the scan.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
    except OSError as e:
        logger.warning(f"Error reading file {file_path} : {e} ")
        return []

    results = []
    seen = set()
    for line_number, line in enumerate(lines, 1):
        for pattern in _SNIPPET_TAG_PATTERNS:
            for raw_tag in pattern.findall(line):
                entry = (_normalize_snippet_tag(raw_tag), line_number)
                # Overlapping patterns yield the same tag repeatedly; keep
                # only the first hit so callers see one entry per line.
                if entry not in seen:
                    seen.add(entry)
                    results.append(entry)

    return results
78+
79+
def scan_directory_for_snippet_tags(
    root_dir: Path,
    extensions: Optional[List[str]] = None,
    max_workers: int = 10
) -> Dict[str, List[Tuple[str, int, str]]]:
    """
    Scan a directory recursively for files containing snippet tags.
    Uses a thread pool so that file reads overlap.

    Every occurrence of a tag is recorded, including repeated occurrences
    inside the same file, together with up to three lines of surrounding
    text (the line before, the tagged line and the line after).

    Args:
        root_dir: Root directory to scan
        extensions: Optional list of file name suffixes to check; a default
            list of source and markup extensions is used when omitted
        max_workers: Maximum number of parallel workers

    Returns:
        Dictionary mapping snippet tags to lists of
        (file_path, line_number, context), where file_path is relative to
        root_dir. Files are processed in sorted path order, so the order of
        the locations is deterministic.
    """
    if extensions is None:
        # Default extensions to check
        extensions = [
            '.py', '.java', '.js', '.ts', '.cs', '.cpp', '.c', '.go', '.rb',
            '.php', '.swift', '.kt', '.rs', '.abap', '.md', '.html', '.xml'
        ]
    # str.endswith accepts a tuple, which replaces a per-file any() loop.
    suffixes = tuple(extensions)

    # Find all files with the specified extensions
    files_to_scan = sorted(
        Path(dirpath) / name
        for dirpath, _, names in os.walk(root_dir)
        for name in names
        if name.endswith(suffixes)
    )

    def process_file(file_path):
        """Return [(tag, (relative_path, line_number, context)), ...] for one file."""
        try:
            tags = find_snippet_tags_in_file(file_path)
            if not tags:
                return []

            relative_path = str(file_path.relative_to(root_dir))

            # Read the file once for all tags rather than once per tag.
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    lines = f.readlines()
            except OSError:
                lines = None

            found = []
            for tag, line_number in tags:
                if lines is None:
                    context = "<context unavailable>"
                else:
                    # line_number is 1-based: this slice covers the previous
                    # line, the tagged line and the following line.
                    start_line = max(0, line_number - 2)
                    end_line = min(len(lines), line_number + 1)
                    context = ''.join(lines[start_line:end_line]).strip()
                found.append((tag, (relative_path, line_number, context)))
            return found
        except Exception as e:
            # Worker boundary: one unreadable file must not abort the scan.
            logger.warning(f"Error processing file {file_path} : {e} ")
            return []

    tag_to_locations = defaultdict(list)

    # Executor.map yields results in submission order, keeping output stable.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for file_results in executor.map(process_file, files_to_scan):
            for tag, location in file_results:
                tag_to_locations[tag].append(location)

    return tag_to_locations
148+
149+
def check_duplicate_snippet_tags_deep(doc_gen: DocGen) -> List[Tuple[str, List[Dict[str, Any]]]]:
    """
    Report snippet tags that occur in more than one file.

    The repository rooted at ``doc_gen.root`` is scanned on disk; no other
    attribute of ``doc_gen`` is read here. A tag that is repeated only within
    a single file is not considered a duplicate.

    Args:
        doc_gen: The DocGen instance whose ``root`` directory is scanned

    Returns:
        List of (tag, details) tuples, one per duplicated tag. ``details`` is
        a list of ``{"file": path, "occurrences": [...]}`` dictionaries, and
        each occurrence is a ``{"line": ..., "context": ...}`` dictionary.
    """
    logger.info("Starting deep scan for duplicate snippet tags...")

    scanned = scan_directory_for_snippet_tags(doc_gen.root)

    duplicates = []
    for tag, hits in scanned.items():
        # Bucket every hit for this tag under the file it was found in.
        per_file = {}
        for path, line_no, ctx in hits:
            per_file.setdefault(path, []).append({"line": line_no, "context": ctx})

        # A single file may legitimately repeat a tag (start/end markers).
        if len(per_file) <= 1:
            continue

        details = [
            {"file": path, "occurrences": found}
            for path, found in per_file.items()
        ]
        duplicates.append((tag, details))

    logger.info(f"Deep scan complete. Found {len(duplicates)} duplicate tags.")
    return duplicates
189+
190+
def format_duplicate_report(duplicates: List[Tuple[str, List[Dict[str, Any]]]]) -> str:
    """
    Render duplicate snippet tag information as human-readable text.

    Args:
        duplicates: List of (tag, locations) tuples, where each location is a
            dictionary with "file" and "occurrences" keys

    Returns:
        The report as a single string, or a fixed "nothing found" message
        when ``duplicates`` is empty
    """
    if not duplicates:
        return "No duplicate snippet tags found."

    def describe(occurrence):
        """Build the report line for one occurrence of a tag."""
        line = occurrence.get("line", "unknown")
        context = occurrence.get("context", "").replace("\n ", " ").strip()
        # Only the first 60 characters of the context are shown.
        suffix = f" - Context: {context[:60]} ..." if context else ""
        return f" Line {line} {suffix} "

    pieces = [f"Found {len(duplicates)} duplicate snippet tags:"]

    for tag, locations in duplicates:
        pieces.append(f"\n Tag: '{tag} ' found in {len(locations)} files:")

        for entry in locations:
            path = entry["file"]
            hits = entry["occurrences"]
            pieces.append(f" File: {path} ")
            pieces.extend(describe(hit) for hit in hits)

    return "\n ".join(pieces)
222+
223+
def validate_snippets_deep(doc_gen: DocGen, strict: bool = False) -> bool:
    """
    Run the deep duplicate-tag check and print its outcome.

    Args:
        doc_gen: The DocGen instance handed to the deep duplicate check
        strict: If True, raise instead of returning False when duplicates
            are found

    Returns:
        True if no duplicate snippet tags were found, False otherwise

    Raises:
        ValidationError: If duplicates are found and ``strict`` is True; the
            report is printed before the exception is raised
    """
    duplicates = check_duplicate_snippet_tags_deep(doc_gen)

    # Happy path first: nothing to report.
    if not duplicates:
        print("No duplicate snippet tags found in deep scan.")
        return True

    report = format_duplicate_report(duplicates)
    print("\n === DUPLICATE SNIPPET TAGS (DEEP SCAN) ===")
    print(report)

    # Exit with error if strict validation is enabled
    if strict:
        raise ValidationError("Validation failed: duplicate snippet tags found")

    return False
0 commit comments