feat: enhance detector factory and LLM detection capabilities

d-oit · d-oit · commit 3732bf451876 · 2025-10-17T11:43:42.000Z
diff --git a/crates/core/src/detector_factory.rs b/crates/core/src/detector_factory.rs
@@ -216,6 +216,23 @@ impl DetectorFactory {
             DetectorType::LLMPythonIssues => Ok(Some(Box::new(PythonLLMIssuesDetector))),
             DetectorType::LLMGeneratedComments => Ok(Some(Box::new(LLMGeneratedCommentsDetector))),
 
+            // Advanced LLM-specific patterns
+            DetectorType::LLMAIModelHallucination => {
+                Ok(Some(Box::new(AIModelHallucinationDetector)))
+            }
+            DetectorType::LLMIncorrectAsync => Ok(Some(Box::new(IncorrectAsyncDetector))),
+            DetectorType::LLMSecurityAntipattern => {
+                Ok(Some(Box::new(LLMSecurityAntipatternDetector)))
+            }
+            DetectorType::LLMDBAntipattern => Ok(Some(Box::new(LLMDBAntipatternDetector))),
+            DetectorType::LLMErrorHandlingMistake => {
+                Ok(Some(Box::new(LLMErrorHandlingMistakesDetector)))
+            }
+            DetectorType::LLMPerformanceMistake => {
+                Ok(Some(Box::new(LLMPerformanceMistakesDetector)))
+            }
+            DetectorType::LLMTypeMistake => Ok(Some(Box::new(LLMTypeMistakesDetector))),
+
             // Comprehensive LLM detector
             DetectorType::LLMComprehensive => Ok(Some(Box::new(ComprehensiveLLMDetector::new()))),
 
diff --git a/crates/core/src/detectors.rs b/crates/core/src/detectors.rs
@@ -1,4 +1,5 @@
 use crate::{Match, PatternDetector};
+use aho_corasick::AhoCorasick;
 use anyhow::Result;
 use lazy_static::lazy_static;
 use regex::Regex;
@@ -47,7 +48,7 @@ fn detect_pattern_with_context(
     pattern_name: &str,
     re: &Regex,
 ) -> Vec<Match> {
-    let mut matches = Vec::new();
+    let mut matches = smallvec::SmallVec::<[Match; 4]>::new();
     for (line_idx, line) in content.lines().enumerate() {
         for mat in re.find_iter(line) {
             // Extract more context around the match
@@ -64,7 +65,7 @@ fn detect_pattern_with_context(
             });
         }
     }
-    matches
+    matches.into_vec()
 }
 
 /// Default detector for TODO comments (case-insensitive)
@@ -427,6 +428,85 @@ impl PatternDetector for CustomPatternDetector {
     }
 }
 
+/// Multi-pattern detector backed by a single Aho-Corasick automaton.
+///
+/// Every configured pattern is searched for in one pass over the input.
+/// The automaton treats patterns as plain substrings, not regular expressions.
+pub struct HighPerformanceDetector {
+    /// Automaton compiled from the pattern strings.
+    ac: AhoCorasick,
+    /// Reporting name of each pattern, indexed by the pattern's id in `ac`.
+    pattern_names: Vec<String>,
+}
+
+impl HighPerformanceDetector {
+    /// Creates a detector that reports every occurrence of the given patterns.
+    ///
+    /// Each entry is a `(name, pattern)` pair. `pattern` is matched as a literal,
+    /// case-sensitive substring (Aho-Corasick does not interpret regex syntax);
+    /// `name` is the label a hit is reported under.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the automaton cannot be built from the patterns.
+    pub fn new(patterns: Vec<(&str, &str)>) -> Result<Self> {
+        Self::with_builder(patterns, &AhoCorasick::builder())
+    }
+
+    /// Creates a detector for common TODO/FIXME-style markers and debug leftovers.
+    ///
+    /// The patterns are literal substrings, because the automaton cannot
+    /// evaluate regular expressions. Consequences worth knowing:
+    ///
+    /// * matching is ASCII case-insensitive for *every* pattern, including
+    ///   `XXX` and `panic!`;
+    /// * regex alternations are expanded into one literal per alternative, all
+    ///   reported under the same name (e.g. `console.log`, `console.warn`);
+    /// * `UNSAFE` matches `unsafe {` with exactly one space;
+    /// * `PHASE` matches `phase` followed by an optional single space and a digit.
+    ///
+    /// When several patterns could match at the same position the longest one
+    /// wins, so `debugger` is reported as `DEBUGGER` rather than `DEBUG`.
+    pub fn for_common_patterns() -> Self {
+        // Owned storage for the generated "phase<digit>" / "phase <digit>" literals.
+        let phase_literals: Vec<String> = (0..10)
+            .flat_map(|digit| [format!("phase{}", digit), format!("phase {}", digit)])
+            .collect();
+
+        let mut patterns = vec![
+            ("TODO", "todo"),
+            ("FIXME", "fixme"),
+            ("HACK", "hack"),
+            ("BUG", "bug"),
+            ("XXX", "XXX"),
+            ("NOTE", "note"),
+            ("WARNING", "warning"),
+            ("PANIC", "panic!"),
+            ("UNWRAP", ".unwrap()"),
+            ("UNSAFE", "unsafe {"),
+            ("DEBUG", "debug"),
+            ("TEST", "test"),
+            ("CONSOLE_LOG", "console.log"),
+            ("CONSOLE_LOG", "console.debug"),
+            ("CONSOLE_LOG", "console.info"),
+            ("CONSOLE_LOG", "console.warn"),
+            ("CONSOLE_LOG", "console.error"),
+            ("PRINT", "print"),
+            ("PRINT", "println"),
+            ("PRINT", "echo"),
+            ("ALERT", "alert("),
+            ("ALERT", "confirm("),
+            ("ALERT", "prompt("),
+            ("DEBUGGER", "debugger"),
+            ("DEBUGGER", "pdb.set_trace"),
+        ];
+        patterns.extend(
+            phase_literals
+                .iter()
+                .map(|literal| ("PHASE", literal.as_str())),
+        );
+
+        let mut builder = AhoCorasick::builder();
+        builder
+            .ascii_case_insensitive(true)
+            .match_kind(aho_corasick::MatchKind::LeftmostLongest);
+
+        Self::with_builder(patterns, &builder)
+            .expect("built-in literal patterns always form a valid automaton")
+    }
+
+    /// Splits `(name, pattern)` pairs and compiles the patterns with `builder`.
+    fn with_builder(
+        patterns: Vec<(&str, &str)>,
+        builder: &aho_corasick::AhoCorasickBuilder,
+    ) -> Result<Self> {
+        // Pattern ids are assigned in insertion order, so `pattern_names[id]`
+        // stays aligned with the automaton.
+        let (pattern_names, literals): (Vec<String>, Vec<&str>) = patterns
+            .into_iter()
+            .map(|(name, pattern)| (name.to_string(), pattern))
+            .unzip();
+
+        let ac = builder.build(literals)?;
+
+        Ok(Self { pattern_names, ac })
+    }
+}
+
+impl PatternDetector for HighPerformanceDetector {
+    /// Reports every automaton hit in `content` as a [`Match`].
+    ///
+    /// `line_number` and `column` are 1-based; `column` counts bytes from the
+    /// start of the line. The message carries up to 15 bytes of text before the
+    /// hit and 25 bytes after it, widened to the nearest UTF-8 character
+    /// boundaries and trimmed of surrounding whitespace.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        let bytes = content.as_bytes();
+        let path = file_path.to_string_lossy().into_owned();
+        let mut matches = Vec::new();
+
+        // `find_iter` yields hits in increasing start order, so line tracking
+        // resumes from the previous hit instead of rescanning from the top.
+        let mut scanned: usize = 0;
+        let mut line_number: usize = 1;
+        let mut line_start: usize = 0;
+
+        for mat in self.ac.find_iter(content) {
+            for (offset, &byte) in bytes[scanned..mat.start()].iter().enumerate() {
+                if byte == b'\n' {
+                    line_number += 1;
+                    line_start = scanned + offset + 1;
+                }
+            }
+            scanned = mat.start();
+
+            let pattern_name = &self.pattern_names[mat.pattern().as_usize()];
+
+            // Slicing a `str` off a character boundary panics, so move both
+            // ends of the context window outwards until they sit on one.
+            let mut start = mat.start().saturating_sub(15);
+            while !content.is_char_boundary(start) {
+                start -= 1;
+            }
+            let mut end = (mat.end() + 25).min(content.len());
+            while !content.is_char_boundary(end) {
+                end += 1;
+            }
+            let context = &content[start..end];
+
+            matches.push(Match {
+                file_path: path.clone(),
+                line_number,
+                column: mat.start() - line_start + 1,
+                pattern: pattern_name.clone(),
+                message: format!("{}: {}", pattern_name, context.trim()),
+            });
+        }
+
+        matches
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/core/src/llm_detectors.rs b/crates/core/src/llm_detectors.rs
@@ -6,7 +6,7 @@ use std::path::Path;
 lazy_static! {
     // Hallucinated API patterns - APIs that LLMs commonly generate but don't exist
     pub static ref HALLUCINATED_API_REGEX: Regex = Regex::new(
-        r"(?i)\.(authenticate|validateInput|sanitize|encryptData|hashPassword|secureRandom)\s*\(\s*\)"
+        r"(?i)\.(authenticate|validateInput|sanitize|encryptData|hashPassword|secureRandom|generateToken|verifySignature|encodeBase64|decodeBase64|compressData|decompressData|validateEmail|validatePhone|formatCurrency|parseJson|serializeJson)\s*\(\s*\)"
     ).unwrap();
 
     pub static ref INCOMPLETE_API_REGEX: Regex = Regex::new(
@@ -95,7 +95,42 @@ lazy_static! {
 
     // LLM-specific comment patterns that indicate AI generation
     pub static ref LLM_GENERATED_COMMENTS_REGEX: Regex = Regex::new(
-        r"(?i)//.*(?:ai generated|generated by|gpt|claude|chatgpt|copilot|based on|as an ai)"
+        r"(?i)//.*(?:ai generated|generated by|gpt|claude|chatgpt|copilot|based on|as an ai|llm|machine learning|neural network|deep learning|transformer|attention mechanism)"
+    ).unwrap();
+
+    // AI model hallucinated patterns - common incorrect implementations
+    pub static ref AI_MODEL_HALLUCINATION_REGEX: Regex = Regex::new(
+        r"(?i)(?:tensorflow\.keras|torch\.nn\.Module|sklearn\.model_selection\.GridSearchCV|transformers\.pipeline)\s*\(\s*['\x22][^'\x22]*['\x22]\s*\)\s*\.\s*(fit|predict|train|evaluate)\s*\(\s*\)"
+    ).unwrap();
+
+    // Incorrect async patterns commonly generated by LLMs
+    pub static ref INCORRECT_ASYNC_REGEX: Regex = Regex::new(
+        r"(?:async\s+function\s+\w+\s*\([^)]*\)\s*\{\s*return\s+await\s+Promise\.resolve\([^;]*\);\s*\}|await\s+\w+\s*\([^)]*\)\s*;?\s*//.*blocking|Promise\.all\([^)]*\)\s*\.\s*then\s*\([^)]*\)\s*await)"
+    ).unwrap();
+
+    // Common LLM-generated security anti-patterns
+    pub static ref LLM_SECURITY_ANTIPATTERN_REGEX: Regex = Regex::new(
+        r"(?i)(?:eval\s*\([^)]*req\.|Function\s*\([^)]*req\.|setTimeout\s*\([^)]*req\.|setInterval\s*\([^)]*req\.|innerHTML\s*=.*req\.|outerHTML\s*=.*req\.|document\.write\s*\([^)]*req\.|window\.location\s*=.*req\.|localStorage\.setItem\s*\([^,)]*,\s*req\.|sessionStorage\.setItem\s*\([^,)]*,\s*req\.)"
+    ).unwrap();
+
+    // LLM-generated database anti-patterns
+    pub static ref LLM_DB_ANTIPATTERN_REGEX: Regex = Regex::new(
+        r"(?i)(?:SELECT\s+\*\s+FROM\s+\w+\s+WHERE\s+.*=.*\+|INSERT\s+INTO\s+\w+\s+VALUES\s*\([^)]*\+|UPDATE\s+\w+\s+SET\s+.*=.*\+|DELETE\s+FROM\s+\w+\s+WHERE\s+.*=.*\+)"
+    ).unwrap();
+
+    // Common LLM-generated error handling mistakes
+    pub static ref LLM_ERROR_HANDLING_MISTAKES_REGEX: Regex = Regex::new(
+        r"(?:try\s*\{\s*[^}]*\}\s*catch\s*\([^)]*\)\s*\{\s*\}\s*//.*ignore|catch\s*\([^)]*\)\s*\{\s*console\.log\s*\([^)]*\)\s*\}\s*//.*log|throw\s+new\s+Error\s*\([^)]*\)\s*;?\s*//.*generic|\.catch\s*\([^)]*\)\s*=>\s*\{\s*\}\s*//.*empty)"
+    ).unwrap();
+
+    // LLM-generated performance issues
+    pub static ref LLM_PERFORMANCE_MISTAKES_REGEX: Regex = Regex::new(
+        r"(?:for\s*\([^)]*\)\s*\{\s*[^}]*for\s*\([^)]*\)\s*\{\s*[^}]*for\s*\([^)]*\)\s*\{\s*[^}]*\}\s*\}\s*\}\s*//.*nested|Array\.from\s*\([^)]*\)\s*\.\s*map\s*\([^)]*\)\s*\.\s*filter\s*\([^)]*\)\s*\.\s*reduce\s*\([^)]*\)\s*//.*chain|\.sort\s*\([^)]*\)\s*\.\s*reverse\s*\([^)]*\)\s*//.*inefficient)"
+    ).unwrap();
+
+    // LLM-generated incorrect type handling
+    pub static ref LLM_TYPE_MISTAKES_REGEX: Regex = Regex::new(
+        r"(?:let\s+\w+\s*:\s*any\s*=\s*[^;]*;?\s*//.*type|var\s+\w+\s*=\s*[^;]*;?\s*//.*untyped|const\s+\w+\s*=\s*null\s*;?\s*//.*nullable|function\s+\w+\s*\([^)]*\)\s*:\s*any\s*\{[^}]*\}\s*//.*return)"
     ).unwrap();
 }
 
@@ -410,6 +445,104 @@ impl PatternDetector for LLMGeneratedCommentsDetector {
     }
 }
 
+/// Flags ML-library call chains matched by `AI_MODEL_HALLUCINATION_REGEX`.
+///
+/// The pattern (case-insensitive) looks for `tensorflow.keras`,
+/// `torch.nn.Module`, `sklearn.model_selection.GridSearchCV` or
+/// `transformers.pipeline` called with a single quoted string and immediately
+/// followed by an argument-less `.fit()`, `.predict()`, `.train()` or
+/// `.evaluate()`.
+pub struct AIModelHallucinationDetector;
+
+impl PatternDetector for AIModelHallucinationDetector {
+    /// Delegates to `detect_pattern_with_context`, passing
+    /// `LLM_AI_MODEL_HALLUCINATION` as the pattern name.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        const PATTERN_NAME: &str = "LLM_AI_MODEL_HALLUCINATION";
+        let regex = &AI_MODEL_HALLUCINATION_REGEX;
+        detect_pattern_with_context(content, file_path, PATTERN_NAME, regex)
+    }
+}
+
+/// Flags JavaScript-style async misuse matched by `INCORRECT_ASYNC_REGEX`.
+///
+/// The pattern has three alternatives:
+/// * an `async function` whose whole body is `return await Promise.resolve(...)`;
+/// * an `await call(...)` followed by a `//` comment containing `blocking`;
+/// * `Promise.all(...).then(...)` directly followed by `await`.
+pub struct IncorrectAsyncDetector;
+
+impl PatternDetector for IncorrectAsyncDetector {
+    /// Delegates to `detect_pattern_with_context`, passing
+    /// `LLM_INCORRECT_ASYNC` as the pattern name.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        const PATTERN_NAME: &str = "LLM_INCORRECT_ASYNC";
+        let regex = &INCORRECT_ASYNC_REGEX;
+        detect_pattern_with_context(content, file_path, PATTERN_NAME, regex)
+    }
+}
+
+/// Flags request data flowing into dangerous sinks, as matched by
+/// `LLM_SECURITY_ANTIPATTERN_REGEX`.
+///
+/// The pattern (case-insensitive) covers:
+/// * `eval(`, `Function(`, `setTimeout(`, `setInterval(` and `document.write(`
+///   with `req.` inside the argument list;
+/// * assignments to `innerHTML`, `outerHTML` or `window.location` whose
+///   right-hand side contains `req.`;
+/// * `localStorage.setItem` / `sessionStorage.setItem` whose second argument
+///   starts with `req.`.
+pub struct LLMSecurityAntipatternDetector;
+
+impl PatternDetector for LLMSecurityAntipatternDetector {
+    /// Delegates to `detect_pattern_with_context`, passing
+    /// `LLM_SECURITY_ANTIPATTERN` as the pattern name.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        const PATTERN_NAME: &str = "LLM_SECURITY_ANTIPATTERN";
+        let regex = &LLM_SECURITY_ANTIPATTERN_REGEX;
+        detect_pattern_with_context(content, file_path, PATTERN_NAME, regex)
+    }
+}
+
+/// Flags SQL text assembled with `+`, as matched by `LLM_DB_ANTIPATTERN_REGEX`.
+///
+/// The pattern (case-insensitive) looks for a `+` after:
+/// * `SELECT * FROM <table> WHERE ... = ...`;
+/// * `INSERT INTO <table> VALUES (...`;
+/// * `UPDATE <table> SET ... = ...`;
+/// * `DELETE FROM <table> WHERE ... = ...`.
+pub struct LLMDBAntipatternDetector;
+
+impl PatternDetector for LLMDBAntipatternDetector {
+    /// Delegates to `detect_pattern_with_context`, passing
+    /// `LLM_DB_ANTIPATTERN` as the pattern name.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        const PATTERN_NAME: &str = "LLM_DB_ANTIPATTERN";
+        let regex = &LLM_DB_ANTIPATTERN_REGEX;
+        detect_pattern_with_context(content, file_path, PATTERN_NAME, regex)
+    }
+}
+
+/// Detector for LLM-generated error handling mistakes
+pub struct LLMErrorHandlingMistakesDetector;
+
+impl PatternDetector for LLMErrorHandlingMistakesDetector {
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        detect_pattern_with_context(
+            content,
+            file_path,
+            "LLM_ERROR_HANDLING_MISTAKE",
+            &LLM_ERROR_HANDLING_MISTAKES_REGEX,
+        )
+    }
+}
+
+/// Flags performance smells matched by `LLM_PERFORMANCE_MISTAKES_REGEX`.
+///
+/// Every alternative requires a trailing `//` comment with a keyword:
+/// * three nested `for (...) { ... }` loops, comment containing `nested`;
+/// * `Array.from(...).map(...).filter(...).reduce(...)`, comment containing
+///   `chain`;
+/// * `.sort(...).reverse(...)`, comment containing `inefficient`.
+pub struct LLMPerformanceMistakesDetector;
+
+impl PatternDetector for LLMPerformanceMistakesDetector {
+    /// Delegates to `detect_pattern_with_context`, passing
+    /// `LLM_PERFORMANCE_MISTAKE` as the pattern name.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        const PATTERN_NAME: &str = "LLM_PERFORMANCE_MISTAKE";
+        let regex = &LLM_PERFORMANCE_MISTAKES_REGEX;
+        detect_pattern_with_context(content, file_path, PATTERN_NAME, regex)
+    }
+}
+
+/// Flags loose typing matched by `LLM_TYPE_MISTAKES_REGEX`.
+///
+/// Every alternative requires a trailing `//` comment with a keyword:
+/// * `let name: any = ...`, comment containing `type`;
+/// * `var name = ...`, comment containing `untyped`;
+/// * `const name = null`, comment containing `nullable`;
+/// * `function name(...): any { ... }`, comment containing `return`.
+pub struct LLMTypeMistakesDetector;
+
+impl PatternDetector for LLMTypeMistakesDetector {
+    /// Delegates to `detect_pattern_with_context`, passing
+    /// `LLM_TYPE_MISTAKE` as the pattern name.
+    fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
+        const PATTERN_NAME: &str = "LLM_TYPE_MISTAKE";
+        let regex = &LLM_TYPE_MISTAKES_REGEX;
+        detect_pattern_with_context(content, file_path, PATTERN_NAME, regex)
+    }
+}
+
 /// Comprehensive LLM vulnerability detector that combines multiple patterns
 pub struct ComprehensiveLLMDetector {
     detectors: Vec<Box<dyn PatternDetector>>,
@@ -436,6 +569,13 @@ impl ComprehensiveLLMDetector {
             Box::new(ContextConfusionDetector),
             Box::new(DatabaseAntipatternDetector),
             Box::new(LLMGeneratedCommentsDetector),
+            Box::new(AIModelHallucinationDetector),
+            Box::new(IncorrectAsyncDetector),
+            Box::new(LLMSecurityAntipatternDetector),
+            Box::new(LLMDBAntipatternDetector),
+            Box::new(LLMErrorHandlingMistakesDetector),
+            Box::new(LLMPerformanceMistakesDetector),
+            Box::new(LLMTypeMistakesDetector),
         ];
 
         Self { detectors }