Skip to content

Commit 3732bf4

Browse files
committed
feat: enhance detector factory and LLM detection capabilities
1 parent d358460 commit 3732bf4

File tree

3 files changed

+241
-4
lines changed

3 files changed

+241
-4
lines changed

crates/core/src/detector_factory.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,23 @@ impl DetectorFactory {
216216
DetectorType::LLMPythonIssues => Ok(Some(Box::new(PythonLLMIssuesDetector))),
217217
DetectorType::LLMGeneratedComments => Ok(Some(Box::new(LLMGeneratedCommentsDetector))),
218218

219+
// Advanced LLM-specific patterns
220+
DetectorType::LLMAIModelHallucination => {
221+
Ok(Some(Box::new(AIModelHallucinationDetector)))
222+
}
223+
DetectorType::LLMIncorrectAsync => Ok(Some(Box::new(IncorrectAsyncDetector))),
224+
DetectorType::LLMSecurityAntipattern => {
225+
Ok(Some(Box::new(LLMSecurityAntipatternDetector)))
226+
}
227+
DetectorType::LLMDBAntipattern => Ok(Some(Box::new(LLMDBAntipatternDetector))),
228+
DetectorType::LLMErrorHandlingMistake => {
229+
Ok(Some(Box::new(LLMErrorHandlingMistakesDetector)))
230+
}
231+
DetectorType::LLMPerformanceMistake => {
232+
Ok(Some(Box::new(LLMPerformanceMistakesDetector)))
233+
}
234+
DetectorType::LLMTypeMistake => Ok(Some(Box::new(LLMTypeMistakesDetector))),
235+
219236
// Comprehensive LLM detector
220237
DetectorType::LLMComprehensive => Ok(Some(Box::new(ComprehensiveLLMDetector::new()))),
221238

crates/core/src/detectors.rs

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use crate::{Match, PatternDetector};
2+
use aho_corasick::AhoCorasick;
23
use anyhow::Result;
34
use lazy_static::lazy_static;
45
use regex::Regex;
@@ -47,7 +48,7 @@ fn detect_pattern_with_context(
4748
pattern_name: &str,
4849
re: &Regex,
4950
) -> Vec<Match> {
50-
let mut matches = Vec::new();
51+
let mut matches = smallvec::SmallVec::<[Match; 4]>::new();
5152
for (line_idx, line) in content.lines().enumerate() {
5253
for mat in re.find_iter(line) {
5354
// Extract more context around the match
@@ -64,7 +65,7 @@ fn detect_pattern_with_context(
6465
});
6566
}
6667
}
67-
matches
68+
matches.into_vec()
6869
}
6970

7071
/// Default detector for TODO comments (case-insensitive)
@@ -427,6 +428,85 @@ impl PatternDetector for CustomPatternDetector {
427428
}
428429
}
429430

431+
/// High-performance detector using Aho-Corasick algorithm for multiple pattern matching
432+
pub struct HighPerformanceDetector {
433+
pattern_names: Vec<String>,
434+
ac: AhoCorasick,
435+
}
436+
437+
impl HighPerformanceDetector {
438+
/// Creates a new high-performance detector with the given patterns
439+
pub fn new(patterns: Vec<(&str, &str)>) -> Result<Self> {
440+
let (pattern_names, pattern_strings): (Vec<String>, Vec<String>) = patterns
441+
.into_iter()
442+
.map(|(name, pattern)| (name.to_string(), pattern.to_string()))
443+
.unzip();
444+
445+
let ac = AhoCorasick::new(&pattern_strings)?;
446+
447+
Ok(Self { pattern_names, ac })
448+
}
449+
450+
/// Creates a detector for common TODO/FIXME patterns
451+
pub fn for_common_patterns() -> Self {
452+
let patterns = vec![
453+
("TODO", r"(?i)todo"),
454+
("FIXME", r"(?i)fixme"),
455+
("HACK", r"(?i)hack"),
456+
("BUG", r"(?i)bug"),
457+
("XXX", r"XXX"),
458+
("NOTE", r"(?i)note"),
459+
("WARNING", r"(?i)warning"),
460+
("PANIC", r"panic!"),
461+
("UNWRAP", r"\.unwrap\(\)"),
462+
("UNSAFE", r"unsafe\s+\{"),
463+
("DEBUG", r"(?i)debug"),
464+
("TEST", r"(?i)test"),
465+
("PHASE", r"(?i)phase\s*[0-9]+"),
466+
("CONSOLE_LOG", r"console\.(log|debug|info|warn|error)"),
467+
("PRINT", r"print|println|echo"),
468+
("ALERT", r"alert\(|confirm\(|prompt\("),
469+
("DEBUGGER", r"debugger|pdb\.set_trace"),
470+
];
471+
472+
Self::new(patterns).unwrap()
473+
}
474+
}
475+
476+
impl PatternDetector for HighPerformanceDetector {
477+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
478+
let mut matches = Vec::new();
479+
480+
for mat in self.ac.find_iter(content) {
481+
let pattern_id = mat.pattern();
482+
let pattern_name = &self.pattern_names[pattern_id.as_usize()];
483+
484+
// Extract context around the match
485+
let start = mat.start().saturating_sub(15);
486+
let end = (mat.end() + 25).min(content.len());
487+
let context = &content[start..end];
488+
489+
// Find the line number
490+
let line_start = content[..mat.start()]
491+
.rfind('\n')
492+
.map(|pos| pos + 1)
493+
.unwrap_or(0);
494+
let line_number = content[..line_start].lines().count() + 1;
495+
let column = mat.start() - line_start + 1;
496+
497+
matches.push(Match {
498+
file_path: file_path.to_string_lossy().to_string(),
499+
line_number,
500+
column,
501+
pattern: pattern_name.clone(),
502+
message: format!("{}: {}", pattern_name, context.trim()),
503+
});
504+
}
505+
506+
matches
507+
}
508+
}
509+
430510
#[cfg(test)]
431511
mod tests {
432512
use super::*;

crates/core/src/llm_detectors.rs

Lines changed: 142 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::path::Path;
66
lazy_static! {
77
// Hallucinated API patterns - APIs that LLMs commonly generate but don't exist
88
pub static ref HALLUCINATED_API_REGEX: Regex = Regex::new(
9-
r"(?i)\.(authenticate|validateInput|sanitize|encryptData|hashPassword|secureRandom)\s*\(\s*\)"
9+
r"(?i)\.(authenticate|validateInput|sanitize|encryptData|hashPassword|secureRandom|generateToken|verifySignature|encodeBase64|decodeBase64|compressData|decompressData|validateEmail|validatePhone|formatCurrency|parseJson|serializeJson)\s*\(\s*\)"
1010
).unwrap();
1111

1212
pub static ref INCOMPLETE_API_REGEX: Regex = Regex::new(
@@ -95,7 +95,42 @@ lazy_static! {
9595

9696
// LLM-specific comment patterns that indicate AI generation
9797
pub static ref LLM_GENERATED_COMMENTS_REGEX: Regex = Regex::new(
98-
r"(?i)//.*(?:ai generated|generated by|gpt|claude|chatgpt|copilot|based on|as an ai)"
98+
r"(?i)//.*(?:ai generated|generated by|gpt|claude|chatgpt|copilot|based on|as an ai|llm|machine learning|neural network|deep learning|transformer|attention mechanism)"
99+
).unwrap();
100+
101+
// AI model hallucinated patterns - common incorrect implementations
102+
pub static ref AI_MODEL_HALLUCINATION_REGEX: Regex = Regex::new(
103+
r"(?i)(?:tensorflow\.keras|torch\.nn\.Module|sklearn\.model_selection\.GridSearchCV|transformers\.pipeline)\s*\(\s*['\x22][^'\x22]*['\x22]\s*\)\s*\.\s*(fit|predict|train|evaluate)\s*\(\s*\)"
104+
).unwrap();
105+
106+
// Incorrect async patterns commonly generated by LLMs
107+
pub static ref INCORRECT_ASYNC_REGEX: Regex = Regex::new(
108+
r"(?:async\s+function\s+\w+\s*\([^)]*\)\s*\{\s*return\s+await\s+Promise\.resolve\([^;]*\);\s*\}|await\s+\w+\s*\([^)]*\)\s*;?\s*//.*blocking|Promise\.all\([^)]*\)\s*\.\s*then\s*\([^)]*\)\s*await)"
109+
).unwrap();
110+
111+
// Common LLM-generated security anti-patterns
112+
pub static ref LLM_SECURITY_ANTIPATTERN_REGEX: Regex = Regex::new(
113+
r"(?i)(?:eval\s*\([^)]*req\.|Function\s*\([^)]*req\.|setTimeout\s*\([^)]*req\.|setInterval\s*\([^)]*req\.|innerHTML\s*=.*req\.|outerHTML\s*=.*req\.|document\.write\s*\([^)]*req\.|window\.location\s*=.*req\.|localStorage\.setItem\s*\([^,)]*,\s*req\.|sessionStorage\.setItem\s*\([^,)]*,\s*req\.)"
114+
).unwrap();
115+
116+
// LLM-generated database anti-patterns
117+
pub static ref LLM_DB_ANTIPATTERN_REGEX: Regex = Regex::new(
118+
r"(?i)(?:SELECT\s+\*\s+FROM\s+\w+\s+WHERE\s+.*=.*\+|INSERT\s+INTO\s+\w+\s+VALUES\s*\([^)]*\+|UPDATE\s+\w+\s+SET\s+.*=.*\+|DELETE\s+FROM\s+\w+\s+WHERE\s+.*=.*\+)"
119+
).unwrap();
120+
121+
// Common LLM-generated error handling mistakes
122+
pub static ref LLM_ERROR_HANDLING_MISTAKES_REGEX: Regex = Regex::new(
123+
r"(?:try\s*\{\s*[^}]*\}\s*catch\s*\([^)]*\)\s*\{\s*\}\s*//.*ignore|catch\s*\([^)]*\)\s*\{\s*console\.log\s*\([^)]*\)\s*\}\s*//.*log|throw\s+new\s+Error\s*\([^)]*\)\s*;?\s*//.*generic|\.catch\s*\([^)]*\)\s*=>\s*\{\s*\}\s*//.*empty)"
124+
).unwrap();
125+
126+
// LLM-generated performance issues
127+
pub static ref LLM_PERFORMANCE_MISTAKES_REGEX: Regex = Regex::new(
128+
r"(?:for\s*\([^)]*\)\s*\{\s*[^}]*for\s*\([^)]*\)\s*\{\s*[^}]*for\s*\([^)]*\)\s*\{\s*[^}]*\}\s*\}\s*\}\s*//.*nested|Array\.from\s*\([^)]*\)\s*\.\s*map\s*\([^)]*\)\s*\.\s*filter\s*\([^)]*\)\s*\.\s*reduce\s*\([^)]*\)\s*//.*chain|\.sort\s*\([^)]*\)\s*\.\s*reverse\s*\([^)]*\)\s*//.*inefficient)"
129+
).unwrap();
130+
131+
// LLM-generated incorrect type handling
132+
pub static ref LLM_TYPE_MISTAKES_REGEX: Regex = Regex::new(
133+
r"(?:let\s+\w+\s*:\s*any\s*=\s*[^;]*;?\s*//.*type|var\s+\w+\s*=\s*[^;]*;?\s*//.*untyped|const\s+\w+\s*=\s*null\s*;?\s*//.*nullable|function\s+\w+\s*\([^)]*\)\s*:\s*any\s*\{[^}]*\}\s*//.*return)"
99134
).unwrap();
100135
}
101136

@@ -410,6 +445,104 @@ impl PatternDetector for LLMGeneratedCommentsDetector {
410445
}
411446
}
412447

448+
/// Detector for AI model hallucinated patterns
449+
pub struct AIModelHallucinationDetector;
450+
451+
impl PatternDetector for AIModelHallucinationDetector {
452+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
453+
detect_pattern_with_context(
454+
content,
455+
file_path,
456+
"LLM_AI_MODEL_HALLUCINATION",
457+
&AI_MODEL_HALLUCINATION_REGEX,
458+
)
459+
}
460+
}
461+
462+
/// Detector for incorrect async patterns
463+
pub struct IncorrectAsyncDetector;
464+
465+
impl PatternDetector for IncorrectAsyncDetector {
466+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
467+
detect_pattern_with_context(
468+
content,
469+
file_path,
470+
"LLM_INCORRECT_ASYNC",
471+
&INCORRECT_ASYNC_REGEX,
472+
)
473+
}
474+
}
475+
476+
/// Detector for LLM-generated security anti-patterns
477+
pub struct LLMSecurityAntipatternDetector;
478+
479+
impl PatternDetector for LLMSecurityAntipatternDetector {
480+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
481+
detect_pattern_with_context(
482+
content,
483+
file_path,
484+
"LLM_SECURITY_ANTIPATTERN",
485+
&LLM_SECURITY_ANTIPATTERN_REGEX,
486+
)
487+
}
488+
}
489+
490+
/// Detector for LLM-generated database anti-patterns
491+
pub struct LLMDBAntipatternDetector;
492+
493+
impl PatternDetector for LLMDBAntipatternDetector {
494+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
495+
detect_pattern_with_context(
496+
content,
497+
file_path,
498+
"LLM_DB_ANTIPATTERN",
499+
&LLM_DB_ANTIPATTERN_REGEX,
500+
)
501+
}
502+
}
503+
504+
/// Detector for LLM-generated error handling mistakes
505+
pub struct LLMErrorHandlingMistakesDetector;
506+
507+
impl PatternDetector for LLMErrorHandlingMistakesDetector {
508+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
509+
detect_pattern_with_context(
510+
content,
511+
file_path,
512+
"LLM_ERROR_HANDLING_MISTAKE",
513+
&LLM_ERROR_HANDLING_MISTAKES_REGEX,
514+
)
515+
}
516+
}
517+
518+
/// Detector for LLM-generated performance mistakes
519+
pub struct LLMPerformanceMistakesDetector;
520+
521+
impl PatternDetector for LLMPerformanceMistakesDetector {
522+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
523+
detect_pattern_with_context(
524+
content,
525+
file_path,
526+
"LLM_PERFORMANCE_MISTAKE",
527+
&LLM_PERFORMANCE_MISTAKES_REGEX,
528+
)
529+
}
530+
}
531+
532+
/// Detector for LLM-generated type handling mistakes
533+
pub struct LLMTypeMistakesDetector;
534+
535+
impl PatternDetector for LLMTypeMistakesDetector {
536+
fn detect(&self, content: &str, file_path: &Path) -> Vec<Match> {
537+
detect_pattern_with_context(
538+
content,
539+
file_path,
540+
"LLM_TYPE_MISTAKE",
541+
&LLM_TYPE_MISTAKES_REGEX,
542+
)
543+
}
544+
}
545+
413546
/// Comprehensive LLM vulnerability detector that combines multiple patterns
414547
pub struct ComprehensiveLLMDetector {
415548
detectors: Vec<Box<dyn PatternDetector>>,
@@ -436,6 +569,13 @@ impl ComprehensiveLLMDetector {
436569
Box::new(ContextConfusionDetector),
437570
Box::new(DatabaseAntipatternDetector),
438571
Box::new(LLMGeneratedCommentsDetector),
572+
Box::new(AIModelHallucinationDetector),
573+
Box::new(IncorrectAsyncDetector),
574+
Box::new(LLMSecurityAntipatternDetector),
575+
Box::new(LLMDBAntipatternDetector),
576+
Box::new(LLMErrorHandlingMistakesDetector),
577+
Box::new(LLMPerformanceMistakesDetector),
578+
Box::new(LLMTypeMistakesDetector),
439579
];
440580

441581
Self { detectors }

0 commit comments

Comments
 (0)