From 467fa0fe9aad68ee59f99aeafbfec108ef8ea9af Mon Sep 17 00:00:00 2001
From: PandaDEV <70103896+0PandaDEV@users.noreply.github.com>
Date: Sun, 15 Dec 2024 19:50:09 +1000
Subject: [PATCH 1/2] feat: detect directly from text

---
 src/lib.rs | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
diff --git a/src/lib.rs b/src/lib.rs
index 2baace6..da90fb4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -289,6 +289,33 @@ fn filter_candidates(
     }
 }
 
+/// Detects the programming language from the given text content
+///
+/// If the language cannot be determined, None will be returned.
+/// 
+/// # Examples
+/// ```
+/// use hyperpolyglot::{detect_from_text, Detection};
+/// 
+/// let content = r#"
+///     fn main() {
+///         println!("Hello World!");
+///     }
+/// "#;
+/// let language = detect_from_text(content).unwrap();
+/// assert_eq!(Detection::Classifier("Rust"), language);
+/// ```
+pub fn detect_from_text(content: &str) -> Option<Detection> {
+    // Since we don't have filename/extension info, we'll use all supported languages as candidates
+    let candidates: Vec<&'static str> = LANGUAGE_INFO.keys().copied().collect();
+    
+    // Truncate content if needed
+    let content = truncate_to_char_boundary(content, MAX_CONTENT_SIZE_BYTES);
+    
+    // Use classifier to determine the language
+    Some(Detection::Classifier(detectors::classify(content, &candidates)))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From f4f463d7430d870568584ffd55c901f4576a6bae Mon Sep 17 00:00:00 2001
From: PandaDEV <70103896+0PandaDEV@users.noreply.github.com>
Date: Mon, 16 Dec 2024 23:21:09 +1000
Subject: [PATCH 2/2] feat: add simple if input is actually code

---
 src/lib.rs | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index da90fb4..e4cd581 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@ use std::{
     path::{Path, PathBuf},
     sync::mpsc,
 };
+use regex::Regex;
 
 pub mod detectors;
 pub mod filters;
@@ -306,6 +307,11 @@ fn filter_candidates(
 /// assert_eq!(Detection::Classifier("Rust"), language);
 /// ```
 pub fn detect_from_text(content: &str) -> Option<Detection> {
+    // Early return if the content looks like plain text
+    if looks_like_plain_text(content) {
+        return None;
+    }
+
     // Since we don't have filename/extension info, we'll use all supported languages as candidates
     let candidates: Vec<&'static str> = LANGUAGE_INFO.keys().copied().collect();
     
@@ -316,6 +322,70 @@ pub fn detect_from_text(content: &str) -> Option<Detection> {
     Some(Detection::Classifier(detectors::classify(content, &candidates)))
 }
 
+fn looks_like_plain_text(content: &str) -> bool {
+    // Common build output and log patterns
+    let build_log_patterns = [
+        Regex::new(r"(built|Building|building) in \d+").unwrap(),
+        Regex::new(r"^\[?\d{1,2}:\d{2}:\d{2}(.\d{3})?\]?").unwrap(), // Time stamps
+        Regex::new(r"^(✔|✓|->|\[INFO\]|\[ERROR\]|\[WARN\])").unwrap(), // Common log indicators
+        Regex::new(r"(starting|finished|completed|done|ready|listening)").unwrap(),
+    ];
+
+    // Existing sentence pattern
+    let sentence_pattern = Regex::new(r"^[A-Z].*[.!?]$").unwrap();
+    
+    // Code patterns
+    let code_patterns = [
+        Regex::new(r"[{}\[\]();]").unwrap(),
+        Regex::new(r"^(function|def|class|if|for|while|import|package)\b").unwrap(),
+        Regex::new(r"^\s*(public|private|protected)\b").unwrap(),
+    ];
+
+    let lines: Vec<&str> = content
+        .lines()
+        .map(|line| line.trim())
+        .filter(|line| !line.is_empty())
+        .collect();
+
+    if lines.is_empty() {
+        return true;
+    }
+
+    // Check for build/log output
+    let log_lines = lines
+        .iter()
+        .filter(|line| {
+            build_log_patterns.iter().any(|pattern| pattern.is_match(line))
+        })
+        .count();
+
+    let log_ratio = log_lines as f32 / lines.len() as f32;
+    
+    // If it looks like build output or logs, return true (it's not code)
+    if log_ratio > 0.2 {
+        return true;
+    }
+
+    let code_lines = lines
+        .iter()
+        .filter(|line| {
+            code_patterns.iter().any(|pattern| pattern.is_match(line))
+        })
+        .count();
+
+    let code_ratio = code_lines as f32 / lines.len() as f32;
+    let sentence_lines = lines
+        .iter()
+        .filter(|line| sentence_pattern.is_match(line))
+        .count();
+    let sentence_ratio = sentence_lines as f32 / lines.len() as f32;
+
+    // Consider it plain text if:
+    // 1. It has more sentence-like lines than code-like lines AND sentence ratio is significant
+    // 2. OR if it has very few code-like patterns
+    sentence_ratio > code_ratio && sentence_ratio > 0.3 || code_ratio < 0.1
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;