From 467fa0fe9aad68ee59f99aeafbfec108ef8ea9af Mon Sep 17 00:00:00 2001
From: PandaDEV <70103896+0PandaDEV@users.noreply.github.com>
Date: Sun, 15 Dec 2024 19:50:09 +1000
Subject: [PATCH 1/2] feat: detect directly from text

---
 src/lib.rs | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index 2baace6..da90fb4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -289,6 +289,33 @@ fn filter_candidates(
     }
 }
 
+/// Detects the programming language from the given text content
+///
+/// If the language cannot be determined, None will be returned.
+///
+/// # Examples
+/// ```
+/// use hyperpolyglot::{detect_from_text, Detection};
+///
+/// let content = r#"
+/// fn main() {
+///     println!("Hello World!");
+/// }
+/// "#;
+/// let language = detect_from_text(content).unwrap();
+/// assert_eq!(Detection::Classifier("Rust"), language);
+/// ```
+pub fn detect_from_text(content: &str) -> Option<Detection> {
+    // Since we don't have filename/extension info, we'll use all supported languages as candidates
+    let candidates: Vec<&'static str> = LANGUAGE_INFO.keys().copied().collect();
+
+    // Truncate content if needed
+    let content = truncate_to_char_boundary(content, MAX_CONTENT_SIZE_BYTES);
+
+    // Use classifier to determine the language
+    Some(Detection::Classifier(detectors::classify(content, &candidates)))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From f4f463d7430d870568584ffd55c901f4576a6bae Mon Sep 17 00:00:00 2001
From: PandaDEV <70103896+0PandaDEV@users.noreply.github.com>
Date: Mon, 16 Dec 2024 23:21:09 +1000
Subject: [PATCH 2/2] feat: add simple check if input is actually code

---
 src/lib.rs | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index da90fb4..e4cd581 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@ use std::{
     path::{Path, PathBuf},
     sync::mpsc,
 };
+use regex::Regex;
 
 pub mod detectors;
 pub mod filters;
@@ -306,6 +307,11 @@ fn filter_candidates(
 /// assert_eq!(Detection::Classifier("Rust"), language);
 /// ```
 pub fn detect_from_text(content: &str) -> Option<Detection> {
+    // Early return if the content looks like plain text
+    if looks_like_plain_text(content) {
+        return None;
+    }
+
     // Since we don't have filename/extension info, we'll use all supported languages as candidates
     let candidates: Vec<&'static str> = LANGUAGE_INFO.keys().copied().collect();
 
@@ -316,6 +322,85 @@ pub fn detect_from_text(content: &str) -> Option<Detection> {
     Some(Detection::Classifier(detectors::classify(content, &candidates)))
 }
 
+/// Regexes used by `looks_like_plain_text`, grouped by what kind of line they recognise.
+struct PlainTextPatterns {
+    build_log: [Regex; 4],
+    sentence: Regex,
+    code: [Regex; 3],
+}
+
+/// Returns the shared pattern set, compiling the regexes only on the first call.
+fn plain_text_patterns() -> &'static PlainTextPatterns {
+    static PATTERNS: std::sync::OnceLock<PlainTextPatterns> = std::sync::OnceLock::new();
+    PATTERNS.get_or_init(|| {
+        // Every pattern is a literal, so failing to compile is a programming error.
+        let re = |pattern: &str| Regex::new(pattern).expect("hard-coded regex must compile");
+        PlainTextPatterns {
+            // Common build output and log patterns
+            build_log: [
+                re(r"(built|Building|building) in \d+"),
+                re(r"^\[?\d{1,2}:\d{2}:\d{2}(\.\d{3})?\]?"), // Time stamps
+                re(r"^(✔|✓|->|\[INFO\]|\[ERROR\]|\[WARN\])"), // Common log indicators
+                // NOTE(review): unanchored and case-sensitive, so this also matches inside
+                // identifiers such as `is_done`; confirm that is intended.
+                re(r"(starting|finished|completed|done|ready|listening)"),
+            ],
+            // A line that starts with a capital letter and ends with sentence punctuation
+            sentence: re(r"^[A-Z].*[.!?]$"),
+            // Brackets, semicolons and leading keywords typical of source code
+            code: [
+                re(r"[{}\[\]();]"),
+                re(r"^(function|def|class|if|for|while|import|package)\b"),
+                re(r"^\s*(public|private|protected)\b"),
+            ],
+        }
+    })
+}
+
+/// Returns the fraction of `lines` matched by at least one regex in `patterns`.
+fn matching_ratio(lines: &[&str], patterns: &[Regex]) -> f32 {
+    let hits = lines
+        .iter()
+        .filter(|line| patterns.iter().any(|pattern| pattern.is_match(line)))
+        .count();
+    hits as f32 / lines.len() as f32
+}
+
+/// Heuristically decides whether `content` is prose or log output rather than source code.
+///
+/// Blank lines are ignored and the remaining lines are trimmed before matching.
+/// Returns `true` when any of the following holds:
+/// - there are no non-blank lines,
+/// - more than 20% of the lines look like build or log output,
+/// - sentence-like lines outnumber code-like lines and exceed 30% of all lines,
+/// - fewer than 10% of the lines look like code.
+fn looks_like_plain_text(content: &str) -> bool {
+    let patterns = plain_text_patterns();
+
+    let lines: Vec<&str> = content
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty())
+        .collect();
+
+    if lines.is_empty() {
+        return true;
+    }
+
+    // If it looks like build output or logs, it is not code
+    if matching_ratio(&lines, &patterns.build_log) > 0.2 {
+        return true;
+    }
+
+    let code_ratio = matching_ratio(&lines, &patterns.code);
+    let sentence_ratio = matching_ratio(&lines, std::slice::from_ref(&patterns.sentence));
+
+    // Consider it plain text if:
+    // 1. It has more sentence-like lines than code-like lines AND sentence ratio is significant
+    // 2. OR if it has very few code-like patterns
+    (sentence_ratio > code_ratio && sentence_ratio > 0.3) || code_ratio < 0.1
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;