-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnltk_utils.py
More file actions
120 lines (105 loc) · 5.15 KB
/
nltk_utils.py
File metadata and controls
120 lines (105 loc) · 5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import logging
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string # Added import
logger = logging.getLogger(__name__)
# --- Setup NLTK resources ---
# NOTE(review): this runs at import time, *before* setup_nltk() has a chance
# to download the corpus — on a fresh machine it will raise LookupError.
# Consider lazy initialization inside normalize_question_nltk; confirm
# against how this module is imported in deployment.
stop_words = set(stopwords.words('english'))
def setup_nltk():
    """Downloads required NLTK data if not present.

    Checks each resource with ``nltk.data.find`` and downloads it quietly
    on LookupError. Safe to call repeatedly; already-present resources are
    skipped. Requires network access the first time.
    """
    # (locator for nltk.data.find, package id for nltk.download, log label).
    # Kept as data so adding a resource is a one-line change instead of
    # another copy-pasted try/except block.
    required = (
        ('corpora/stopwords', 'stopwords', 'stopwords'),
        ('tokenizers/punkt', 'punkt', 'punkt tokenizer'),
        ('taggers/averaged_perceptron_tagger',
         'averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    )
    for locator, package, label in required:
        try:
            nltk.data.find(locator)
        except LookupError:
            logger.info(f"Downloading NLTK {label}...")
            nltk.download(package, quiet=True)
            logger.info(f"NLTK {label} downloaded.")
# --- Normalization and Subject Extraction ---
def normalize_question_simple(text: str) -> str:
    """Original simple normalization for storage keys.

    Lowercases the text and strips surrounding whitespace.
    """
    lowered = text.lower()
    return lowered.strip()
def normalize_question_nltk(text: str) -> set[str]:
    """Normalizes question text using NLTK for similarity comparison.

    Lowercases, strips punctuation, tokenizes, and drops stopwords and
    non-alphabetic tokens; returns the set of significant words. Returns
    an empty set for empty input or on any normalization error.
    """
    # Guard clause: nothing to normalize.
    if not text:
        return set()
    try:
        # Lowercase, then strip all punctuation in a single C-level pass.
        # `text` is deliberately rebound so the error log below reports the
        # cleaned form, matching the original behavior.
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        try:
            tokens = word_tokenize(text)
        except LookupError as e:
            # punkt data missing — degrade to a plain whitespace split.
            logger.error(f"NLTK LookupError during tokenization: {e}. Falling back to simple split.")
            tokens = text.split()
        # Keep only alphabetic, non-stopword tokens.
        return {word for word in tokens if word.isalpha() and word not in stop_words}
    except Exception as e:
        # Boundary catch-all: normalization must never crash the caller.
        logger.error(f"Unexpected error during NLTK normalization of '{text}': {e}")
        return set()
def extract_subject(text: str) -> str:
    """Extracts the likely subject (first noun phrase) from the question text using POS tagging.

    Walks the POS-tagged tokens and collects the first run of
    determiner/adjective/noun tags, stopping at the main verb or at
    sentence punctuation, then strips a few known question prefixes.

    Args:
        text: Raw question text.

    Returns:
        The extracted subject phrase; on failure (empty extraction or a
        missing NLTK resource) falls back to the stripped input with any
        trailing '?' removed.
    """
    try:
        # Tokenize the text (requires the punkt tokenizer).
        tokens = word_tokenize(text)
        # Perform Part-of-Speech tagging (requires averaged_perceptron_tagger).
        tagged_tokens = nltk.pos_tag(tokens)
        subject_words = []
        in_subject = False  # True once we have started collecting the noun phrase
        # Simple heuristic: find the first sequence of Determiner (DT), Adjective (JJ), Noun (NN/NNS/NNP/NNPS)
        for word, tag in tagged_tokens:
            # Start (or continue) capturing if we see a determiner, adjective, or noun
            if tag.startswith('DT') or tag.startswith('JJ') or tag.startswith('NN'):
                subject_words.append(word)
                in_subject = True
            # Stop capturing if we are in a subject and hit a verb (VB*) or a
            # common auxiliary that POS tagging may have missed.
            elif in_subject and (tag.startswith('VB') or word in ['is', 'are', 'does', 'do', 'get', 'deserves']):
                break  # Stop after the main noun phrase, before the verb
            # NOTE(review): given the first condition failed, the `not (...)`
            # here is always true — kept as-is to avoid behavior drift.
            elif in_subject and not (tag.startswith('DT') or tag.startswith('JJ') or tag.startswith('NN')):
                # Sentence punctuation clearly ends the noun phrase: stop.
                if word in ['?', '.'] or tag in [':', ',']:
                    break
                # Otherwise, might be part of a complex noun phrase, continue for now
                # (This part is tricky and can be refined)
                pass
            # Not yet in a subject and not a starting tag: skip the token.
            elif not in_subject:
                continue
        # Clean up the extracted subject
        subject = " ".join(subject_words).strip()
        # Remove leading 'how many booms does/do/is/are' etc. if accidentally captured
        # NOTE(review): these prefixes look application-specific — presumably the
        # questions follow a "how many booms ..." template; confirm with callers.
        common_prefixes = ["how many booms does ", "how many booms do ", "how many booms is ", "how many booms are ", "how many booms "]
        for prefix in common_prefixes:
            if subject.lower().startswith(prefix):
                subject = subject[len(prefix):].strip()
                break
        # Fallback if extraction is empty or very short
        if not subject or len(subject.split()) == 0:
            logger.warning(f"Subject extraction failed for '{text}'. Falling back to full text.")
            return text.strip().rstrip('?')  # Fallback to original cleaned text
        logger.info(f"Extracted subject '{subject}' from '{text}'")
        return subject
    except LookupError as e:
        # Missing NLTK data (tokenizer or tagger): degrade gracefully.
        logger.error(f"NLTK LookupError during subject extraction (likely missing tagger): {e}. Falling back.")
        return text.strip().rstrip('?')  # Fallback
    except Exception as e:
        # Boundary catch-all: extraction must never crash the caller.
        logger.error(f"Error during subject extraction for '{text}': {e}")
        return text.strip().rstrip('?')  # Fallback