-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnltk_utils.py
More file actions
120 lines (105 loc) · 5.15 KB
/
nltk_utils.py
File metadata and controls
120 lines (105 loc) · 5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import logging
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string # Added import
logger = logging.getLogger(__name__)
# --- Setup NLTK resources ---
# NOTE(review): this runs at import time, *before* setup_nltk() has a chance
# to download the corpus — on a fresh machine it will raise LookupError.
# Consider lazy initialization inside normalize_question_nltk; confirm
# against how this module is imported in deployment.
stop_words = set(stopwords.words('english'))
def setup_nltk():
    """Downloads required NLTK data if not present.

    Checks each resource with ``nltk.data.find`` and downloads it quietly
    on LookupError. Safe to call repeatedly; already-present resources are
    skipped. Requires network access the first time.
    """
    # (locator for nltk.data.find, package id for nltk.download, log label).
    # Kept as data so adding a resource is a one-line change instead of
    # another copy-pasted try/except block.
    required = (
        ('corpora/stopwords', 'stopwords', 'stopwords'),
        ('tokenizers/punkt', 'punkt', 'punkt tokenizer'),
        ('taggers/averaged_perceptron_tagger',
         'averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    )
    for locator, package, label in required:
        try:
            nltk.data.find(locator)
        except LookupError:
            logger.info(f"Downloading NLTK {label}...")
            nltk.download(package, quiet=True)
            logger.info(f"NLTK {label} downloaded.")
# --- Normalization and Subject Extraction ---
def normalize_question_simple(text: str) -> str:
    """Original simple normalization for storage keys.

    Lowercases the text and strips surrounding whitespace.
    """
    lowered = text.lower()
    return lowered.strip()
def normalize_question_nltk(text: str) -> set[str]:
    """Normalizes question text using NLTK for similarity comparison.

    Lowercases, strips punctuation, tokenizes, and drops stopwords and
    non-alphabetic tokens; returns the set of significant words. Returns
    an empty set for empty input or on any normalization error.
    """
    # Guard clause: nothing to normalize.
    if not text:
        return set()
    try:
        # Lowercase, then strip all punctuation in a single C-level pass.
        # `text` is deliberately rebound so the error log below reports the
        # cleaned form, matching the original behavior.
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        try:
            tokens = word_tokenize(text)
        except LookupError as e:
            # punkt data missing — degrade to a plain whitespace split.
            logger.error(f"NLTK LookupError during tokenization: {e}. Falling back to simple split.")
            tokens = text.split()
        # Keep only alphabetic, non-stopword tokens.
        return {word for word in tokens if word.isalpha() and word not in stop_words}
    except Exception as e:
        # Boundary catch-all: normalization must never crash the caller.
        logger.error(f"Unexpected error during NLTK normalization of '{text}': {e}")
        return set()
def extract_subject(text: str) -> str:
    """Extracts the likely subject (first noun phrase) from the question text using POS tagging.

    Walks the POS-tagged tokens and collects the first run of
    determiner/adjective/noun tags, stopping at the main verb or at
    sentence punctuation, then strips a few known question prefixes.

    Args:
        text: Raw question text.

    Returns:
        The extracted subject phrase; on failure (empty extraction or a
        missing NLTK resource) falls back to the stripped input with any
        trailing '?' removed.
    """
    try:
        # Tokenize the text (requires the punkt tokenizer).
        tokens = word_tokenize(text)
        # Perform Part-of-Speech tagging (requires averaged_perceptron_tagger).
        tagged_tokens = nltk.pos_tag(tokens)
        subject_words = []
        in_subject = False  # True once we have started collecting the noun phrase
        # Simple heuristic: find the first sequence of Determiner (DT), Adjective (JJ), Noun (NN/NNS/NNP/NNPS)
        for word, tag in tagged_tokens:
            # Start (or continue) capturing if we see a determiner, adjective, or noun
            if tag.startswith('DT') or tag.startswith('JJ') or tag.startswith('NN'):
                subject_words.append(word)
                in_subject = True
            # Stop capturing if we are in a subject and hit a verb (VB*) or a
            # common auxiliary that POS tagging may have missed.
            elif in_subject and (tag.startswith('VB') or word in ['is', 'are', 'does', 'do', 'get', 'deserves']):
                break  # Stop after the main noun phrase, before the verb
            # NOTE(review): given the first condition failed, the `not (...)`
            # here is always true — kept as-is to avoid behavior drift.
            elif in_subject and not (tag.startswith('DT') or tag.startswith('JJ') or tag.startswith('NN')):
                # Sentence punctuation clearly ends the noun phrase: stop.
                if word in ['?', '.'] or tag in [':', ',']:
                    break
                # Otherwise, might be part of a complex noun phrase, continue for now
                # (This part is tricky and can be refined)
                pass
            # Not yet in a subject and not a starting tag: skip the token.
            elif not in_subject:
                continue
        # Clean up the extracted subject
        subject = " ".join(subject_words).strip()
        # Remove leading 'how many booms does/do/is/are' etc. if accidentally captured
        # NOTE(review): these prefixes look application-specific — presumably the
        # questions follow a "how many booms ..." template; confirm with callers.
        common_prefixes = ["how many booms does ", "how many booms do ", "how many booms is ", "how many booms are ", "how many booms "]
        for prefix in common_prefixes:
            if subject.lower().startswith(prefix):
                subject = subject[len(prefix):].strip()
                break
        # Fallback if extraction is empty or very short
        if not subject or len(subject.split()) == 0:
            logger.warning(f"Subject extraction failed for '{text}'. Falling back to full text.")
            return text.strip().rstrip('?')  # Fallback to original cleaned text
        logger.info(f"Extracted subject '{subject}' from '{text}'")
        return subject
    except LookupError as e:
        # Missing NLTK data (tokenizer or tagger): degrade gracefully.
        logger.error(f"NLTK LookupError during subject extraction (likely missing tagger): {e}. Falling back.")
        return text.strip().rstrip('?')  # Fallback
    except Exception as e:
        # Boundary catch-all: extraction must never crash the caller.
        logger.error(f"Error during subject extraction for '{text}': {e}")
        return text.strip().rstrip('?')  # Fallback