
Commit 9e96f1e

Author: Halvani (committed)
Added additional extractors and updated README.md
1 parent 50ce3d2 commit 9e96f1e

File tree: 2 files changed (+202 / -11 lines changed)

README.md

Lines changed: 63 additions & 10 deletions
@@ -32,15 +32,25 @@ tu = TextUnit()
 tu = TextUnit(model_id=TextUnit.SpacyModelSize.English_Large)
 ```
 
-### Extract function words
+### Extract character *n*-grams
 ```python
-text = "The kickoff meeting will take place on Tuesday."
+text = "Man Bites Dog"
 
-print(tu.function_words(text))
+print(tu.char_ngrams(text, n=5))
 
-# ['The', 'will', 'on']
+# ['Man B', 'an Bi', 'n Bit', ' Bite', 'Bites', 'ites ', 'tes D', 'es Do', 's Dog']
+```
+
+### Extract token *n*-grams
+```python
+text = "Man Bites Dog"
+
+print(tu.token_ngrams(text, n=2))
+
+# ['Man Bites', 'Bites Dog']
 ```
 
+
 ### Extract contractions
 ```python
 text_contractions = """I’m pretty sure we’ll finish this today, but if we don’t, that’s alright — we’ve still got tomorrow. You shouldn’t worry too much; it isn’t as hard as it looks, and they’ve already done most of the work anyway."""
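
The character and token *n*-gram values introduced in the hunk above follow a plain sliding-window scheme. For readers who want to sanity-check the listed outputs, here is a minimal standalone sketch (illustrative only, not TextUnitLib's actual implementation; whitespace splitting stands in for the library's tokenizer):

```python
# Illustrative sketch only -- not the TextUnitLib implementation.
def char_ngrams(text: str, n: int) -> list[str]:
    """Return all contiguous character n-grams of length n (sliding window)."""
    return [text[i:i + n] for i in range(len(text) - n + 1)]


def token_ngrams(text: str, n: int) -> list[str]:
    """Return all contiguous token n-grams, whitespace-split and space-joined."""
    tokens = text.split()
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


print(char_ngrams("Man Bites Dog", n=5))
# ['Man B', 'an Bi', 'n Bit', ' Bite', 'Bites', 'ites ', 'tes D', 'es Do', 's Dog']

print(token_ngrams("Man Bites Dog", n=2))
# ['Man Bites', 'Bites Dog']
```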
@@ -113,44 +123,87 @@ tu.named_entities(text_named_entities, restrict_to_categories=["ORG"])
 text_dates = """The first prototype was released on 2021-07-15, and version 1.0 followed on July 20, 2022. A major update arrived on 15/08/2023, just before the annual review on 08.09.2023. Our next release is scheduled for March 1st, 2024, with a beta planned for 01 March 2024. Please submit your reports by 12/31/2024 or, at the latest, by 2025/01/10. The kickoff meeting took place on Tuesday, 3 January 2023, and follow-ups are held every Monday."""
 
 # Preserve original format of the extracted dates (not bullet-proof)
-
 print(tu.dates(text_dates, preserve_input_format=True))
+
 # ['on 2021-07-15', 'on 1.0', 'wed on July 20, 2022', 'on 15/08/2023', 'on 08.09.2023', 'March 1st, 2024', '01 March 2024', 'by 12/31/2024', 'by 2025/01/10', 'on Tuesday, 3 January 2023', 'Monday']
 
 # Unify extracted dates to the format "dd.mm.yyyy" (default)
-
 print(tu.dates(text_dates))
+
 # ['15.07.2021', '01.11.2025', '20.07.2022', '15.08.2023', '09.08.2023', '01.03.2024', '01.03.2024', '31.12.2024', '10.01.2025', '03.01.2023', '17.11.2025']
 ```
 
+
 ### Extract stop words (superset of function words)
 ```python
 text = "The kickoff meeting will take place on Tuesday."
 
 print(tu.stop_words(text))
+
 # ['The', 'will', 'take', 'on']
 ```
 
-### Extract Part of Speech tags (POS tags)
+### Extract function words
 ```python
 text = "The kickoff meeting will take place on Tuesday."
 
-# Extract all POS tags
+print(tu.function_words(text))
 
+# ['The', 'will', 'on']
+```
+
+### Extract part of speech tags (POS tags)
+```python
+text = "The kickoff meeting will take place on Tuesday."
+
+# Extract all POS tags
 print(tu.postags(text))
+
 # ['DET', 'NOUN', 'NOUN', 'AUX', 'VERB', 'NOUN', 'ADP', 'PROPN', 'PUNCT']
 
 # Extract all POS tags and combine them with corresponding tokens
-
 print(tu.postags(text, combine_with_token=True, combine_sep=" "))
+
 # [('The', 'DET'), ('kickoff', 'NOUN'), ('meeting', 'NOUN'), ('will', 'AUX'), ('take', 'VERB'), ('place', 'NOUN'), ('on', 'ADP'), ('Tuesday', 'PROPN'), ('.', 'PUNCT')]
 
 # Extract only nouns and return their tokens instead of the POS tags
-
 print(tu.postags(text, tokens_only=True, tags_to_consider={"NOUN"}))
+
 # ['kickoff', 'meeting', 'place']
 ```
 
+### Extract lemmas
+```python
+text_lemmas = "The researchers were analyzing how various models predicted meanings that had already been inferred by earlier systems."
+
+print(tu.lemmas(text_lemmas))
+
+# ['the', 'researcher', 'be', 'analyze', 'how', 'various', 'model', 'predict', 'meaning', 'that', 'have', 'already', 'be', 'infer', 'by', 'early', 'system', '.']
+```
+
+### Extract quotes
+```python
+text_quotes = """Lena looked at her phone and muttered, “I cannot find the message anymore.” Her friend pointed at the screen and said, “Check the folder below.” Lena scrolled again and sighed, “Still nothing.” Her friend shrugged and replied, «Maybe it was deleted.»"""
+
+print(*tu.quotes(text_quotes, strip_marks=False), sep="\n")
+
+# “I cannot find the message anymore.”
+# “Check the folder below.”
+# “Still nothing.”
+# «Maybe it was deleted.»
+```
+
+### Extract URLs
+```python
+text_urls = """During the meeting Anna mentioned that the project documentation was now available at https://docs.example.org/start which helped everyone understand the initial setup. Mark added that the latest prototype could be viewed on the internal server at http://intranet.local/prototype. To gather more background information, Julia recommended checking https://www.research-info.net/articles/ai-overview Later Tom pointed out a helpful code repository at https://github.com/Halvani/TextUnitLib which included several utilities they could reuse. Before the session ended, Anna also shared a registration link for next week’s workshop: http://events.example.com/register?id=42"""
+
+print(*tu.urls(text_urls), sep="\n")
+
+# https://docs.example.org/start
+# https://www.research-info.net/articles/ai-overview
+# https://github.com/Halvani/TextUnitLib
+# http://events.example.com/register?id=42
+```
 
 <a name="Applications"></a>
 ## Applications
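
The POS-tag examples above expose spaCy's coarse-grained tags through `TextUnit.postags()`. For comparison, the same information can be read from spaCy directly; in the sketch below the model name `en_core_web_sm` is an assumption (the README configures the model via `TextUnit.SpacyModelSize`), and the commented outputs mirror the README values shown in the diff, which may vary slightly between models:

```python
# Sketch of the underlying spaCy calls behind the postags() examples above.
# The model name is an assumption; TextUnitLib selects it via SpacyModelSize.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The kickoff meeting will take place on Tuesday.")

print([token.pos_ for token in doc])
# e.g. ['DET', 'NOUN', 'NOUN', 'AUX', 'VERB', 'NOUN', 'ADP', 'PROPN', 'PUNCT']

print([(token.text, token.pos_) for token in doc])
# e.g. [('The', 'DET'), ('kickoff', 'NOUN'), ('meeting', 'NOUN'), ...]

print([token.text for token in doc if token.pos_ == "NOUN"])
# e.g. ['kickoff', 'meeting', 'place']
```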

textunitlib/core.py

Lines changed: 139 additions & 1 deletion
@@ -1694,4 +1694,142 @@ def function_words(
         # Case insensitive lookup
         considered_function_words_lower = {fw.lower() for fw in considered_function_words}
 
-        return [t for t in alpha_tokens if t.lower() in considered_function_words_lower]
+        return [t for t in alpha_tokens if t.lower() in considered_function_words_lower]
+
+
+    def lemmas(self, text: str, lowercase: bool = False) -> List[str]:
+        """
+        Extract lemmatized tokens from the input text using the spaCy pipeline.
+
+        Args:
+            text (str): The input text to analyze.
+            lowercase (bool, optional): If True, return lemmas in lowercase form.
+                Defaults to False.
+
+        Returns:
+            List[str]: A list of lemmas corresponding to the spaCy tokenization.
+
+        Raises:
+            RuntimeError: If the spaCy pipeline is not initialized.
+        """
+        if self.__nlp is None:
+            raise RuntimeError("Internal spaCy pipeline is not initialized.")
+
+        spacy_tokens = self.tokens(text, strategy=self.Tokenization.SpacyTokens)
+        lemmas = [t.lemma_ for t in spacy_tokens]
+
+        return [lemma.lower() for lemma in lemmas] if lowercase else lemmas
+
+
+    def quotes(self,
+               text: str,
+               strip_marks: bool = True,
+               allow_multiline: bool = True,
+               min_length: int = 1,
+               extract_sentence_wise: bool = False) -> List[str]:
+        """
+        Extract quoted passages from text.
+
+        Supports various quotation styles including:
+        - Straight quotes: "..." and '...'
+        - Curly quotes: “...”, „...“, etc.
+        - Guillemets: «...», »...«, ‹...›, …
+
+        If extract_sentence_wise=True, quotations will be extracted separately for
+        each sentence detected by the spaCy pipeline. This helps avoid cross-sentence
+        over-matching and improves robustness for long or complex texts.
+
+        Args:
+            text (str): The input text.
+
+            strip_marks (bool, optional):
+                If True, return only the inner quoted text without quotation marks.
+                If False, return the entire quoted span including the marks.
+                Defaults to True.
+
+            allow_multiline (bool, optional):
+                If True, quotes may span multiple lines.
+                Defaults to True.
+
+            min_length (int, optional):
+                Minimum length of the inner quote (after stripping whitespace).
+                Defaults to 1.
+
+            extract_sentence_wise (bool, optional):
+                If True, run the quote extraction on each sentence separately.
+                If False, process the text as a whole.
+                Defaults to False.
+
+        Returns:
+            List[str]: Extracted quoted strings in order of appearance.
+        """
+
+        if not text:
+            return []
+
+        # Define pairs of opening and closing quotation marks
+        quote_pairs = [
+            ('"', '"'),
+            ("'", "'"),
+            ('“', '”'),
+            ('„', '“'),
+            ('«', '»'),
+            ('‹', '›'),
+            ('‚', '‘'),
+            ('‚', '’'),
+            ('»', '«'),
+        ]
+
+        flags = re.DOTALL if allow_multiline else 0
+        results: List[str] = []
+
+        def extract_from_segment(segment: str):
+            """Extract quotes from a single text segment (sentence or full text)."""
+            local: List[str] = []
+
+            for open_q, close_q in quote_pairs:
+
+                # Special handling for ASCII quotes to avoid apostrophe noise
+                if open_q in {"'", '"'} and open_q == close_q:
+                    pattern = (
+                        r'(?<!\w)' + re.escape(open_q) +
+                        r'(.*?)' +
+                        re.escape(close_q) + r'(?!\w)'
+                    )
+                else:
+                    pattern = re.escape(open_q) + r'(.*?)' + re.escape(close_q)
+
+                matches = re.finditer(pattern, segment, flags)
+                for match in matches:
+                    inner = match.group(1)
+
+                    # Skip short or whitespace-only inner texts
+                    if len(inner.strip()) < min_length:
+                        continue
+
+                    # Heuristic to skip apostrophe-contractions like 's, 't, 'm
+                    if open_q == close_q == "'" and len(inner) <= 2 and not any(ch.isspace() for ch in inner):
+                        continue
+
+                    full = match.group(0)
+                    extracted = inner if strip_marks else full
+
+                    if extracted not in local:
+                        local.append(extracted)
+            return local
+
+        # If sentence-wise extraction is requested, split into sentences
+        if extract_sentence_wise:
+            if self.__nlp is None:
+                raise RuntimeError("spaCy pipeline not initialized for sentence segmentation.")
+
+            sentences = self.sentences(text)
+
+            for sent in sentences:
+                for q in extract_from_segment(sent):
+                    results.append(q)
+
+        else:
+            # Extract from entire document
+            results.extend(extract_from_segment(text))
+        return results
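
For completeness, the two methods added above are instance methods of the `TextUnit` class, so they are called like the existing extractors shown in the README. A brief usage sketch follows; the import path is an assumption based on the repository layout (`textunitlib/core.py`), and the sample sentence is illustrative:

```python
# Usage sketch for the newly added extractors; the import path is an assumption.
from textunitlib.core import TextUnit

tu = TextUnit()

# lemmas(): spaCy-based lemmatization, optionally lowercased
print(tu.lemmas("The researchers were analyzing the results.", lowercase=True))

# quotes(): keep the quotation marks instead of stripping them
sample = 'Lena sighed, “Still nothing.” Her friend replied, «Maybe it was deleted.»'
print(tu.quotes(sample, strip_marks=False))
# Expected, following the quote_pairs handling above:
# ['“Still nothing.”', '«Maybe it was deleted.»']

# quotes(): run the extraction per sentence to reduce cross-sentence over-matching
print(tu.quotes(sample, extract_sentence_wise=True))
```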
