Skip to content

Commit 67a97e8

Browse files
authored
Merge pull request #14 from SkBlaz/priors
Prior keyphrases as seeds for subsequent runs
2 parents e3e58a3 + 00e358a commit 67a97e8

File tree

4 files changed

+35
-4
lines changed

4 files changed

+35
-4
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,6 @@ docker run -d -p 8000:8000 --rm -it rakun_test
115115
- **Keyword extraction from text**: Use `localhost:8000/get_keywords` for analyzing text.
116116
- **Keyword extraction from PDFs**: Use `localhost:8000/get_keywords_pdf` for analyzing PDF documents.
117117

118-
119-
120118
# Citing this work
121119

122120
https://link.springer.com/chapter/10.1007/978-3-031-18840-4_27#citeas

examples/example_priors.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
## Idea: augment output with domain-specific priors (symbolic)
2+
3+
from rakun2 import RakunKeyphraseDetector
4+
5+
EXAMPLE_DOCUMENT = "Kangaroos bounce gracefully across the vast, dusty plains of the Australian outback, their powerful hind legs propelling them effortlessly through the heat waves shimmering in the distance. A curious joey peeks shyly from its mother's pouch, observing a world filled with towering eucalyptus trees and rust-colored earth. At twilight, mobs of kangaroos gather by cool watering holes, pausing occasionally to listen for the rustle of predators hidden in the brush. They nibble cautiously on grass and shrubs, ears swiveling alertly at every unexpected sound. As dawn arrives, a young kangaroo boldly ventures off, testing his strength in playful leaps that send dust spiraling into the morning breeze."
6+
7+
priors=[('kangaroo', 0.11618894338607788), ('marsupials', 0.10609937831759453), ('species', 0.04895966034382582), ('australian', 0.03770353738218546), ('pouch', 0.025573878083378077), ('placental mammals', 0.025146500440314412), ('development', 0.019251500139944255), ('metatherians', 0.01630012784153223), ('males', 0.0161470053717494), ('marsupialia', 0.015857341699302197)]
8+
9+
10+
hyperparameters = {"num_keywords": 10,
11+
"merge_threshold": 1.0,
12+
"alpha": 0.9,
13+
"token_prune_len": 1}
14+
15+
keyword_detector = RakunKeyphraseDetector(hyperparameters)
16+
out_keywords = keyword_detector.find_keywords(EXAMPLE_DOCUMENT, input_type="string")
17+
print(out_keywords)
18+
19+
keyword_detector = RakunKeyphraseDetector(hyperparameters)
20+
out_keywords = keyword_detector.find_keywords(EXAMPLE_DOCUMENT, input_type="string", prior_rankings=priors)
21+
print(out_keywords)

rakun2/class_rakun.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,8 @@ def match_sweep(self) -> None:
467467
def find_keywords(self,
468468
document: str,
469469
input_type: str = "file",
470-
encoding: str = "utf-8") -> List[Tuple[str, float]]:
470+
encoding: str = "utf-8",
471+
prior_rankings: list = None) -> List[Tuple[str, float]]:
471472
"""
472473
Extract and rank keywords from the input document.
473474
@@ -494,4 +495,15 @@ def find_keywords(self,
494495

495496
if self.verbose:
496497
logger.info("Keyword extraction complete.")
498+
499+
if prior_rankings is not None:
500+
prior_dict = dict(prior_rankings)
501+
new_keywords = [
502+
(keyphrase, score + prior_dict[prior_keyphrase])
503+
for keyphrase, score in self.final_keywords
504+
for prior_keyphrase in prior_dict
505+
if prior_keyphrase in keyphrase
506+
]
507+
self.final_keywords = new_keywords
508+
497509
return self.final_keywords[:self.hyperparameters["num_keywords"]]

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def parse_requirements(file):
1515

1616

1717
setup(name='rakun2',
18-
version='0.29',
18+
version='0.30',
1919
description=
2020
"RaKUn 2.0; Better faster stronger lighter",
2121
url='http://github.com/skblaz/rakun2',

0 commit comments

Comments
 (0)