Merge pull request #14 from SkBlaz/priors

SkBlaz · web-flow · commit 67a97e8d6210 · 2025-04-18T20:21:45.000+02:00
Prior keyphrases as seeds for subsequent runs
diff --git a/README.md b/README.md
@@ -115,8 +115,6 @@ docker run -d -p 8000:8000 --rm -it rakun_test
   - **Keyword extraction from text**: Use `localhost:8000/get_keywords` for analyzing text.
   - **Keyword extraction from PDFs**: Use `localhost:8000/get_keywords_pdf` for analyzing PDF documents.
 
-
-
 # Citing this work
 
 https://link.springer.com/chapter/10.1007/978-3-031-18840-4_27#citeas
diff --git a/examples/example_priors.py b/examples/example_priors.py
@@ -0,0 +1,21 @@
+## Idea: augment the output with domain-specific priors (symbolic)
+
+from rakun2 import RakunKeyphraseDetector
+
+EXAMPLE_DOCUMENT = "Kangaroos bounce gracefully across the vast, dusty plains of the Australian outback, their powerful hind legs propelling them effortlessly through the heat waves shimmering in the distance. A curious joey peeks shyly from its mother's pouch, observing a world filled with towering eucalyptus trees and rust-colored earth. At twilight, mobs of kangaroos gather by cool watering holes, pausing occasionally to listen for the rustle of predators hidden in the brush. They nibble cautiously on grass and shrubs, ears swiveling alertly at every unexpected sound. As dawn arrives, a young kangaroo boldly ventures off, testing his strength in playful leaps that send dust spiraling into the morning breeze."
+
+priors=[('kangaroo', 0.11618894338607788), ('marsupials', 0.10609937831759453), ('species', 0.04895966034382582), ('australian', 0.03770353738218546), ('pouch', 0.025573878083378077), ('placental mammals', 0.025146500440314412), ('development', 0.019251500139944255), ('metatherians', 0.01630012784153223), ('males', 0.0161470053717494), ('marsupialia', 0.015857341699302197)]
+
+
+hyperparameters = {"num_keywords": 10,
+                   "merge_threshold": 1.0,
+                   "alpha": 0.9,
+                   "token_prune_len": 1}
+
+keyword_detector = RakunKeyphraseDetector(hyperparameters)
+out_keywords = keyword_detector.find_keywords(EXAMPLE_DOCUMENT, input_type="string")
+print(out_keywords)
+
+keyword_detector = RakunKeyphraseDetector(hyperparameters)
+out_keywords = keyword_detector.find_keywords(EXAMPLE_DOCUMENT, input_type="string", prior_rankings=priors)
+print(out_keywords)
diff --git a/rakun2/class_rakun.py b/rakun2/class_rakun.py
@@ -467,7 +467,8 @@ def match_sweep(self) -> None:
     def find_keywords(self,
                       document: str,
                       input_type: str = "file",
-                      encoding: str = "utf-8") -> List[Tuple[str, float]]:
+                      encoding: str = "utf-8",
+                      prior_rankings: list = None) -> List[Tuple[str, float]]:
         """
         Extract and rank keywords from the input document.
 
@@ -494,4 +495,15 @@ def find_keywords(self,
 
         if self.verbose:
             logger.info("Keyword extraction complete.")
+
+        if prior_rankings is not None:
+            prior_dict = dict(prior_rankings)
+            new_keywords = [
+                (keyphrase, score + prior_dict[prior_keyphrase])
+                for keyphrase, score in self.final_keywords
+                for prior_keyphrase in prior_dict
+                if prior_keyphrase in keyphrase
+            ]
+            self.final_keywords = new_keywords
+
         return self.final_keywords[:self.hyperparameters["num_keywords"]]
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@ def parse_requirements(file):
 
 
 setup(name='rakun2',
-      version='0.29',
+      version='0.30',
       description=
       "RaKUn 2.0; Better faster stronger lighter",
       url='http://github.com/skblaz/rakun2',