Skip to content

Commit 296d4eb

Browse files
authored
feat: Add extractors for keywords, ner & hierarchy (#527)
* feat: Add extractors for keywords, NER & hierarchy. This simplifies the use of KeyBERT for keyword-based links and GLiNER for named-entity-based links. It also makes it easier to create links representing a document and/or page hierarchy.
1 parent c0aa8ea commit 296d4eb

15 files changed

+576
-12
lines changed

libs/knowledge-store/ragstack_knowledge_store/graph_store.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -272,21 +272,18 @@ def _apply_schema(self):
272272
)
273273

274274
# Index on text_embedding (for similarity search)
275-
self._session.execute(
276-
f"""CREATE CUSTOM INDEX IF NOT EXISTS {self._node_table}_text_embedding_index
275+
self._session.execute(f"""
276+
CREATE CUSTOM INDEX IF NOT EXISTS {self._node_table}_text_embedding_index
277277
ON {self._keyspace}.{self._node_table}(text_embedding)
278278
USING 'StorageAttachedIndex';
279-
""" # noqa: E501
280-
)
279+
""") # noqa: E501
281280

282281
# Index on target_text_embedding (for similarity search)
283-
self._session.execute(
284-
f"""
282+
self._session.execute(f"""
285283
CREATE CUSTOM INDEX IF NOT EXISTS {self._targets_table}_target_text_embedding_index
286284
ON {self._keyspace}.{self._targets_table}(target_text_embedding)
287285
USING 'StorageAttachedIndex';
288-
""" # noqa: E501
289-
)
286+
""") # noqa: E501
290287

291288
def _concurrent_queries(self) -> ConcurrentQueries:
    """Build a ConcurrentQueries helper bound to this store's session."""
    return ConcurrentQueries(self._session)

libs/langchain/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ ragstack-ai-tests-utils = { path = "../tests-utils", develop = true }
3939
ragstack-ai-colbert = { path = "../colbert", develop = true }
4040
ragstack-ai-knowledge-store = { path = "../knowledge-store", develop = true }
4141
pytest-asyncio = "^0.23.6"
42+
keybert = "^0.8.5"
43+
gliner = "^0.2.5"
4244

4345
[tool.poetry.group.dev.dependencies]
4446
setuptools = "^70.0.0"
Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,20 @@
"""Public API of the graph-store link extractors package."""

from .gliner_link_extractor import GLiNERInput, GLiNERLinkExtractor
from .hierarchy_link_extractor import HierarchyInput, HierarchyLinkExtractor
from .html_link_extractor import HtmlInput, HtmlLinkExtractor
from .keybert_link_extractor import KeybertInput, KeybertLinkExtractor
# BUG FIX: `LinkExtractor` is exported in `__all__` below, but its import was
# dropped in this change, which breaks `from ... import *` and the package's
# documented API. Re-import it here.
from .link_extractor import LinkExtractor
from .link_extractor_adapter import LinkExtractorAdapter
from .link_extractor_transformer import LinkExtractorTransformer

__all__ = [
    "LinkExtractor",
    "GLiNERInput",
    "GLiNERLinkExtractor",
    "HierarchyInput",
    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
    "KeybertInput",
    "KeybertLinkExtractor",
    "LinkExtractorAdapter",
    "LinkExtractorTransformer",
]
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from typing import Any, Dict, Iterable, List, Optional, Set

from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
from ragstack_langchain.graph_store.links import Link

# `typing.TypeAlias` only exists on Python 3.10+ and the `type` statement
# requires 3.12, so declare the alias with a plain assignment.
GLiNERInput = str


class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Extract named entities from text with GLiNER and emit them as links."""

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract named entities using GLiNER.

        Args:
            labels: List of kinds of entities to extract.
            kind: Kind of links to produce with this extractor.
            model: GLiNER model to use.
            extract_kwargs: Keyword arguments to pass to GLiNER's
                `batch_predict_entities`.

        Raises:
            ImportError: If the optional `gliner` dependency is not installed.
        """
        # Keep only the import inside the `try`: an ImportError raised from
        # inside `from_pretrained` must not be misreported as a missing
        # dependency, and model-loading failures should propagate unchanged.
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._model = GLiNER.from_pretrained(model)
        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:
        """Extract entity links from a single input."""
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Yield one set of entity links per input, in order.

        Accepts plain strings or Document-like objects with `page_content`.
        """
        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
        for entities in self._model.batch_predict_entities(
            strs, self._labels, **self._extract_kwargs
        ):
            # One bidirectional link per entity, tagged by entity text and
            # kinded by "<kind>:<label>" so different labels stay distinct.
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from typing import Callable, List, Set

from langchain_core.documents import Document

from ragstack_langchain.graph_store.links import Link

from .link_extractor import LinkExtractor
from .link_extractor_adapter import LinkExtractorAdapter

# `typing.TypeAlias` only exists on Python 3.10+ and the `type` statement
# requires 3.12, so declare the alias with a plain assignment.
HierarchyInput = List[str]


class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Extract links encoding a document/section hierarchy.

    The input is the path of section titles from the root to the section,
    e.g. ``["chapter", "section", "subsection"]``.
    """

    def __init__(
        self,
        kind: str = "hierarchy",
        up_links: bool = True,
        down_links: bool = False,
        sibling_links: bool = False,
    ):
        """Extract links from a document hierarchy.

        Args:
            kind: Kind of links to produce with this extractor.
            up_links: Link from a section to its parent.
            down_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the same parent.
        """
        self._kind = kind
        self._up_links = up_links
        self._down_links = down_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Lift this extractor to Documents via a hierarchy-accessor function."""
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        # Full path identifies this node; the parent path (computed below)
        # identifies the level above it — empty string for top-level sections.
        this_path = "/".join(input)
        parent_path = None

        links = set()
        if self._up_links:
            # Incoming, so children emitting outgoing "up:<this_path>" links
            # (below) connect to this node.
            links.add(Link.incoming(kind=self._kind, tag=f"up:{this_path}"))
        if self._down_links:
            links.add(Link.outgoing(kind=self._kind, tag=f"down:{this_path}"))

        if len(input) >= 1:
            parent_path = "/".join(input[0:-1])
            if self._up_links and len(input) > 1:
                links.add(Link.outgoing(kind=self._kind, tag=f"up:{parent_path}"))
            if self._down_links and len(input) > 1:
                links.add(Link.incoming(kind=self._kind, tag=f"down:{parent_path}"))
            if self._sibling_links:
                # Bidirectional: all sections sharing a parent share this tag.
                links.add(Link.bidir(kind=self._kind, tag=f"sib:{parent_path}"))

        return links

libs/langchain/ragstack_langchain/graph_store/extractors/html_link_extractor.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
from typing import TYPE_CHECKING, Set, Union
33
from urllib.parse import urldefrag, urljoin, urlparse
44

5+
from langchain_core.documents import Document
6+
57
from ragstack_langchain.graph_store.links import Link
68

79
from .link_extractor import LinkExtractor
10+
from .link_extractor_adapter import LinkExtractorAdapter
811

912
if TYPE_CHECKING:
1013
from bs4 import BeautifulSoup
@@ -77,6 +80,26 @@ def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
7780
self._kind = kind
7881
self.drop_fragments = drop_fragments
7982

83+
def as_document_extractor(
    self, url_metadata_key: str = "source"
) -> LinkExtractor[Document]:
    """Return a LinkExtractor that applies to documents.

    NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with other
    similar link extractors it may be more efficient to call the link
    extractors directly on the parsed BeautifulSoup object.

    Args:
        url_metadata_key: The name of the field in document metadata with
            the URL of the document.
    """
    return LinkExtractorAdapter(
        underlying=self,
        transform=lambda doc: HtmlInput(
            doc.page_content, doc.metadata[url_metadata_key]
        ),
    )
102+
80103
def extract_one(
81104
self,
82105
input: HtmlInput,
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from typing import Any, Dict, Iterable, Optional, Set, Union

from langchain_core.documents import Document

from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
from ragstack_langchain.graph_store.links import Link

# `typing.TypeAlias` only exists on Python 3.10+ and the `type` statement
# requires 3.12, so declare the alias with a plain assignment.
KeybertInput = Union[str, Document]


class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
    """Extract keyword links from strings or Documents using KeyBERT."""

    def __init__(
        self,
        *,
        kind: str = "kw",
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract keywords using Keybert.

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with Keybert.
            extract_keywords_kwargs: Keyword arguments to pass to Keybert's
                `extract_keywords` method.

        Raises:
            ImportError: If the optional `keybert` dependency is not installed.
        """
        # Keep only the import inside the `try`; failures while constructing
        # the model should propagate rather than be reported as a missing
        # dependency.
        try:
            import keybert
        except ImportError:
            raise ImportError(
                "keybert is required for KeybertLinkExtractor. "
                "Please install it with `pip install keybert`."
            ) from None

        self._kw_model = keybert.KeyBERT(model=embedding_model)
        self._kind = kind
        self._extract_keywords_kwargs = extract_keywords_kwargs or {}

    def extract_one(self, input: KeybertInput) -> Set[Link]:
        """Extract keyword links from a single string or Document."""
        keywords = self._kw_model.extract_keywords(
            input if isinstance(input, str) else input.page_content,
            **self._extract_keywords_kwargs,
        )
        # `extract_keywords` returns (keyword, score) pairs; only the keyword
        # itself is used as the link tag.
        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}

    def extract_many(
        self,
        inputs: Iterable[KeybertInput],
    ) -> Iterable[Set[Link]]:
        """Yield one set of keyword links per input, in order.

        BUG FIX: the original called `len()` directly on `inputs`, which
        crashes for non-sequence iterables (e.g. generators) even though the
        signature accepts any Iterable. Materialize to a list first.
        """
        inputs = list(inputs)
        if len(inputs) == 1:
            # Even though we pass a list, if it contains one item, keybert will
            # flatten it. This means it's easier to just call the special case
            # for one item.
            yield self.extract_one(inputs[0])
        elif len(inputs) > 1:
            strs = [i if isinstance(i, str) else i.page_content for i in inputs]
            extracted = self._kw_model.extract_keywords(
                strs, **self._extract_keywords_kwargs
            )
            for keywords in extracted:
                yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}

libs/langchain/ragstack_langchain/graph_store/extractors/link_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
from abc import ABC, abstractmethod
4-
from typing import Generic, Iterable, TypeVar
4+
from typing import Generic, Iterable, Set, TypeVar
55

66
from ragstack_langchain.graph_store.links import Link
77

@@ -24,7 +24,7 @@ def extract_one(self, input: InputT) -> set[Link]:
2424
Set of links extracted from the input.
2525
"""
2626

27-
def extract_many(self, inputs: Iterable[InputT]):
27+
def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
2828
"""Add edges from each `input` to the corresponding documents.
2929
3030
Args:
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from typing import Callable, Iterable, Set, TypeVar

from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
from ragstack_langchain.graph_store.links import Link

InputT = TypeVar("InputT")
UnderlyingInputT = TypeVar("UnderlyingInputT")


class LinkExtractorAdapter(LinkExtractor[InputT]):
    """Adapt a LinkExtractor to a different input type via a transform."""

    def __init__(
        self,
        underlying: LinkExtractor[UnderlyingInputT],
        transform: Callable[[InputT], UnderlyingInputT],
    ) -> None:
        """Create an adapter around `underlying`.

        Args:
            underlying: Extractor operating on the transformed input type.
            transform: Converts an adapter input into the underlying input.
        """
        self._underlying = underlying
        self._transform = transform

    def extract_one(self, input: InputT) -> Set[Link]:
        # BUG FIX: the original called `self.extract_one(...)`, recursing
        # into itself forever (RecursionError on first use). Delegate to the
        # underlying extractor instead.
        return self._underlying.extract_one(self._transform(input))

    def extract_many(self, inputs: Iterable[InputT]) -> Iterable[Set[Link]]:
        underlying_inputs = [self._transform(input) for input in inputs]
        return self._underlying.extract_many(underlying_inputs)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from typing import Iterable, Sequence

from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer

from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
from ragstack_langchain.graph_store.links import add_links


class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer that adds links produced by the given extractors."""

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds the given links.

        Materializes `link_extractors` so the transformer remains usable for
        repeated `transform_documents` calls even when constructed from a
        single-use iterable such as a generator (the original stored the raw
        iterable, which would be silently exhausted after one use).
        """
        self.link_extractors = list(link_extractors)

    def transform_documents(self, documents: Sequence[Document]) -> Sequence[Document]:
        """Add extracted links to each document's metadata and return them."""
        # One iterable of per-document link sets per extractor; the inner zip
        # pivots them so each document is paired with the links produced for
        # it by every extractor.
        links_per_extractor = [
            extractor.extract_many(documents) for extractor in self.link_extractors
        ]
        for document, links in zip(documents, zip(*links_per_extractor)):
            add_links(document, *links)
        return documents

0 commit comments

Comments
 (0)