
Commit 1ac897e

Bump versions again to get LangChain extractors
1 parent a158cd1 commit 1ac897e

15 files changed: +80 -572 lines changed

libs/e2e-tests/pyproject.llamaindex.toml

Lines changed: 3 additions & 3 deletions

@@ -42,9 +42,9 @@ llama-index-multi-modal-llms-gemini = { git = "https://github.com/run-llama/llam

 llama-parse = { git = "https://github.com/run-llama/llama_parse.git", branch = "main" }

-langchain = "0.2.7"
-langchain-core = "0.2.12"
-langchain-community = "0.2.7"
+langchain = "0.2.10"
+langchain-core = "0.2.22"
+langchain-community = "0.2.9"
 langchain-astradb = "0.3.3"
 langchain-openai = "0.1.8"
 langchain-google-genai = { version = "1.0.6" }

libs/langchain/pyproject.toml

Lines changed: 3 additions & 3 deletions

@@ -18,9 +18,9 @@ ragstack-ai-colbert = { version = "1.0.5", optional = true }
 ragstack-ai-knowledge-store = { version = "0.1.0", optional = true }

 # langchain
-langchain = "0.2.7"
-langchain-core = "0.2.12"
-langchain-community = "0.2.7"
+langchain = "0.2.10"
+langchain-core = "0.2.22"
+langchain-community = "0.2.9"
 langchain-astradb = "0.3.3"
 langchain-openai = "0.1.8"
 langchain-google-genai = { version = "1.0.6", optional = true }
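
As a quick sanity check (a sketch for illustration, not part of this commit), the bumped langchain-community release should expose the upstreamed extractors that the files below switch to:

# Minimal smoke test: verifies the pinned langchain-community (0.2.9) exposes
# the graph-vectorstore extractors this commit re-exports. Imports only; the
# optional gliner/keybert/beautifulsoup4 dependencies are not needed for this.
from langchain_community.graph_vectorstores.extractors import (
    GLiNERLinkExtractor,
    HierarchyLinkExtractor,
    HtmlLinkExtractor,
    KeybertLinkExtractor,
)

print("langchain-community extractors import OK")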

libs/langchain/ragstack_langchain/graph_store/cassandra.py

Lines changed: 0 additions & 1 deletion

@@ -2,7 +2,6 @@
     CassandraGraphVectorStore as CassandraGraphStore,
 )

-
 __all__ = [
     "CassandraGraphStore",
 ]

Lines changed: 9 additions & 57 deletions

@@ -1,57 +1,9 @@
-from typing import Any, Dict, Iterable, List, Optional, Set
-
-from langchain_core.graph_vectorstores import Link
-
-from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-GLiNERInput = str
-
-
-class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
-    def __init__(
-        self,
-        labels: List[str],
-        *,
-        kind: str = "entity",
-        model: str = "urchade/gliner_mediumv2.1",
-        extract_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """Extract entities using GLiNER.
-
-        Args:
-            kind: Kind of links to produce with this extractor.
-            labels: List of kinds of entities to extract.
-            model: GLiNER model to use.
-            extract_kwargs: Keyword arguments to pass to GLiNER.
-        """
-        try:
-            from gliner import GLiNER
-
-            self._model = GLiNER.from_pretrained(model)
-
-        except ImportError:
-            raise ImportError(
-                "gliner is required for GLiNERLinkExtractor. "
-                "Please install it with `pip install gliner`."
-            ) from None
-
-        self._labels = labels
-        self._kind = kind
-        self._extract_kwargs = extract_kwargs or {}
-
-    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
-        return next(self.extract_many([input]))
-
-    def extract_many(
-        self,
-        inputs: Iterable[GLiNERInput],
-    ) -> Iterable[Set[Link]]:
-        strs = [i if isinstance(i, str) else i.page_content for i in inputs]
-        for entities in self._model.batch_predict_entities(
-            strs, self._labels, **self._extract_kwargs
-        ):
-            yield {
-                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
-                for e in entities
-            }
+from langchain_community.graph_vectorstores.extractors import (
+    GLiNERInput,
+    GLiNERLinkExtractor,
+)
+
+__all__ = [
+    "GLiNERInput",
+    "GLiNERLinkExtractor",
+]
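
For context, a minimal usage sketch of the replacement import (illustration only; it assumes the upstreamed class keeps the constructor and extract_one() signature of the code deleted above):

from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor

# Assumes `pip install gliner`; the named model is fetched on first use.
extractor = GLiNERLinkExtractor(labels=["person", "organization"])
links = extractor.extract_one("Marie Curie worked at the University of Paris.")
# Each detected entity becomes a bidirectional Link tagged "entity:<label>".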

Lines changed: 9 additions & 61 deletions

@@ -1,61 +1,9 @@
-from typing import Callable, List, Set
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from .link_extractor import LinkExtractor
-from .link_extractor_adapter import LinkExtractorAdapter
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-HierarchyInput = List[str]
-
-
-class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
-    def __init__(
-        self,
-        kind: str = "hierarchy",
-        up_links: bool = True,
-        down_links: bool = False,
-        sibling_links: bool = False,
-    ):
-        """Extract links from a document hierarchy.
-
-        Args:
-            kind: Kind of links to produce with this extractor.
-            up_links: Link from a section to its parent.
-            down_links: Link from a section to its children.
-            sibling_links: Link from a section to other sections with the same parent.
-        """
-        self._kind = kind
-        self._up_links = up_links
-        self._down_links = down_links
-        self._sibling_links = sibling_links
-
-    def as_document_extractor(
-        self, hierarchy: Callable[[Document], HierarchyInput]
-    ) -> LinkExtractor[Document]:
-        return LinkExtractorAdapter(underlying=self, transform=hierarchy)
-
-    def extract_one(
-        self,
-        input: HierarchyInput,  # noqa: A002
-    ) -> Set[Link]:
-        this_path = "/".join(input)
-        parent_path = None
-
-        links = set()
-        if self._up_links:
-            links.add(Link.incoming(kind=self._kind, tag=f"up:{this_path}"))
-        if self._down_links:
-            links.add(Link.outgoing(kind=self._kind, tag=f"down:{this_path}"))
-
-        if len(input) >= 1:
-            parent_path = "/".join(input[0:-1])
-            if self._up_links and len(input) > 1:
-                links.add(Link.outgoing(kind=self._kind, tag=f"up:{parent_path}"))
-            if self._down_links and len(input) > 1:
-                links.add(Link.incoming(kind=self._kind, tag=f"down:{parent_path}"))
-            if self._sibling_links:
-                links.add(Link.bidir(kind=self._kind, tag=f"sib:{parent_path}"))
-
-        return links
+from langchain_community.graph_vectorstores.extractors import (
+    HierarchyInput,
+    HierarchyLinkExtractor,
+)
+
+__all__ = [
+    "HierarchyInput",
+    "HierarchyLinkExtractor",
+]
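
A similar sketch for the hierarchy extractor (illustration only, same signature assumption as above); the input is a section path given as a list of strings:

from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor

extractor = HierarchyLinkExtractor(up_links=True, sibling_links=True)
links = extractor.extract_one(["root", "chapter", "section"])
# Produces an incoming "up:root/chapter/section" link, an outgoing
# "up:root/chapter" link to the parent, and a bidirectional
# "sib:root/chapter" link shared with sibling sections.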

Lines changed: 9 additions & 119 deletions

@@ -1,119 +1,9 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Set, Union
-from urllib.parse import urldefrag, urljoin, urlparse
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from .link_extractor import LinkExtractor
-from .link_extractor_adapter import LinkExtractorAdapter
-
-if TYPE_CHECKING:
-    from bs4 import BeautifulSoup
-
-
-def _parse_url(link, page_url, drop_fragments: bool = True):
-    href = link.get("href")
-    if href is None:
-        return None
-    url = urlparse(href)
-    if url.scheme not in ["http", "https", ""]:
-        return None
-
-    # Join the HREF with the page_url to convert relative paths to absolute.
-    url = urljoin(page_url, href)
-
-    # Fragments would be useful if we chunked a page based on section.
-    # Then, each chunk would have a different URL based on the fragment.
-    # Since we aren't doing that yet, they just "break" links. So, drop
-    # the fragment.
-    if drop_fragments:
-        return urldefrag(url).url
-    return url
-
-
-def _parse_hrefs(
-    soup: "BeautifulSoup", url: str, drop_fragments: bool = True
-) -> Set[str]:
-    links = soup.find_all("a")
-    links = {
-        _parse_url(link, page_url=url, drop_fragments=drop_fragments) for link in links
-    }
-
-    # Remove entries for any 'a' tag that failed to parse (didn't have href,
-    # or invalid domain, etc.)
-    links.discard(None)
-
-    # Remove self links.
-    links.discard(url)
-
-    return links
-
-
-@dataclass
-class HtmlInput:
-    content: Union[str, "BeautifulSoup"]
-    base_url: str
-
-
-class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
-    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
-        """Extract hyperlinks from HTML content.
-
-        Expects the input to be an HTML string or a `BeautifulSoup` object.
-
-        Args:
-            kind: The kind of edge to extract. Defaults to "hyperlink".
-            drop_fragments: Whether fragments in URLs and links should be
-                dropped. Defaults to `True`.
-        """
-        try:
-            import bs4  # noqa:F401
-        except ImportError as e:
-            raise ImportError(
-                "BeautifulSoup4 is required for HtmlLinkExtractor. "
-                "Please install it with `pip install beautifulsoup4`."
-            ) from e
-
-        self._kind = kind
-        self.drop_fragments = drop_fragments
-
-    def as_document_extractor(
-        self, url_metadata_key: str = "source"
-    ) -> LinkExtractor[Document]:
-        """Return a LinkExtractor that applies to documents.
-
-        NOTE: Since the HtmlLinkExtractor parses HTML, if you use it with other
-        similar link extractors it may be more efficient to call the link
-        extractors directly on the parsed BeautifulSoup object.
-
-        Args:
-            url_metadata_key: The name of the field in document metadata with
-                the URL of the document.
-        """
-        return LinkExtractorAdapter(
-            underlying=self,
-            transform=lambda doc: HtmlInput(
-                doc.page_content, doc.metadata[url_metadata_key]
-            ),
-        )
-
-    def extract_one(
-        self,
-        input: HtmlInput,  # noqa: A002
-    ) -> Set[Link]:
-        content = input.content
-        if isinstance(content, str):
-            from bs4 import BeautifulSoup
-
-            content = BeautifulSoup(content, "html.parser")
-
-        base_url = input.base_url
-        if self.drop_fragments:
-            base_url = urldefrag(base_url).url
-
-        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)
-
-        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
-        links.add(Link.incoming(kind=self._kind, tag=base_url))
-        return links
+from langchain_community.graph_vectorstores.extractors import (
+    HtmlInput,
+    HtmlLinkExtractor,
+)
+
+__all__ = [
+    "HtmlInput",
+    "HtmlLinkExtractor",
+]
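
And for the HTML extractor (illustration only, same signature assumption; requires the optional beautifulsoup4 package):

from langchain_community.graph_vectorstores.extractors import (
    HtmlInput,
    HtmlLinkExtractor,
)

# Assumes `pip install beautifulsoup4`.
extractor = HtmlLinkExtractor()
html = '<a href="/docs/intro">Intro</a>'
links = extractor.extract_one(HtmlInput(html, "https://example.com/"))
# Yields an outgoing "hyperlink" link for https://example.com/docs/intro and
# an incoming link tagged with the page's own URL.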

Lines changed: 9 additions & 63 deletions

@@ -1,63 +1,9 @@
-from typing import Any, Dict, Iterable, Optional, Set, Union
-
-from langchain_core.documents import Document
-from langchain_core.graph_vectorstores import Link
-
-from ragstack_langchain.graph_store.extractors.link_extractor import LinkExtractor
-
-# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
-KeybertInput = Union[str, Document]
-
-
-class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
-    def __init__(
-        self,
-        *,
-        kind: str = "kw",
-        embedding_model: str = "all-MiniLM-L6-v2",
-        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        """Extract keywords using KeyBERT.
-
-        Args:
-            kind: Kind of links to produce with this extractor.
-            embedding_model: Name of the embedding model to use with KeyBERT.
-            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
-                `extract_keywords` method.
-        """
-        try:
-            import keybert
-
-            self._kw_model = keybert.KeyBERT(model=embedding_model)
-        except ImportError:
-            raise ImportError(
-                "keybert is required for KeybertLinkExtractor. "
-                "Please install it with `pip install keybert`."
-            ) from None
-
-        self._kind = kind
-        self._extract_keywords_kwargs = extract_keywords_kwargs or {}
-
-    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
-        keywords = self._kw_model.extract_keywords(
-            input if isinstance(input, str) else input.page_content,
-            **self._extract_keywords_kwargs,
-        )
-        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
-
-    def extract_many(
-        self,
-        inputs: Iterable[KeybertInput],
-    ) -> Iterable[Set[Link]]:
-        if len(inputs) == 1:
-            # Even though we pass a list, if it contains one item, keybert will
-            # flatten it. This means it's easier to just call the special case
-            # for one item.
-            yield self.extract_one(inputs[0])
-        elif len(inputs) > 1:
-            strs = [i if isinstance(i, str) else i.page_content for i in inputs]
-            extracted = self._kw_model.extract_keywords(
-                strs, **self._extract_keywords_kwargs
-            )
-            for keywords in extracted:
-                yield {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}
+from langchain_community.graph_vectorstores.extractors import (
+    KeybertInput,
+    KeybertLinkExtractor,
+)
+
+__all__ = [
+    "KeybertInput",
+    "KeybertLinkExtractor",
+]
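
Finally, a sketch for the KeyBERT extractor (illustration only, same signature assumption; requires the optional keybert package):

from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor

# Assumes `pip install keybert`; the default all-MiniLM-L6-v2 embedding model
# is downloaded on first use.
extractor = KeybertLinkExtractor()
links = extractor.extract_one(
    "Graph vector stores link documents that share extracted keywords."
)
# Each extracted keyword becomes a bidirectional Link of kind "kw".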
