
Commit 38f9147

Initial Commit
1 parent 1a866c6 commit 38f9147

9 files changed: +1181 −635 lines


.github/workflows/release.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@ jobs:
     # See here: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment
     environment:
       name: pypi
-      url: https://pypi.org/p/nodestream-plugin-dotenv/
+      url: https://pypi.org/p/nodestream-plugin-semantic/

     steps:
       # Checkout the repository subject to the release.
```

README.md

Lines changed: 1 addition & 14 deletions

````diff
@@ -1,15 +1,2 @@
-# Nodestream Dotenv Plugin
+# Nodestream Semantic Plugin
 
-This plugin allows you to load environment variables from a `.env` file into your nodestream application.
-
-## Installation
-
-```bash
-pip install nodestream-plugin-dotenv
-```
-
-## Usage
-
-By default, the plugin will look for a `.env` file in the current working directory. You can specify a different path by setting the `NODESTREAM_DOTENV_PATH` environment variable.
-
-For more information on how to use the `.env` file, see the [python-dotenv](https://github.com/theskumar/python-dotenv#file-format) documentation.
````
Lines changed: 37 additions & 0 deletions (new file)

```python
from abc import ABC, abstractmethod
from typing import Iterable

from nodestream.subclass_registry import SubclassRegistry
from nodestream.pluggable import Pluggable

from .model import Content

CHUNKER_SUBCLASS_REGISTRY = SubclassRegistry()


@CHUNKER_SUBCLASS_REGISTRY.connect_baseclass
class Chunker(ABC, Pluggable):
    """A mechanism to split a large document into smaller chunks.

    Chunking is useful when a document is too large to be semantically
    meaningful as one piece of content.
    """

    entrypoint_name = "chunkers"

    @staticmethod
    def from_file_data(type, **chunker_kwargs) -> "Chunker":
        return CHUNKER_SUBCLASS_REGISTRY.get(type)(**chunker_kwargs)

    @abstractmethod
    def chunk(self, content: Content) -> Iterable[Content]: ...


class SplitOnDelimiterChunker(Chunker):
    def __init__(self, delimiter: str):
        self.delimiter = delimiter

    def chunk(self, content: Content) -> Iterable[Content]:
        for item in content.split_on_delimiter(self.delimiter):
            yield item
```
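
As a quick illustration of how the registry-backed factory is meant to be used, here is a minimal sketch. It assumes `SplitOnDelimiterChunker` is registered under its class name (the registry default unless an alias is declared) and uses the `Content` model defined further down in this commit:

```python
# Minimal usage sketch: build a chunker from file-style config data,
# then split a document on blank lines.
chunker = Chunker.from_file_data(
    type="SplitOnDelimiterChunker", delimiter="\n\n"
)

document = Content.from_text("First paragraph.\n\nSecond paragraph.")
for chunk in chunker.chunk(document):
    print(chunk.id[:8], repr(chunk.content))
```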
Lines changed: 61 additions & 0 deletions (new file)

```python
from abc import ABC, abstractmethod
from typing import Iterable
from pathlib import Path

from nodestream.subclass_registry import SubclassRegistry
from nodestream.pluggable import Pluggable


CONTENT_TYPE_SUBCLASS_REGISTRY = SubclassRegistry()
PLAIN_TEXT_ALIAS = "plain_text"
PLAIN_TEXT_EXTENSIONS = {".txt", ".md"}


@CONTENT_TYPE_SUBCLASS_REGISTRY.connect_baseclass
class ContentType(ABC, Pluggable):  # Pluggable provides the import_all() used below.
    """Describes the mechanism to read a file of a specific content type."""

    @classmethod
    def all(cls) -> Iterable["ContentType"]:
        cls.import_all()  # Import all content types to register them.
        for sub in CONTENT_TYPE_SUBCLASS_REGISTRY.all_subclasses():
            yield sub()

    @classmethod
    def by_name(cls, name: str) -> "ContentType":
        cls.import_all()  # Import all content types to register them.
        return CONTENT_TYPE_SUBCLASS_REGISTRY.get(name)()

    @abstractmethod
    def is_supported(self, file_path: Path) -> bool:
        """Returns True if the file extension is supported.

        Args:
            file_path (Path): The file path to check.

        Returns:
            bool: True if the file extension is supported, False otherwise.
        """
        ...

    @abstractmethod
    def read(self, file_path: Path) -> str:
        """Reads the content of the file.

        Args:
            file_path (Path): The file path to read.

        Returns:
            str: The content of the file.
        """
        ...


class PlainText(ContentType, alias=PLAIN_TEXT_ALIAS):
    """Reads plain text files."""

    def is_supported(self, file_path: Path) -> bool:
        return file_path.suffix in PLAIN_TEXT_EXTENSIONS

    def read(self, file_path: Path) -> str:
        with file_path.open("r") as f:
            return f.read()
```
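
For context, reading a file through the registry might look like the following sketch; the `notes.md` path is hypothetical:

```python
from pathlib import Path

# Look up the reader by its declared alias, then guard on extension
# support before reading.
reader = ContentType.by_name(PLAIN_TEXT_ALIAS)
path = Path("notes.md")  # hypothetical file
if reader.is_supported(path):
    text = reader.read(path)
```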
Lines changed: 33 additions & 0 deletions (new file)

```python
from abc import ABC, abstractmethod

from nodestream.subclass_registry import SubclassRegistry
from nodestream.pluggable import Pluggable

from .model import Content, Embedding


EMBEDDER_SUBCLASS_REGISTRY = SubclassRegistry()


@EMBEDDER_SUBCLASS_REGISTRY.connect_baseclass
class Embedder(ABC, Pluggable):
    """Embedder is a mechanism to embed content into a vector space."""

    entrypoint_name = "embedders"

    @classmethod
    def from_file_data(cls, type, **embedder_kwargs) -> "Embedder":
        cls.import_all()  # Import all embedders to register them.
        return EMBEDDER_SUBCLASS_REGISTRY.get(type)(**embedder_kwargs)

    @abstractmethod
    async def embed(self, content: Content) -> Embedding:
        """Embeds the content into a vector space.

        Args:
            content (Content): The content to embed.

        Returns:
            Embedding: The embedding of the content.
        """
        ...
```
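
A concrete embedder only needs to implement `embed`. The `StaticEmbedder` below is an illustrative sketch, not part of the plugin; a real implementation would call out to an embedding model or API:

```python
class StaticEmbedder(Embedder):
    """Illustrative embedder that derives a small fixed-size vector
    from the content's sha1-based id. For demonstration only."""

    def __init__(self, dimensions: int = 8):
        self.dimensions = dimensions

    async def embed(self, content: Content) -> Embedding:
        # Content ids produced by Content.from_text are sha1 hex digests,
        # so each character maps to a value in [0, 1].
        return [int(content.id[i], 16) / 15.0 for i in range(self.dimensions)]
```

Because the base class is connected to the subclass registry, a class defined this way should also be constructible via `Embedder.from_file_data(type="StaticEmbedder", dimensions=8)`, again assuming class-name registration.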
Lines changed: 73 additions & 0 deletions (new file)

```python
import hashlib
from dataclasses import dataclass
from typing import List, Optional, Iterable

from nodestream.model import DesiredIngestion, Node


Embedding = List[float | int]
CONTENT_NODE_TYPE_ID_PROPERTY = "id"


def hash(content: str) -> str:  # Note: shadows the builtin hash().
    sha1 = hashlib.sha1()
    sha1.update(content.encode())
    return sha1.hexdigest()


@dataclass(slots=True)
class Content:
    """A piece of text that can be embedded into a vector space."""

    id: str
    content: str
    parent: Optional["Content"] = None
    embedding: Optional[Embedding] = None
    metadata: Optional[dict] = None

    @classmethod
    def from_text(
        cls,
        content: str,
        parent: Optional["Content"] = None,
    ) -> "Content":
        return cls(id=hash(content), content=content, parent=parent)

    def add_metadata(self, key: str, value: str):
        if not self.metadata:
            self.metadata = {}
        self.metadata[key] = value

    def split_on_delimiter(self, delimiter: str) -> Iterable["Content"]:
        for line in self.content.split(delimiter):
            yield Content.from_text(line, parent=self)

    def assign_embedding(self, embedding: Embedding):
        self.embedding = embedding

    def apply_to_node(self, node_type: str, node: Node):
        node.type = node_type
        node.key_values.set_property(CONTENT_NODE_TYPE_ID_PROPERTY, self.id)
        node.properties.set_property("content", self.content)
        if self.embedding:
            node.properties.set_property("embedding", self.embedding)
        if self.metadata:
            for key, value in self.metadata.items():
                node.properties.set_property(key, value)

    def make_ingestible(
        self, node_type: str, relationship_type: str
    ) -> DesiredIngestion:
        ingest = DesiredIngestion()
        self.apply_to_node(node_type, ingest.source)

        if self.parent:
            self.parent.apply_to_node(node_type, related := Node())
            ingest.add_relationship(
                related_node=related, type=relationship_type, outbound=False
            )

        return ingest
```
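
Tying the pieces together, a hypothetical end-to-end flow over these classes might look like this; the node and relationship type names are illustrative, and `StaticEmbedder` is the sketch from above:

```python
import asyncio


async def main():
    document = Content.from_text("First paragraph.\n\nSecond paragraph.")
    chunker = SplitOnDelimiterChunker(delimiter="\n\n")
    embedder = StaticEmbedder()  # illustrative embedder, not part of the plugin

    for chunk in chunker.chunk(document):
        chunk.assign_embedding(await embedder.embed(chunk))
        # Each chunk becomes a node keyed by its content hash, with an
        # inbound relationship from its parent document's node.
        ingest = chunk.make_ingestible(
            node_type="DocumentChunk", relationship_type="CHILD_OF"
        )


asyncio.run(main())
```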
