Skip to content

Commit 3acf35e

Browse files
authored
Merge pull request #76 from FalkorDB/string-source
Add support for string-based source
2 parents 2ef26b2 + 066da8c commit 3acf35e

File tree

4 files changed

+95
-37
lines changed

4 files changed

+95
-37
lines changed

graphrag_sdk/document_loaders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from .csv import CSVLoader
55
from .url import URLLoader
66
from .jsonl import JSONLLoader
7+
from .string import StringLoader
78

89
__all__ = [
910
"PDFLoader",
@@ -12,4 +13,5 @@
1213
"CSVLoader",
1314
"URLLoader",
1415
"JSONLLoader",
16+
"StringLoader",
1517
]
Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
from typing import Iterator
2+
from graphrag_sdk.document import Document
3+
4+
class StringLoader:
    """
    Loader for text that is already held in memory as a Python string.
    """

    def __init__(self, string: str) -> None:
        """
        Initialize the loader.

        Parameters:
            string (str): the in-memory text to expose as a document.
        """
        self.string = string

    def load(self) -> "Iterator[Document]":
        """
        Lazily yield the stored string wrapped in a single Document.

        This is a generator: the Document is only constructed once the
        caller starts iterating over the returned iterator.

        Returns:
            Iterator[Document]: iterator producing exactly one Document
            built from the stored string.
        """
        yield Document(self.string)

graphrag_sdk/source.py

Lines changed: 63 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from typing import Iterator
2-
from abc import ABC, abstractmethod
1+
from abc import ABC
2+
from typing import Optional, Iterator
33
from graphrag_sdk.document import Document
44
from graphrag_sdk.document_loaders import (
55
PDFLoader,
@@ -8,26 +8,25 @@
88
HTMLLoader,
99
CSVLoader,
1010
JSONLLoader,
11+
StringLoader,
1112
)
1213

1314

14-
def Source(path: str, instruction: str | None = None) -> "AbstractSource":
15+
def Source(path: str, instruction: Optional[str] = None) -> "AbstractSource":
1516
"""
1617
Creates a source object
1718
18-
Parameters:
19+
Args:
1920
path (str): path to source
2021
instruction (str): source specific instruction for the LLM
2122
2223
Returns:
23-
AbstractSource: source
24+
AbstractSource: A source object corresponding to the input path format.
2425
"""
2526

2627
if not isinstance(path, str) or path == "":
2728
raise Exception("Invalid argument, path should be a none empty string.")
2829

29-
s = None
30-
3130
if ".pdf" in path.lower():
3231
s = PDF(path)
3332
elif ".html" in path.lower():
@@ -38,33 +37,54 @@ def Source(path: str, instruction: str | None = None) -> "AbstractSource":
3837
s = CSV(path)
3938
elif ".jsonl" in path.lower():
4039
s = JSONL(path)
41-
else:
40+
elif ".txt" in path.lower():
4241
s = TEXT(path)
42+
else:
43+
raise Exception("Unsupported file format.")
4344

4445
# Set source instructions
4546
s.instruction = instruction
4647

4748
return s
4849

50+
def Source_FromRawText(text: str, instruction: Optional[str] = None) -> "AbstractSource":
    """
    Creates a source object from raw text

    Args:
        text (str): raw text; must be a non-empty string
        instruction (str): source specific instruction for the LLM

    Returns:
        AbstractSource: A string source object.

    Raises:
        ValueError: if text is not a string or is an empty string.
    """
    # Validate before constructing anything so bad input fails fast.
    # ValueError is a subclass of Exception, so callers that catch the
    # previously raised bare Exception keep working.
    if not isinstance(text, str) or not text:
        raise ValueError("Invalid argument, text should be a non-empty string.")

    s = STRING(text)
    # The instruction is assigned as given, so it is None when the caller
    # does not supply one.
    s.instruction = instruction

    return s
68+
4969

5070
class AbstractSource(ABC):
5171
"""
5272
Abstract class representing a source file
5373
"""
5474

55-
def __init__(self, path: str):
75+
def __init__(self, data_source: str):
5676
"""
5777
Initializes a new instance of the Source class.
5878
5979
Args:
60-
path (str): The path to the source file.
80+
data_source (str): Either a file path or a string.
6181
6282
Attributes:
63-
path (str): The path to the source file.
64-
loader: The loader object associated with the source file.
83+
data_source (str): The source path for the data or the data as a string.
84+
loader: The loader object associated with the source.
6585
instruction (str): The instruction for the source file.
6686
"""
67-
self.path = path
87+
self.data_source = data_source
6888
self.loader = None
6989
self.instruction = ""
7090

@@ -90,73 +110,82 @@ def __eq__(self, other) -> bool:
90110
if not isinstance(other, AbstractSource):
91111
return False
92112

93-
return self.path == other.path
113+
return self.data_source == other.data_source
94114

95115
def __hash__(self):
96116
"""
97-
Calculates the hash value of the Source object based on its path.
117+
Calculates the hash value of the Source object based on its data_source.
98118
99119
Returns:
100120
int: The hash value of the Source object.
101121
"""
102-
return hash(self.path)
122+
return hash(self.data_source)
103123

104124

105125
class PDF(AbstractSource):
106126
"""
107127
PDF resource
108128
"""
109129

110-
def __init__(self, path):
111-
super().__init__(path)
112-
self.loader = PDFLoader(self.path)
130+
def __init__(self, data_source):
131+
super().__init__(data_source)
132+
self.loader = PDFLoader(self.data_source)
113133

114134

115135
class TEXT(AbstractSource):
116136
"""
117137
TEXT resource
118138
"""
119139

120-
def __init__(self, path):
121-
super().__init__(path)
122-
self.loader = TextLoader(self.path)
140+
def __init__(self, data_source):
141+
super().__init__(data_source)
142+
self.loader = TextLoader(self.data_source)
123143

124144

125145
class URL(AbstractSource):
126146
"""
127147
URL resource
128148
"""
129149

130-
def __init__(self, path):
131-
super().__init__(path)
132-
self.loader = URLLoader(self.path)
150+
def __init__(self, data_source):
151+
super().__init__(data_source)
152+
self.loader = URLLoader(self.data_source)
133153

134154

135155
class HTML(AbstractSource):
136156
"""
137157
HTML resource
138158
"""
139159

140-
def __init__(self, path):
141-
super().__init__(path)
142-
self.loader = HTMLLoader(self.path)
160+
def __init__(self, data_source):
161+
super().__init__(data_source)
162+
self.loader = HTMLLoader(self.data_source)
143163

144164

145165
class CSV(AbstractSource):
146166
"""
147167
CSV resource
148168
"""
149169

150-
def __init__(self, path, rows_per_document: int = 50):
151-
super().__init__(path)
152-
self.loader = CSVLoader(self.path, rows_per_document)
170+
def __init__(self, data_source, rows_per_document: int = 50):
171+
super().__init__(data_source)
172+
self.loader = CSVLoader(self.data_source, rows_per_document)
153173

154174

155175
class JSONL(AbstractSource):
156176
"""
157177
JSONL resource
158178
"""
159179

160-
def __init__(self, path, rows_per_document: int = 50):
161-
super().__init__(path)
162-
self.loader = JSONLLoader(self.path, rows_per_document)
180+
def __init__(self, data_source, rows_per_document: int = 50):
181+
super().__init__(data_source)
182+
self.loader = JSONLLoader(self.data_source, rows_per_document)
183+
184+
class STRING(AbstractSource):
    """
    Source backed by a raw in-memory string instead of a file path.
    """

    def __init__(self, data_source: str):
        # The base initializer records the text as self.data_source; the
        # loader is then built from that stored value.
        super().__init__(data_source)
        stored_text = self.data_source
        self.loader = StringLoader(stored_text)

tests/test_kg_litellm_openai.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from falkordb import FalkorDB
55
from dotenv import load_dotenv
66
from graphrag_sdk.entity import Entity
7-
from graphrag_sdk.source import Source
7+
from graphrag_sdk.source import Source_FromRawText
88
from graphrag_sdk.relation import Relation
99
from graphrag_sdk.ontology import Ontology
1010
from graphrag_sdk.attribute import Attribute, AttributeType
@@ -78,8 +78,10 @@ def setUpClass(cls):
7878
def test_kg_creation(self):
7979

8080
file_path = "tests/data/madoff.txt"
81-
82-
sources = [Source(file_path)]
81+
with open(file_path) as f:
82+
string = f.read()
83+
84+
sources = [Source_FromRawText(string)]
8385

8486
self.kg.process_sources(sources)
8587

0 commit comments

Comments
 (0)