Skip to content

Commit 312ebb2

Browse files
authored
Data refactor (#253)
1 parent 86ebdeb commit 312ebb2

File tree

40 files changed

+975
-2280
lines changed

40 files changed

+975
-2280
lines changed

llmstack/apps/tasks.py

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,89 +0,0 @@
1-
import logging
2-
3-
import weaviate
4-
5-
from llmstack.data.datasource_processor import DataSourceProcessor
6-
from llmstack.data.models import DataSource, DataSourceEntry, DataSourceEntryStatus
7-
from llmstack.data.types import DataSourceTypeFactory
8-
9-
logger = logging.getLogger(__name__)
10-
11-
12-
def delete_data_entry_task(
    datasource: DataSource,
    entry_data: DataSourceEntry,
):
    """Delete a data source entry and update the owning data source's size.

    The entry is first persisted as MARKED_FOR_DELETION, then the handler
    class resolved for ``datasource.type`` is asked to delete the entry's
    stored items, and finally the entry row itself is deleted.

    Args:
        datasource: Data source that owns the entry; its ``size`` is reduced
            by the entry's size once the entry has actually been removed.
        entry_data: Entry to delete.

    Failure handling:
        * ``weaviate.exceptions.UnexpectedStatusCodeException`` is logged and
          the entry row is deleted anyway.
        * Any other exception is logged, the entry is saved as FAILED with an
          error message in ``config``, and the data source size is left
          untouched because the entry still exists.
    """
    # Routine progress message: info level, with lazy %-style arguments.
    logger.info("Deleting data_source_entry: %s", entry_data.uuid)
    entry_data.status = DataSourceEntryStatus.MARKED_FOR_DELETION
    entry_data.save()

    datasource_entry_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
        datasource.type,
    )
    datasource_entry_handler = datasource_entry_handler_cls(datasource)

    # Capture the size up front; it is only subtracted when the entry is gone.
    entry_size = entry_data.size
    entry_removed = False
    try:
        datasource_entry_items = datasource_entry_handler.delete_entry(
            entry_data.config,
        )
        if datasource_entry_items:
            logger.debug(
                "Deleted %d items from weaviate for data_source_entry: %s",
                len(datasource_entry_items),
                entry_data.uuid,
            )
        # NOTE(review): the entry is removed whether or not the handler
        # reported any deleted items - confirm against the original nesting.
        entry_data.delete()
        entry_removed = True
    except weaviate.exceptions.UnexpectedStatusCodeException:
        # The vector store rejected the request; drop the entry row anyway.
        logger.exception("Error deleting data source entry from weaviate")
        entry_data.delete()
        entry_removed = True
    except Exception:
        logger.exception(
            "Error deleting data_source_entry: %s",
            entry_data.name,
        )
        entry_data.status = DataSourceEntryStatus.FAILED
        entry_data.config = {
            "errors": {
                "message": "Error in deleting data source entry",
            },
        }
        entry_data.save()

    if entry_removed:
        # Only shrink the data source when the entry no longer exists, so a
        # failed deletion does not leave the aggregate size under-reported.
        datasource.size -= entry_size
    datasource.save()
51-
52-
53-
def resync_data_entry_task(
    datasource: DataSource,
    entry_data: DataSourceEntry,
):
    """Re-run processing for an existing entry and refresh size bookkeeping.

    The entry is saved as PROCESSING, the handler resolved for
    ``datasource.type`` resyncs it from ``entry_data.config``, and the entry
    is then saved as READY with the new size and config. The original
    ``config["input"]`` is carried over into the new config. The data
    source's size is adjusted by the difference between old and new size.

    Args:
        datasource: Data source that owns the entry.
        entry_data: Entry to resync.

    Raises:
        Exception: Whatever the resync step raises is re-raised after the
            entry has been saved as FAILED, so it is not left in PROCESSING.
    """
    logger.info("Resyncing task for data_source_entry: %s", entry_data)

    datasource_entry_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
        datasource.type,
    )
    datasource_entry_handler: DataSourceProcessor = datasource_entry_handler_cls(
        datasource,
    )
    entry_data.status = DataSourceEntryStatus.PROCESSING
    entry_data.save()
    old_size = entry_data.size

    try:
        result = datasource_entry_handler.resync_entry(entry_data.config)
        config_entry = result.config
        # Keep the original input so the entry can be resynced again later.
        config_entry["input"] = entry_data.config["input"]
    except Exception:
        # Without this the entry would stay in PROCESSING after a failure.
        logger.exception("Error resyncing data_source_entry: %s", entry_data)
        entry_data.status = DataSourceEntryStatus.FAILED
        entry_data.save()
        raise

    entry_data.size = result.size
    entry_data.config = config_entry
    entry_data.status = DataSourceEntryStatus.READY
    entry_data.save()

    # Swap the entry's old contribution to the total for the new one.
    datasource.size = datasource.size - old_size + result.size
    datasource.save()
79-
80-
81-
def delete_data_source_task(datasource):
    """Delete every entry held by a data source.

    Data sources whose type reports ``is_external_datasource`` are left
    alone. For all others, the handler class resolved for the type is
    instantiated with the data source and asked to delete all entries.
    """
    source_type = datasource.type
    if not source_type.is_external_datasource:
        handler_cls = DataSourceTypeFactory.get_datasource_type_handler(source_type)
        handler_cls(datasource).delete_all_entries()

llmstack/client/src/components/datasource/DataSourceEntryContent.jsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ function DataSourceEntryContent({ onCancel, dataSourceEntry, open }) {
2121
axios()
2222
.get(`/api/datasource_entries/${dataSourceEntry.uuid}/text_content`)
2323
.then((response) => {
24+
console.log(response);
2425
setData(
2526
<TextareaAutosize
2627
value={response.data?.content}

llmstack/common/blocks/data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55

66
class DataDocument(BaseSchema):
7+
name: Optional[str] = None
78
content: Optional[bytes] = None
89
content_text: Optional[str] = None
910
metadata: Dict[str, Any] = {}

llmstack/common/blocks/data/source/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
class DataSourceEnvironmentSchema(BaseSchema):
8-
openai_key: str
8+
openai_key: Optional[str] = None
99

1010

1111
class DataSourceInputSchema(BaseSchema):

llmstack/common/blocks/data/source/uri.py

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,7 @@ class UriConfiguration(DataSourceConfigurationSchema):
9999
use_scrapy: bool = False
100100

101101

102-
class Uri(
103-
ProcessorInterface[
104-
UriInput,
105-
DataSourceOutputSchema,
106-
UriConfiguration,
107-
],
108-
):
102+
class Uri(ProcessorInterface[UriInput, DataSourceOutputSchema, UriConfiguration]):
109103
def _extract_text(
110104
self,
111105
data: bytes,
@@ -140,27 +134,12 @@ def _extract_text(
140134
else:
141135
raise Exception("Invalid mime type")
142136

143-
def process_data_url(
144-
self,
145-
input: UriInput,
146-
configuration: UriConfiguration,
147-
) -> DataSourceOutputSchema:
148-
mime_type, file_name, base64_encoded_data = validate_parse_data_uri(
149-
input.uri,
150-
)
137+
def process_data_url(self, input: UriInput, configuration: UriConfiguration) -> DataSourceOutputSchema:
138+
mime_type, file_name, base64_encoded_data = validate_parse_data_uri(input.uri)
151139
decoded_data = base64.b64decode(base64_encoded_data)
152-
return self._extract_text(
153-
decoded_data,
154-
mime_type,
155-
file_name,
156-
configuration,
157-
)
158-
159-
def process_http_url(
160-
self,
161-
input: UriInput,
162-
configuration: UriConfiguration,
163-
) -> DataSourceOutputSchema:
140+
return self._extract_text(decoded_data, mime_type, file_name, configuration)
141+
142+
def process_http_url(self, input: UriInput, configuration: UriConfiguration) -> DataSourceOutputSchema:
164143
data = None
165144
if is_youtube_video_url(input.uri):
166145
raise Exception("Youtube video URLs are not supported")
@@ -197,11 +176,7 @@ def process_http_url(
197176

198177
return self._extract_text(data, mime_type, input.uri, configuration)
199178

200-
def process(
201-
self,
202-
input: UriInput,
203-
configuration: UriConfiguration,
204-
) -> DataSourceOutputSchema:
179+
def process(self, input: UriInput, configuration: UriConfiguration) -> DataSourceOutputSchema:
205180
if input.uri.startswith("data:"):
206181
return self.process_data_url(input, configuration)
207182
elif input.uri.startswith("http://") or input.uri.startswith("https://"):
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: File
2+
slug: file
3+
description: Read data from a file
4+
pipeline:
5+
source:
6+
slug: file
7+
provider_slug: promptly
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: PDF
2+
slug: pdf
3+
description: Read data from a pdf
4+
pipeline:
5+
source:
6+
slug: pdf
7+
provider_slug: promptly
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: Singlestore
2+
slug: singlestore
3+
description: Read data from Singlestore
4+
pipeline:
5+
destination:
6+
slug: singlestore
7+
provider_slug: singlestore
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: Text
2+
slug: text
3+
description: Read data from a text box
4+
pipeline:
5+
source:
6+
slug: text
7+
provider_slug: promptly
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: URL
2+
slug: url
3+
description: Read data from a website URL
4+
pipeline:
5+
source:
6+
slug: url
7+
provider_slug: promptly

0 commit comments

Comments
 (0)