@@ -1,5 +1,5 @@
 import logging
-from typing import List
+from typing import Any, List
 from typing import Optional
 
 from pydantic import Field
@@ -9,7 +9,7 @@
 from common.utils.text_extract import ExtraParams
 from common.utils.splitter import SpacyTextSplitter
 from common.utils.utils import extract_urls_from_sitemap
-from datasources.handlers.datasource_type_interface import DataSourceEntryItem
+from datasources.handlers.datasource_type_interface import DataSourceEntryItem, DataSourceSyncConfiguration, DataSourceSyncType
 from datasources.handlers.datasource_type_interface import DataSourceSchema
 from datasources.handlers.datasource_type_interface import DataSourceProcessor
 from datasources.handlers.datasource_type_interface import WEAVIATE_SCHEMA
@@ -54,6 +54,26 @@ def name() -> str:
     @staticmethod
     def slug() -> str:
         return 'url'
+
+    @classmethod
+    def get_sync_configuration(cls) -> Optional[dict]:
+        return DataSourceSyncConfiguration(sync_type=DataSourceSyncType.FULL).dict()
+
+    def get_url_data(self, url: str) -> List[Document]:
+        if not url.startswith('https://') and not url.startswith('http://'):
+            url = f'https://{url}'
+
+        text = extract_text_from_url(
+            url, extra_params=ExtraParams(openai_key=self.openai_key),
+        )
+        docs = [
+            Document(
+                page_content_key=self.get_content_key(), page_content=t, metadata={
+                    'source': url,
+                },
+            ) for t in SpacyTextSplitter(chunk_size=1500, length_func=len).split_text(text)
+        ]
+        return docs
 
     def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:
         entry = URLSchema(**data)
@@ -83,22 +103,11 @@ def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:
 
         return list(map(lambda entry: DataSourceEntryItem(name=entry, data={'url': entry}), urls + sitemap_urls))
 
+
+
     def get_data_documents(self, data: DataSourceEntryItem) -> Optional[DataSourceEntryItem]:
         url = data.data['url']
-        if not url.startswith('https://') and not url.startswith('http://'):
-            url = f'https://{url}'
-
-        text = extract_text_from_url(
-            url, extra_params=ExtraParams(openai_key=self.openai_key),
-        )
-        docs = [
-            Document(
-                page_content_key=self.get_content_key(), page_content=t, metadata={
-                    'source': url,
-                },
-            ) for t in SpacyTextSplitter(chunk_size=1500, length_func=len).split_text(text)
-        ]
-        return docs
+        return self.get_url_data(url)
 
     def similarity_search(self, query: str, *args, **kwargs) -> List[dict]:
         return super().similarity_search(query, *args, **kwargs)