Skip to content

Commit fea369f

Browse files
authored
SingleStore v2 API source connector (#310)
1 parent a574865 commit fea369f

File tree

10 files changed

+203
-56
lines changed

10 files changed

+203
-56
lines changed

api-reference/ingest/destination-connector/singlestore.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ import NewDocument from '/snippets/general-shared-text/new-document.mdx';
77
<NewDocument />
88

99
import SharedSingleStore from '/snippets/dc-shared-text/singlestore-cli-api.mdx';
10+
import SharedSingleStoreSchema from '/snippets/general-shared-text/singlestore-schema.mdx';
1011
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
1112

1213
<SharedSingleStore />
14+
<SharedSingleStoreSchema />
1315
<SharedAPIKeyURL/>
1416

1517
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector:
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
---
2+
title: SingleStore
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedContentSingleStore from '/snippets/sc-shared-text/singlestore-cli-api.mdx';
10+
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
11+
12+
<SharedContentSingleStore/>
13+
<SharedAPIKeyURL/>
14+
15+
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector:
16+
17+
import SingleStoreAPISh from '/snippets/source_connectors/singlestore.sh.mdx';
18+
import SingleStoreAPIPyV2 from '/snippets/source_connectors/singlestore.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<SingleStoreAPISh />
22+
<SingleStoreAPIPyV2 />
23+
</CodeGroup>

mint.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@
174174
"open-source/ingest/source-connectors/salesforce",
175175
"open-source/ingest/source-connectors/sftp",
176176
"open-source/ingest/source-connectors/sharepoint",
177+
"open-source/ingest/source-connectors/singlestore",
177178
"open-source/ingest/source-connectors/slack",
178179
"open-source/ingest/source-connectors/snowflake",
179180
"open-source/ingest/source-connectors/sqlite",
@@ -330,6 +331,7 @@
330331
"api-reference/ingest/source-connectors/slack",
331332
"api-reference/ingest/source-connectors/sftp",
332333
"api-reference/ingest/source-connectors/sharepoint",
334+
"api-reference/ingest/source-connectors/singlestore",
333335
"api-reference/ingest/source-connectors/slack",
334336
"api-reference/ingest/source-connectors/snowflake",
335337
"api-reference/ingest/source-connectors/sqlite",

open-source/ingest/destination-connectors/singlestore.mdx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ import NewDocument from '/snippets/general-shared-text/new-document.mdx';
77
<NewDocument />
88

99
import SharedSingleStore from '/snippets/dc-shared-text/singlestore-cli-api.mdx';
10+
import SharedSingleStoreSchema from '/snippets/general-shared-text/singlestore-schema.mdx';
1011

1112
<SharedSingleStore />
13+
<SharedSingleStoreSchema />
1214

1315
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector.
1416

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
title: SingleStore
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedContentSingleStore from '/snippets/sc-shared-text/singlestore-cli-api.mdx';
10+
11+
<SharedContentSingleStore/>
12+
13+
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector.
14+
15+
This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page.
16+
17+
import SingleStoreSh from '/snippets/source_connectors/singlestore.sh.mdx';
18+
import SingleStorePyV2 from '/snippets/source_connectors/singlestore.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<SingleStoreSh />
22+
<SingleStorePyV2 />
23+
</CodeGroup>
24+
25+
import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
26+
27+
<SharedPartitionByAPIOSS/>
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
The table's schema must match the schema of the documents that Unstructured produces.
2+
3+
During insertion, JSON objects are flattened, and underscores are placed between nested object names.
4+
For example, the following JSON object matches a table column named `data_source_date_created`:
5+
6+
```json
7+
{
8+
"...": "...",
9+
"data_source": {
10+
"date_created": "1719963233.949"
11+
},
12+
"...": "..."
13+
}
14+
```
15+
16+
The `metadata` object itself is also flattened in a similar way. However, for fields nested in `metadata`,
17+
the column in the table does not start with `metadata_`. For example, the following JSON object matches a
18+
table column named `last_modified`:
19+
20+
```json
21+
{
22+
"...": "...",
23+
"metadata": {
24+
"...": "...",
25+
"last_modified": "2022-12-16T17:04:16-05:00",
26+
"...": "..."
27+
},
28+
"...": "..."
29+
}
30+
```
31+
32+
Unstructured cannot provide a table schema that is guaranteed to work in all
33+
circumstances. This is because these schemas will vary based on your source files' types; how you
34+
want Unstructured to partition, chunk, and generate embeddings; any custom post-processing code that you run; and other factors.
35+
36+
You can adapt the following table schema example for your own needs:
37+
38+
```sql
39+
CREATE TABLE elements (
40+
id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
41+
element_id TEXT,
42+
text TEXT,
43+
embeddings Vector(384),
44+
parent_id TEXT,
45+
page_number TEXT,
46+
is_continuation BOOLEAN,
47+
orig_elements TEXT
48+
);
49+
```
50+
51+
See also:
52+
53+
- [CREATE TABLE](https://docs.singlestore.com/cloud/reference/sql-reference/data-definition-language-ddl/create-table/)
54+
in the SingleStore documentation
55+
- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)

snippets/general-shared-text/singlestore.mdx

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,62 +8,6 @@ The SingleStore prerequisites:
88
- The name of the database in the deployment.
99
- The name of the table in the database.
1010

11-
The table's schema must match the schema of the documents that Unstructured produces.
12-
13-
During insertion, JSON objects are flattened, and underscores are placed between nested object names.
14-
For example, the following JSON object matches a column in the table named `data_source_date_created`:
15-
16-
```json
17-
{
18-
"...": "...",
19-
"data_source": {
20-
"date_created": "1719963233.949"
21-
},
22-
"...": "..."
23-
}
24-
```
25-
26-
The `metadata` object itself is also flattened in a similar way. However, for nested objects in `metadata`,
27-
the column in the table does not start with `metadata_`. For example, the following JSON object matches a
28-
column in the table named `last_modified`:
29-
30-
```json
31-
{
32-
"...": "...",
33-
"metadata": {
34-
"...": "...",
35-
"last_modified": "2022-12-16T17:04:16-05:00",
36-
"...": "..."
37-
},
38-
"...": "..."
39-
}
40-
```
41-
42-
Unstructured cannot provide a table schema that is guaranteed to work in all
43-
circumstances. This is because these schemas will vary based on your source files' types; how you
44-
want Unstructured to partition, chunk, and generate embeddings; any custom post-processing code that you run; and other factors.
45-
46-
You can adapt the following table schema example for your own needs:
47-
48-
```sql
49-
CREATE TABLE elements (
50-
id INT PRIMARY KEY NOT NULL AUTO_INCREMENT,
51-
element_id TEXT,
52-
text TEXT,
53-
embeddings Vector(384),
54-
parent_id TEXT,
55-
page_number TEXT,
56-
is_continuation BOOLEAN,
57-
orig_elements TEXT
58-
);
59-
```
60-
61-
See also:
62-
63-
- [CREATE TABLE](https://docs.singlestore.com/cloud/reference/sql-reference/data-definition-language-ddl/create-table/)
64-
in the SingleStore documentation
65-
- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)
66-
6711
To get the values for the hostname, port, username, and password:
6812

6913
1. In your SingleStore account's dashboard sidebar, click **Deployments**.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Connect SingleStore to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your documents and store structured outputs locally on your filesystem.
2+
3+
You will need:
4+
5+
import SharedSingleStore from '/snippets/general-shared-text/singlestore.mdx';
6+
import SharedSingleStoreCLIAPI from '/snippets/general-shared-text/singlestore-cli-api.mdx';
7+
8+
<SharedSingleStore />
9+
<SharedSingleStoreCLIAPI />
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
```bash CLI
2+
#!/usr/bin/env bash
3+
4+
# Chunking and embedding are optional.
5+
6+
unstructured-ingest \
7+
singlestore \
8+
--host $SINGLESTORE_HOST \
9+
--port $SINGLESTORE_PORT \
10+
--user $SINGLESTORE_USER \
11+
--password $SINGLESTORE_PASSWORD \
12+
--database $SINGLESTORE_DB \
13+
--table-name $SINGLESTORE_TABLE \
14+
--id-column id \
15+
--batch-size 100 \
16+
--output-dir $LOCAL_FILE_OUTPUT_DIR \
17+
--partition-by-api \
18+
--api-key $UNSTRUCTURED_API_KEY \
19+
--partition-endpoint $UNSTRUCTURED_API_URL \
20+
--strategy fast \
21+
--chunking-strategy by_title \
22+
--embedding-provider huggingface
23+
```
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
```python Python Ingest v2
2+
import os
3+
4+
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
5+
from unstructured_ingest.v2.interfaces import ProcessorConfig
6+
7+
from unstructured_ingest.v2.processes.connectors.sql.singlestore import (
8+
SingleStoreIndexerConfig,
9+
SingleStoreDownloaderConfig,
10+
SingleStoreConnectionConfig,
11+
SingleStoreAccessConfig
12+
)
13+
14+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
15+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
16+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
17+
18+
from unstructured_ingest.v2.processes.connectors.local import (
19+
LocalUploaderConfig,
20+
LocalConnectionConfig
21+
)
22+
23+
# Chunking and embedding are optional.
24+
25+
if __name__ == "__main__":
26+
Pipeline.from_configs(
27+
context=ProcessorConfig(),
28+
indexer_config=SingleStoreIndexerConfig(
29+
table_name=os.getenv("SINGLESTORE_TABLE"),
30+
id_column="id",
31+
batch_size=100
32+
),
33+
downloader_config=SingleStoreDownloaderConfig(
34+
download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")
35+
),
36+
source_connection_config=SingleStoreConnectionConfig(
37+
access_config=SingleStoreAccessConfig(
38+
password=os.getenv("SINGLESTORE_PASSWORD")
39+
),
40+
host=os.getenv("SINGLESTORE_HOST"),
41+
port=os.getenv("SINGLESTORE_PORT"),
42+
user=os.getenv("SINGLESTORE_USER"),
43+
database=os.getenv("SINGLESTORE_DB")
44+
),
45+
partitioner_config=PartitionerConfig(
46+
partition_by_api=True,
47+
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
48+
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
49+
additional_partition_args={
50+
"split_pdf_page": True,
51+
"split_pdf_allow_failed": True,
52+
"split_pdf_concurrency_level": 15
53+
}
54+
),
55+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
56+
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
57+
destination_connection_config=LocalConnectionConfig(),
58+
uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"))
59+
).run()
60+
```

0 commit comments

Comments
 (0)