@@ -40,7 +40,7 @@ import os
```

``` python
- _ = load_dotenv(find_dotenv())
+ _ = load_dotenv(find_dotenv(), override=True)
service_url = os.environ['TIMESCALE_SERVICE_URL']
```

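The service URL is the Postgres connection string for your Timescale service. If it isn't already in a `.env` file, you can set it directly before reading it back; a minimal sketch with placeholder credentials (the URL shape below is an example, not a real value):

``` python
import os

# Placeholder only -- substitute the host, port, and password of your own service.
os.environ["TIMESCALE_SERVICE_URL"] = "postgres://tsdbadmin:<PASSWORD>@<HOST>:<PORT>/tsdb?sslmode=require"
service_url = os.environ["TIMESCALE_SERVICE_URL"]
```
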
@@ -97,24 +97,24 @@ Now you can query for similar items:
await vec.search([1.0, 9.0])
```

- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
- <Record id=UUID('2cdb8cbd-5dd7-4555-926a-5efafb4b1cf0') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]
+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
+ <Record id=UUID('06153343-9085-4844-ad7a-b5cbed912053') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]

You can specify the number of records to return.

``` python
await vec.search([1.0, 9.0], limit=1)
```

- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]
+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]

You can also specify a filter on the metadata as a simple dictionary

``` python
await vec.search([1.0, 9.0], limit=1, filter={"action": "jump"})
```

- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]
+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>]

You can also specify a list of filter dictionaries, where an item is
returned if it matches any dict
@@ -123,8 +123,8 @@ returned if it matches any dict
await vec.search([1.0, 9.0], limit=2, filter=[{"action": "jump"}, {"animal": "fox"}])
```

- [<Record id=UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
- <Record id=UUID('2cdb8cbd-5dd7-4555-926a-5efafb4b1cf0') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]
+ [<Record id=UUID('d10dc66f-92d5-4296-a702-1690860bbe55') metadata={'action': 'jump', 'animal': 'fox'} contents='jumped over the' embedding=array([ 1. , 10.8], dtype=float32) distance=0.00016793422934946456>,
+ <Record id=UUID('06153343-9085-4844-ad7a-b5cbed912053') metadata={'animal': 'fox'} contents='the brown fox' embedding=array([1. , 1.3], dtype=float32) distance=0.14489260377438218>]

You can access the fields as follows

@@ -133,7 +133,7 @@ records = await vec.search([1.0, 9.0], limit=1, filter={"action": "jump"})
records[0][client.SEARCH_RESULT_ID_IDX]
```

- UUID('e5dbaa7c-081b-4131-be18-c81ce47fc864')
+ UUID('d10dc66f-92d5-4296-a702-1690860bbe55')

``` python
records[0][client.SEARCH_RESULT_METADATA_IDX]
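# Aside (not in the original example): the rows returned by search() are asyncpg
# Record objects, so the fields shown in the reprs above can also be read by
# column name, for example records[0]['contents'] or records[0]['distance'].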
@@ -293,6 +293,134 @@ search call:
rec = await vec.search([1.0, 2.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime - timedelta(days=7), specific_datetime + timedelta(days=7)))
```

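As an aside, this time-based filtering works because the record ids are UUID v1 values whose embedded timestamp matches the document time; `client.uuid_from_time` (used again in the PgVectorize example below) creates such an id for a given datetime. A minimal sketch:

``` python
from datetime import datetime
from timescale_vector import client

# Build an id whose embedded timestamp is the document's time, so that
# time-partitioned search can prune partitions using the id alone.
doc_id = client.uuid_from_time(datetime(2021, 1, 1))
```
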
+ # PgVectorize
+
+ PgVectorize enables you to create vector embeddings from any data that
+ you already have stored in Postgres. Simply attach PgVectorize to any
+ Postgres table, and it will automatically sync that table’s data with a
+ set of embeddings stored in Timescale Vector. For example, let’s say you
+ have a blog table defined in the following way:
+
+ ``` python
+ import psycopg2
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import CharacterTextSplitter
+ from timescale_vector import client, pgvectorizer
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores.timescalevector import TimescaleVector
+ from datetime import timedelta
+ ```
+
+ ``` python
+ with psycopg2.connect(service_url) as conn:
+     with conn.cursor() as cursor:
+         cursor.execute('''
+             CREATE TABLE IF NOT EXISTS blog (
+                 id SERIAL PRIMARY KEY NOT NULL,
+                 title TEXT NOT NULL,
+                 author TEXT NOT NULL,
+                 contents TEXT NOT NULL,
+                 category TEXT NOT NULL,
+                 published_time TIMESTAMPTZ NULL -- NULL if not yet published
+             );
+         ''')
+ ```
+
+ You can insert some data as follows:
+
+ ``` python
+ with psycopg2.connect(service_url) as conn:
+     with conn.cursor() as cursor:
+         cursor.execute('''
+             INSERT INTO blog (title, author, contents, category, published_time) VALUES ('First Post', 'Matvey Arye', 'some super interesting content about cats.', 'AI', '2021-01-01');
+         ''')
+ ```
+
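+ Rows whose `published_time` is still NULL are skipped by the sync
+ function defined below; a sketch with made-up draft data:
+
+ ``` python
+ with psycopg2.connect(service_url) as conn:
+     with conn.cursor() as cursor:
+         # A draft post: published_time stays NULL, so it will not be embedded yet.
+         cursor.execute('''
+             INSERT INTO blog (title, author, contents, category)
+             VALUES ('Draft Post', 'Matvey Arye', 'an unpublished draft about dogs.', 'AI');
+         ''')
+ ```
+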
+ Now, say you want to embed these blogs in Timescale Vector. First you
+ need to define an `embed_and_write` function that takes a set of blog
+ posts, creates the embeddings, and writes them into TimescaleVector. For
+ example, if using LangChain, it could look something like the following.
+
+ ``` python
+ def get_document(blog):
+     text_splitter = CharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+     )
+     docs = []
+     for chunk in text_splitter.split_text(blog['contents']):
+         content = f"Author {blog['author']}, title: {blog['title']}, contents: {chunk}"
+         metadata = {
+             "id": str(client.uuid_from_time(blog['published_time'])),
+             "blog_id": blog['id'],
+             "author": blog['author'],
+             "category": blog['category'],
+             "published_time": blog['published_time'].isoformat(),
+         }
+         docs.append(Document(page_content=content, metadata=metadata))
+     return docs
+
+ def embed_and_write(blog_instances, vectorizer):
+     embedding = OpenAIEmbeddings()
+     vector_store = TimescaleVector(
+         collection_name="blog_embedding",
+         service_url=service_url,
+         embedding=embedding,
+         time_partition_interval=timedelta(days=30),
+     )
+
+     # Delete old embeddings for all ids in the work queue. locked_id is a special column that is
+     # set to the primary key of the table being embedded. For items that are deleted, it is the
+     # only key that is set.
+     metadata_for_delete = [{"blog_id": blog['locked_id']} for blog in blog_instances]
+     vector_store.delete_by_metadata(metadata_for_delete)
+
+     documents = []
+     for blog in blog_instances:
+         # Skip blogs that are not published yet, or are deleted (in which case published_time is NULL).
+         if blog['published_time'] is not None:
+             documents.extend(get_document(blog))
+
+     if len(documents) == 0:
+         return
+
+     texts = [d.page_content for d in documents]
+     metadatas = [d.metadata for d in documents]
+     ids = [d.metadata["id"] for d in documents]
+     vector_store.add_texts(texts, metadatas, ids)
+ ```
+
+ Then, all you have to do is run the following code in a scheduled job
+ (cron job, lambda job, etc.):
+
+ ``` python
+ vectorizer = pgvectorizer.Vectorize(service_url, 'blog')
+ while vectorizer.process(embed_and_write) > 0:
+     pass
+ ```
+
+ Every time that job runs it will sync the table with your embeddings. It
+ will sync all inserts, updates, and deletes to an embeddings table called
+ `blog_embedding`.
+
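+ For example, after an update or a delete on `blog`, re-running the same
+ loop refreshes the affected embeddings (a sketch; the SQL values here are
+ illustrative, not part of the original example):
+
+ ``` python
+ with psycopg2.connect(service_url) as conn:
+     with conn.cursor() as cursor:
+         # Change the source row; it will be picked up on the next sync run.
+         cursor.execute("UPDATE blog SET contents = 'even more interesting content about cats.' WHERE title = 'First Post';")
+
+ # Re-running the sync loop deletes the stale chunks and writes fresh ones.
+ while vectorizer.process(embed_and_write) > 0:
+     pass
+ ```
+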
+ Now, you can simply search the embeddings as follows (again, using
+ LangChain in the example):
+
+ ``` python
+ embedding = OpenAIEmbeddings()
+ vector_store = TimescaleVector(
+     collection_name="blog_embedding",
+     service_url=service_url,
+     embedding=embedding,
+     time_partition_interval=timedelta(days=30),
+ )
+
+ res = vector_store.similarity_search_with_score("Blogs about cats")
+ res
+ ```
+
+ [(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-9140-78a539e57b40', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}),
+ 0.12605134378941762)]
+
## Development

This project is developed with [nbdev](https://nbdev.fast.ai/). Please