|
12 | 12 |
|
# Script body: (re)populate the "Oregon Housing Law" OpenAI vector store from
# the files under ./documents (relative to this script).
#
# If a store with that name already exists, every file attached to it is
# detached and deleted so the upload below starts from an empty store;
# otherwise a new store is created.
VECTOR_STORE_NAME = "Oregon Housing Law"

client = OpenAI(api_key=API_KEY)

# TODO: Would be nice to have a better way to check for the vector store than just the name.
vector_stores = client.vector_stores.list()
# Single pass over the listing: first store with a matching name, else None.
vector_store = next(
    (store for store in vector_stores if store.name == VECTOR_STORE_NAME),
    None,
)

if vector_store is not None:
    # Delete all files in the vector store.
    # Materialize the listing first so we are not deleting entries out from
    # under the paginated iterator we are still reading from.
    existing_files = list(
        client.vector_stores.files.list(vector_store_id=vector_store.id)
    )
    for existing_file in existing_files:
        print(
            f"Deleting file {existing_file.id} from vector store '{vector_store.name}'."
        )
        # Detach the file from the store, then delete the uploaded file itself.
        client.vector_stores.files.delete(
            vector_store_id=vector_store.id, file_id=existing_file.id
        )
        client.files.delete(file_id=existing_file.id)
else:
    print(f"Creating vector store '{VECTOR_STORE_NAME}'.")

    # Create a new vector store
    vector_store = client.vector_stores.create(name=VECTOR_STORE_NAME)

# Walk every directory under ./documents (next to this script). The first two
# path components below that root are used as the "state" and "city"
# attributes of the files found in that directory.
documents_path = Path(__file__).parent / "documents"
for dirpath, _dirnames, filenames in os.walk(documents_path):
    if not filenames:
        continue

    # Path components relative to the documents root, padded with None so
    # that indexes 0 and 1 always exist (the root itself has no components).
    subdirs = list(Path(dirpath).relative_to(documents_path).parts) + [None] * 2

    # OpenAI doesn't allow querying by empty attributes, so a missing city is
    # stored as the string "null". "state" is only set when present.
    attributes = {"city": subdirs[1] if subdirs[1] else "null"}
    if subdirs[0]:
        attributes["state"] = subdirs[0]

    file_ids = []
    for filename in filenames:
        file_path = Path(dirpath) / filename

        # Ensure the file is UTF-8 encoded; OpenAI rejects the file if not.
        # read_text raises UnicodeDecodeError on invalid UTF-8, so a bad file
        # stops the script here instead of failing at upload time.
        file_path.write_text(
            file_path.read_text(encoding="utf-8"), encoding="utf-8"
        )

        print(f"Uploading {file_path} to vector store '{vector_store.name}'.")
        # Context manager so the handle is closed once the upload returns.
        with open(file_path, "rb") as file_stream:
            uploaded_file = client.files.create(
                file=file_stream,
                purpose="assistants",
            )
        file_ids.append(uploaded_file.id)

    # Add this directory's files to the vector store in one batch, tagged
    # with the attributes derived from the first two subdirectories.
    client.vector_stores.file_batches.create(
        vector_store_id=vector_store.id,
        file_ids=file_ids,
        attributes=attributes,
    )

print(f"Uploaded files to vector store '{vector_store.name}'.")
print(
    f"Add the following to your .env file to use this vector store:\n"
    f"VECTOR_STORE_ID={vector_store.id}\n"
)
0 commit comments