|
33 | 33 | }, |
34 | 34 | { |
35 | 35 | "cell_type": "code", |
36 | | - "execution_count": 2, |
| 36 | + "execution_count": null, |
37 | 37 | "metadata": { |
38 | 38 | "colab": { |
39 | 39 | "base_uri": "https://localhost:8080/" |
40 | 40 | }, |
41 | 41 | "id": "UQezgPCG1vml", |
42 | 42 | "outputId": "97b9bc03-da1b-439a-c37b-be6fdb58ab21" |
43 | 43 | }, |
44 | | - "outputs": [ |
45 | | - { |
46 | | - "name": "stdout", |
47 | | - "output_type": "stream", |
48 | | - "text": [ |
49 | | - "Cloning into 'temp_repo'...\n", |
50 | | - "remote: Enumerating objects: 138, done.\u001b[K\n", |
51 | | - "remote: Counting objects: 100% (138/138), done.\u001b[K\n", |
52 | | - "remote: Compressing objects: 100% (98/98), done.\u001b[K\n", |
53 | | - "remote: Total 138 (delta 68), reused 91 (delta 35), pack-reused 0\u001b[K\n", |
54 | | - "Receiving objects: 100% (138/138), 7.19 MiB | 4.45 MiB/s, done.\n", |
55 | | - "Resolving deltas: 100% (68/68), done.\n", |
56 | | - "mv: rename temp_repo/resources to ./resources: Directory not empty\n" |
57 | | - ] |
58 | | - } |
59 | | - ], |
| 44 | + "outputs": [], |
60 | 45 | "source": [ |
61 | 46 | "# NBVAL_SKIP\n", |
62 | 47 | "!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n", |
|
200 | 185 | "name": "stdout", |
201 | 186 | "output_type": "stream", |
202 | 187 | "text": [ |
203 | | - "Sample doc Doc ID: 67e07154-6ea0-4822-8957-ac1d212fc9ee\n", |
| 188 | + "Sample doc Doc ID: c013353e-dae7-4d17-befd-9e784c8acf79\n", |
204 | 189 | "Text: UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington,\n", |
205 | 190 | "D.C. 20549 FORM 10-K (Mark One) ☒ ANNUAL REPORT PURSUANT T O SECTION\n", |
206 | 191 | "13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year\n", |
|
245 | 230 | }, |
246 | 231 | { |
247 | 232 | "cell_type": "code", |
248 | | - "execution_count": 10, |
| 233 | + "execution_count": 4, |
249 | 234 | "metadata": {}, |
250 | 235 | "outputs": [], |
251 | 236 | "source": [ |
252 | 237 | "from llama_index.core import StorageContext\n", |
253 | 238 | "\n", |
254 | | - "vector_store = RedisVectorStore(redis_url=REDIS_URL, index_name=\"llama\", overwrite=True)\n", |
| 239 | + "vector_store = RedisVectorStore(redis_url=REDIS_URL, overwrite=True)\n", |
255 | 240 | "\n", |
256 | 241 | "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", |
257 | 242 | "\n", |
|
267 | 252 | }, |
268 | 253 | { |
269 | 254 | "cell_type": "code", |
270 | | - "execution_count": 11, |
| 255 | + "execution_count": 5, |
271 | 256 | "metadata": {}, |
272 | 257 | "outputs": [], |
273 | 258 | "source": [ |
|
285 | 270 | }, |
286 | 271 | { |
287 | 272 | "cell_type": "code", |
288 | | - "execution_count": 12, |
| 273 | + "execution_count": 6, |
289 | 274 | "metadata": {}, |
290 | 275 | "outputs": [ |
291 | 276 | { |
292 | 277 | "name": "stdout", |
293 | 278 | "output_type": "stream", |
294 | 279 | "text": [ |
295 | | - "Node ID: b561dd17-5545-4d3a-bc4f-18cb39c7c01e\n", |
| 280 | + "Node ID: d2e6cd9c-0716-49d8-8563-407a00d05445\n", |
296 | 281 | "Text: Table of Contents FISCAL 2023 NIKE BRAND REVENUE HIGHLIGHTS The\n", |
297 | 282 | "following tables present NIKE Brand revenues disaggregated by\n", |
298 | 283 | "reportable operating segment, distribution channel and major product\n", |
|
301 | 286 | "fiscal 2022 on...\n", |
302 | 287 | "Score: 0.900\n", |
303 | 288 | "\n", |
304 | | - "Node ID: 0415f059-9258-426b-8b21-34b287b3c21b\n", |
| 289 | + "Node ID: 28542d3b-b345-4e9e-b675-f62361ec85d9\n", |
305 | 290 | "Text: Table of Contents NORTH AMERICA (Dollars in millions) FISCAL\n", |
306 | 291 | "2023FISCAL 2022 % CHANGE% CHANGE EXCLUDING CURRENCY CHANGESFISCAL 2021\n", |
307 | 292 | "% CHANGE% CHANGE EXCLUDING CURRENCY CHANGES Revenues by: Footwear $\n", |
308 | 293 | "14,897 $ 12,228 22 % 22 %$ 11,644 5 % 5 % Apparel 5,947 5,492 8 % 9 %\n", |
309 | 294 | "5,028 9 % 9 % Equipment 764 633 21 % 21 % 507 25 % 25 % TOTAL REVENUES\n", |
310 | 295 | "$ 21,6...\n", |
311 | | - "Score: 0.886\n", |
| 296 | + "Score: 0.885\n", |
312 | 297 | "\n" |
313 | 298 | ] |
314 | 299 | } |
|
329 | 314 | }, |
330 | 315 | { |
331 | 316 | "cell_type": "code", |
332 | | - "execution_count": 13, |
| 317 | + "execution_count": 7, |
333 | 318 | "metadata": {}, |
334 | 319 | "outputs": [ |
335 | 320 | { |
|
338 | 323 | "\"NIKE's revenue in fiscal 23 was $51.2 billion.\"" |
339 | 324 | ] |
340 | 325 | }, |
341 | | - "execution_count": 13, |
| 326 | + "execution_count": 7, |
342 | 327 | "metadata": {}, |
343 | 328 | "output_type": "execute_result" |
344 | 329 | } |
|
348 | 333 | "response.response" |
349 | 334 | ] |
350 | 335 | }, |
| 336 | + { |
| 337 | + "cell_type": "markdown", |
| 338 | + "metadata": {}, |
| 339 | + "source": [ |
| 340 | + "### Use a custom index schema\n", |
| 341 | + "\n", |
| 342 | + "In most use cases, you need the ability to customize the underlying index configuration\n", |
| 343 | + "and specification. For example, this is handy in order to define specific metadata filters you wish to enable.\n", |
| 344 | + "\n", |
| 345 | + "With Redis, this is as simple as defining an index schema object\n", |
| 346 | + "(from file or dict) and passing it through to the vector store client wrapper." |
| 347 | + ] |
| 348 | + }, |
| 349 | + { |
| 350 | + "cell_type": "code", |
| 351 | + "execution_count": 8, |
| 352 | + "metadata": {}, |
| 353 | + "outputs": [], |
| 354 | + "source": [ |
| 355 | + "from redisvl.schema import IndexSchema\n", |
| 356 | + "\n", |
| 357 | + "\n", |
| 358 | + "custom_schema = IndexSchema.from_dict(\n", |
| 359 | + " {\n", |
| 360 | + " # customize basic index specs\n", |
| 361 | + " \"index\": {\n", |
| 362 | + " \"name\": \"custom_index\",\n", |
| 363 | + " \"prefix\": \"docs\",\n", |
| 364 | + " \"key_separator\": \":\",\n", |
| 365 | + " },\n", |
| 366 | + " # customize fields that are indexed\n", |
| 367 | + " \"fields\": [\n", |
| 368 | + " # required fields for llamaindex\n", |
| 369 | + " {\"type\": \"tag\", \"name\": \"id\"},\n", |
| 370 | + " {\"type\": \"tag\", \"name\": \"doc_id\"},\n", |
| 371 | + " {\"type\": \"text\", \"name\": \"text\"},\n", |
| 372 | + " # custom metadata fields\n", |
| 373 | + " {\"type\": \"numeric\", \"name\": \"updated_at\"},\n", |
| 374 | + " {\"type\": \"tag\", \"name\": \"file_name\"},\n", |
| 375 | + " # custom vector field definition (dims must match the embedding model's output size)\n", |
| 376 | + " {\n", |
| 377 | + " \"type\": \"vector\",\n", |
| 378 | + " \"name\": \"vector\",\n", |
| 379 | + " \"attrs\": {\n", |
| 380 | + " \"dims\": 1536,\n", |
| 381 | + " \"algorithm\": \"hnsw\",\n", |
| 382 | + " \"distance_metric\": \"cosine\",\n", |
| 383 | + " },\n", |
| 384 | + " },\n", |
| 385 | + " ],\n", |
| 386 | + " }\n", |
| 387 | + ")" |
| 388 | + ] |
| 389 | + }, |
| 390 | + { |
| 391 | + "cell_type": "code", |
| 392 | + "execution_count": 9, |
| 393 | + "metadata": {}, |
| 394 | + "outputs": [ |
| 395 | + { |
| 396 | + "data": { |
| 397 | + "text/plain": [ |
| 398 | + "IndexInfo(name='custom_index', prefix='docs', key_separator=':', storage_type=<StorageType.HASH: 'hash'>)" |
| 399 | + ] |
| 400 | + }, |
| 401 | + "execution_count": 9, |
| 402 | + "metadata": {}, |
| 403 | + "output_type": "execute_result" |
| 404 | + } |
| 405 | + ], |
| 406 | + "source": [ |
| 407 | + "custom_schema.index" |
| 408 | + ] |
| 409 | + }, |
| 410 | + { |
| 411 | + "cell_type": "code", |
| 412 | + "execution_count": 10, |
| 413 | + "metadata": {}, |
| 414 | + "outputs": [ |
| 415 | + { |
| 416 | + "data": { |
| 417 | + "text/plain": [ |
| 418 | + "{'id': TagField(name='id', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n", |
| 419 | + " 'doc_id': TagField(name='doc_id', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n", |
| 420 | + " 'text': TextField(name='text', type='text', path=None, attrs=TextFieldAttributes(sortable=False, weight=1, no_stem=False, withsuffixtrie=False, phonetic_matcher=None)),\n", |
| 421 | + " 'updated_at': NumericField(name='updated_at', type='numeric', path=None, attrs=NumericFieldAttributes(sortable=False)),\n", |
| 422 | + " 'file_name': TagField(name='file_name', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n", |
| 423 | + " 'vector': HNSWVectorField(name='vector', type='vector', path=None, attrs=HNSWVectorFieldAttributes(dims=1536, algorithm=<VectorIndexAlgorithm.HNSW: 'HNSW'>, datatype=<VectorDataType.FLOAT32: 'FLOAT32'>, distance_metric=<VectorDistanceMetric.COSINE: 'COSINE'>, initial_cap=None, m=16, ef_construction=200, ef_runtime=10, epsilon=0.01))}" |
| 424 | + ] |
| 425 | + }, |
| 426 | + "execution_count": 10, |
| 427 | + "metadata": {}, |
| 428 | + "output_type": "execute_result" |
| 429 | + } |
| 430 | + ], |
| 431 | + "source": [ |
| 432 | + "custom_schema.fields" |
| 433 | + ] |
| 434 | + }, |
| 435 | + { |
| 436 | + "cell_type": "code", |
| 437 | + "execution_count": 11, |
| 438 | + "metadata": {}, |
| 439 | + "outputs": [], |
| 440 | + "source": [ |
| 441 | + "# from datetime import datetime\n", |
| 442 | + "\n", |
| 443 | + "\n", |
| 444 | + "# def date_to_timestamp(date_string: str) -> int:\n", |
| 445 | + "# date_format: str = \"%Y-%m-%d\"\n", |
| 446 | + "# return int(datetime.strptime(date_string, date_format).timestamp())\n", |
| 447 | + "\n", |
| 448 | + "\n", |
| 449 | + "# # iterate through documents and add new field\n", |
| 450 | + "# for document in docs:\n", |
| 451 | + "# document.metadata[\"updated_at\"] = date_to_timestamp(\n", |
| 452 | + "# document.metadata[\"last_modified_date\"]\n", |
| 453 | + "# )" |
| 454 | + ] |
| 455 | + }, |
| 456 | + { |
| 457 | + "cell_type": "code", |
| 458 | + "execution_count": 12, |
| 459 | + "metadata": {}, |
| 460 | + "outputs": [], |
| 461 | + "source": [ |
| 462 | + "vector_store = RedisVectorStore(\n", |
| 463 | + " schema=custom_schema, # provide customized schema\n", |
| 464 | + " redis_url=REDIS_URL,\n", |
| 465 | + " overwrite=True,\n", |
| 466 | + ")\n", |
| 467 | + "\n", |
| 468 | + "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", |
| 469 | + "\n", |
| 470 | + "# build and load index from documents and storage context\n", |
| 471 | + "index = VectorStoreIndex.from_documents(\n", |
| 472 | + " docs, storage_context=storage_context\n", |
| 473 | + ")" |
| 474 | + ] |
| 475 | + }, |
| 476 | + { |
| 477 | + "cell_type": "markdown", |
| 478 | + "metadata": {}, |
| 479 | + "source": [ |
| 480 | + "### Query the vector store and filter on metadata\n", |
| 481 | + "Now that we have additional metadata indexed in Redis, let's try some queries that add filters. As an example, we'll search for chunks containing the word \"audit\" within a single file, \"amzn-10k-2023.pdf\"." |
| 482 | + ] |
| 483 | + }, |
| 484 | + { |
| 485 | + "cell_type": "code", |
| 486 | + "execution_count": 13, |
| 487 | + "metadata": {}, |
| 488 | + "outputs": [], |
| 489 | + "source": [ |
| 490 | + "from llama_index.core.vector_stores import (\n", |
| 491 | + " MetadataFilters,\n", |
| 492 | + " MetadataFilter,\n", |
| 493 | + " ExactMatchFilter,\n", |
| 494 | + ")\n", |
| 495 | + "\n", |
| 496 | + "retriever = index.as_retriever(\n", |
| 497 | + " similarity_top_k=3,\n", |
| 498 | + " filters=MetadataFilters(\n", |
| 499 | + " filters=[\n", |
| 500 | + " ExactMatchFilter(key=\"file_name\", value=\"amzn-10k-2023.pdf\"),\n", |
| 501 | + " MetadataFilter(\n", |
| 502 | + " key=\"text\",\n", |
| 503 | + " value=\"audit\",\n", |
| 504 | + " operator=\"text_match\",\n", |
| 505 | + " ),\n", |
| 506 | + " ],\n", |
| 507 | + " condition=\"and\",\n", |
| 508 | + " ),\n", |
| 509 | + ")" |
| 510 | + ] |
| 511 | + }, |
| 512 | + { |
| 513 | + "cell_type": "code", |
| 514 | + "execution_count": 14, |
| 515 | + "metadata": {}, |
| 516 | + "outputs": [ |
| 517 | + { |
| 518 | + "name": "stdout", |
| 519 | + "output_type": "stream", |
| 520 | + "text": [ |
| 521 | + "Node ID: cd0c5d8f-e3b1-4cbb-aa6a-5960003cdb2d\n", |
| 522 | + "Text: Table of Contents valuation. In the ordinary course of our\n", |
| 523 | + "business, there are many transactions and calculations for which the\n", |
| 524 | + "ultimate tax determination is uncertain. Significant judgment is\n", |
| 525 | + "required in evaluating and estimating our tax expense, assets, and\n", |
| 526 | + "liabilities. We are also subject to tax controversies in various\n", |
| 527 | + "jurisdictions that can...\n", |
| 528 | + "Score: 0.746\n", |
| 529 | + "\n", |
| 530 | + "Node ID: 6745f668-4c7a-43bf-a9c3-9b04e1a497f8\n", |
| 531 | + "Text: Table of Contents Included in other income (expense), net in\n", |
| 532 | + "2021 and 2022 is a marketable equity securities valuation gain (loss)\n", |
| 533 | + "of $11.8 billion and $(12.7) billion from our equity investment in\n", |
| 534 | + "Rivian Automotive, Inc. (“Rivian”). Our investment in Rivian’s\n", |
| 535 | + "preferred stock was accounted for at cost, with adjustments for\n", |
| 536 | + "observable changes in ...\n", |
| 537 | + "Score: 0.740\n", |
| 538 | + "\n", |
| 539 | + "Node ID: 717666fe-fea5-488b-999c-84e6d8b9a0db\n", |
| 540 | + "Text: Exhibit 31.1 CERTIFICATIONS I, Andrew R. Jassy, certify that: 1.\n", |
| 541 | + "I have reviewed this Form 10-K of Amazon.com, Inc.; 2. Based on my\n", |
| 542 | + "knowledge, this report does not contain any untrue statement of a\n", |
| 543 | + "material fact or omit to state a material fact necessary to make the\n", |
| 544 | + "statements made, in light of the circumstances under which such\n", |
| 545 | + "statements were ...\n", |
| 546 | + "Score: 0.732\n", |
| 547 | + "\n" |
| 548 | + ] |
| 549 | + } |
| 550 | + ], |
| 551 | + "source": [ |
| 552 | + "result_nodes = retriever.retrieve(\"What did the author learn?\")\n", |
| 553 | + "\n", |
| 554 | + "for node in result_nodes:\n", |
| 555 | + " print(node)" |
| 556 | + ] |
| 557 | + }, |
351 | 558 | { |
352 | 559 | "cell_type": "code", |
353 | 560 | "execution_count": null, |
|
376 | 583 | "name": "python", |
377 | 584 | "nbconvert_exporter": "python", |
378 | 585 | "pygments_lexer": "ipython3", |
379 | | - "version": "3.9.12" |
| 586 | + "version": "3.11.9" |
380 | 587 | }, |
381 | 588 | "widgets": { |
382 | 589 | "application/vnd.jupyter.widget-state+json": { |
|
0 commit comments