|
55 | 55 | "%%bash\n", |
56 | 56 | "\n", |
57 | 57 | "pip install --upgrade pip\n", |
58 | | - "pip install haystack-ai" |
| 58 | + "pip install haystack-ai nltk" |
59 | 59 | ] |
60 | 60 | }, |
61 | 61 | { |
|
98 | 98 | "source": [ |
99 | 99 | "from haystack import Document\n", |
100 | 100 | "from haystack.components.preprocessors import DocumentSplitter\n", |
101 | | - "splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"sentence\")\n", |
102 | | - "\n", |
103 | | - "text = (\"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n", |
104 | | - " \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n", |
105 | | - " \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n", |
106 | | - " \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n", |
107 | | - " \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n", |
108 | | - " \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n", |
109 | | - " \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n", |
110 | | - " \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n", |
111 | | - " \"called Fremen, marked down on no census of the Imperial Regate.\")\n", |
| 101 | + "\n", |
| 102 | + "splitter = DocumentSplitter(split_length=1, split_overlap=0, split_by=\"period\")\n", |
| 103 | + "\n", |
| 104 | + "text = (\n", |
| 105 | + " \"Paul fell asleep to dream of an Arrakeen cavern, silent people all around him moving in the dim light \"\n", |
| 106 | + " \"of glowglobes. It was solemn there and like a cathedral as he listened to a faint sound—the \"\n", |
| 107 | + " \"drip-drip-drip of water. Even while he remained in the dream, Paul knew he would remember it upon \"\n", |
| 108 | + " \"awakening. He always remembered the dreams that were predictions. The dream faded. Paul awoke to feel \"\n", |
| 109 | + " \"himself in the warmth of his bed—thinking thinking. This world of Castle Caladan, without play or \"\n", |
| 110 | + " \"companions his own age, perhaps did not deserve sadness in farewell. Dr Yueh, his teacher, had \"\n", |
| 111 | + " \"hinted that the faufreluches class system was not rigidly guarded on Arrakis. The planet sheltered \"\n", |
| 112 | + " \"people who lived at the desert edge without caid or bashar to command them: will-o’-the-sand people \"\n", |
| 113 | + " \"called Fremen, marked down on no census of the Imperial Regate.\"\n", |
| 114 | + ")\n", |
112 | 115 | "\n", |
113 | 116 | "doc = Document(content=text)\n", |
114 | 117 | "docs = splitter.run([doc])" |
|
144 | 147 | "from haystack.document_stores.types import DuplicatePolicy\n", |
145 | 148 | "\n", |
146 | 149 | "doc_store = InMemoryDocumentStore()\n", |
147 | | - "doc_store.write_documents(docs['documents'], policy=DuplicatePolicy.OVERWRITE)" |
| 150 | + "doc_store.write_documents(docs[\"documents\"], policy=DuplicatePolicy.OVERWRITE)" |
148 | 151 | ] |
149 | 152 | }, |
150 | 153 | { |
|
167 | 170 | "from haystack.components.retrievers import SentenceWindowRetriever\n", |
168 | 171 | "\n", |
169 | 172 | "retriever = SentenceWindowRetriever(document_store=doc_store, window_size=2)\n", |
170 | | - "result = retriever.run(retrieved_documents=[docs['documents'][4]])" |
| 173 | + "result = retriever.run(retrieved_documents=[docs[\"documents\"][4]])" |
171 | 174 | ] |
172 | 175 | }, |
173 | 176 | { |
|
199 | 202 | } |
200 | 203 | ], |
201 | 204 | "source": [ |
202 | | - "result['context_windows']" |
| 205 | + "result[\"context_windows\"]" |
203 | 206 | ] |
204 | 207 | }, |
205 | 208 | { |
|
224 | 227 | } |
225 | 228 | ], |
226 | 229 | "source": [ |
227 | | - "result['context_documents']" |
| 230 | + "result[\"context_documents\"]" |
228 | 231 | ] |
229 | 232 | }, |
230 | 233 | { |
|
259 | 262 | "import csv\n", |
260 | 263 | "from haystack import Document\n", |
261 | 264 | "\n", |
| 265 | + "\n", |
262 | 266 | "def read_documents(file: str) -> List[Document]:\n", |
263 | 267 | " with open(file, \"r\") as file:\n", |
264 | 268 | " reader = csv.reader(file, delimiter=\"\\t\")\n", |
|
283 | 287 | "from pathlib import Path\n", |
284 | 288 | "import requests\n", |
285 | 289 | "\n", |
286 | | - "doc = requests.get('https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv')\n", |
| 290 | + "doc = requests.get(\"https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv\")\n", |
287 | 291 | "\n", |
288 | | - "datafolder = Path('data')\n", |
| 292 | + "datafolder = Path(\"data\")\n", |
289 | 293 | "datafolder.mkdir(exist_ok=True)\n", |
290 | | - "with open(datafolder/'bbc-news-data.csv', 'wb') as f:\n", |
| 294 | + "with open(datafolder / \"bbc-news-data.csv\", \"wb\") as f:\n", |
291 | 295 | " for chunk in doc.iter_content(512):\n", |
292 | 296 | " f.write(chunk)" |
293 | 297 | ] |
|
356 | 360 | "\n", |
357 | 361 | "indexing_pipeline.connect(\"splitter\", \"writer\")\n", |
358 | 362 | "\n", |
359 | | - "indexing_pipeline.run({\"documents\":docs})" |
| 363 | + "indexing_pipeline.run({\"documents\": docs})" |
360 | 364 | ] |
361 | 365 | }, |
362 | 366 | { |
|
421 | 425 | "metadata": {}, |
422 | 426 | "outputs": [], |
423 | 427 | "source": [ |
424 | | - "result = sentence_window_pipeline.run(data={'bm25_retriever': {'query': \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={'bm25_retriever'})" |
| 428 | + "result = sentence_window_pipeline.run(\n", |
| 429 | + " data={\"bm25_retriever\": {\"query\": \"phishing attacks\", \"top_k\": 1}}, include_outputs_from={\"bm25_retriever\"}\n", |
| 430 | + ")" |
425 | 431 | ] |
426 | 432 | }, |
427 | 433 | { |
|
450 | 456 | } |
451 | 457 | ], |
452 | 458 | "source": [ |
453 | | - "result['bm25_retriever']['documents']" |
| 459 | + "result[\"bm25_retriever\"][\"documents\"]" |
454 | 460 | ] |
455 | 461 | }, |
456 | 462 | { |
|
479 | 485 | } |
480 | 486 | ], |
481 | 487 | "source": [ |
482 | | - "result['sentence_window__retriever']['context_windows']" |
| 488 | + "result[\"sentence_window__retriever\"][\"context_windows\"]" |
483 | 489 | ] |
484 | 490 | }, |
485 | 491 | { |
|
512 | 518 | } |
513 | 519 | ], |
514 | 520 | "source": [ |
515 | | - "result['sentence_window__retriever']['context_documents']" |
| 521 | + "result[\"sentence_window__retriever\"][\"context_documents\"]" |
516 | 522 | ] |
517 | 523 | }, |
518 | 524 | { |
|
0 commit comments