|
1 | 1 | import hashlib |
2 | 2 | import json |
3 | 3 | from dataclasses import dataclass |
4 | | -from typing import Generator, Optional, TypeVar |
| 4 | +from typing import AsyncGenerator, Generator, Optional, TypeVar |
5 | 5 |
|
6 | 6 | from unstructured_ingest.v2.interfaces.indexer import Indexer |
7 | 7 | from unstructured_ingest.v2.logger import logger |
@@ -52,6 +52,23 @@ def run(self) -> Generator[str, None, None]: |
52 | 52 | raise e |
53 | 53 | continue |
54 | 54 |
|
async def run_async(self) -> AsyncGenerator[str, None]:
    """Asynchronously index the source, caching each record as a JSON file.

    Iterates ``self.process.run_async()``. Each file data object it produces
    is serialized with ``to_dict()`` and written to
    ``<cache_dir>/<record_hash>.json``, where ``record_hash`` is the result
    of ``self.get_hash`` called with the record's identifier as ``extras``.

    Yields:
        The resolved path, as a string, of each JSON file written.

    Raises:
        Exception: whatever serialization, hashing or the file write raised,
            re-raised only when ``self.context.raise_on_error`` is truthy.
            Otherwise the failure is logged and the record is skipped.
    """
    async for file_data in self.process.run_async():
        try:
            # Serialize once, inside the guard, so a to_dict() failure is
            # handled by the same raise_on_error policy as a failed write.
            file_data_dict = file_data.to_dict()
            logger.debug(f"generated file data: {file_data_dict}")
            record_hash = self.get_hash(extras=[file_data.identifier])
            filename = f"{record_hash}.json"
            filepath = (self.cache_dir / filename).resolve()
            filepath.parent.mkdir(parents=True, exist_ok=True)
            with open(str(filepath), "w") as f:
                json.dump(file_data_dict, f, indent=2)
        except Exception:
            logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
            if self.context.raise_on_error:
                # Bare raise re-raises the active exception unchanged.
                raise
            continue
        # Yield outside the try block: an exception thrown into the generator
        # at this suspension point is the consumer's, not an indexing failure,
        # so it must not be logged as one or swallowed.
        yield str(filepath)
| 71 | + |
55 | 72 | def get_hash(self, extras: Optional[list[str]]) -> str: |
56 | 73 | index_config_dict = json.loads( |
57 | 74 | serialize_base_model_json(model=self.process.index_config, sort_keys=True) |
|
0 commit comments