Skip to content

Commit 734c363

Browse files
Save extracted text in a file (#9)
1 parent 0b02756 commit 734c363

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -158,3 +158,6 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
.idea/
161+
162+
.pdm-python
163+
.python-version

src/unstract/sdk/index.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from typing import Optional
2+
import os
23

34
from llama_index import Document, StorageContext, VectorStoreIndex
45
from llama_index.node_parser import SimpleNodeParser
@@ -104,6 +105,7 @@ def index_file(
104105
chunk_overlap: int,
105106
reindex: bool = False,
106107
file_hash: Optional[str] = None,
108+
is_summary: bool = False,
107109
):
108110
# Make file content hash if not available
109111
if not file_hash:
@@ -115,7 +117,13 @@ def index_file(
115117
x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
116118
adapter_instance_id=x2text_adapter
117119
)
118-
extracted_text = x2text_adapter_inst.process(input_file_path=file_path)
120+
extract_file_path = None
121+
if not is_summary:
122+
directory, filename = os.path.split(file_path)
123+
extract_file_path: str = os.path.join(
124+
directory, "extract", os.path.splitext(filename)[0] + ".txt"
125+
)
126+
extracted_text = x2text_adapter_inst.process(input_file_path=file_path, output_file_path=extract_file_path)
119127
full_text.append(
120128
{
121129
"section": "full",

0 commit comments

Comments
 (0)