Skip to content

Commit f29a577

Browse files
author
adesousa_microsoft
committed
update scripts to support full content
1 parent ca85002 commit f29a577

File tree

3 files changed

+17
-3
lines changed

3 files changed

+17
-3
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,4 +10,7 @@ data
1010
static
1111

1212
scripts/config.json
13-
venv
13+
venv
14+
15+
grant_data
16+
promissory_note_data

scripts/data_preparation.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,15 @@ def create_or_update_search_index(
182182
"filterable": False,
183183
"analyzer": f"{language}.lucene" if language else None,
184184
},
185+
{
186+
"name": "full_content",
187+
"type": "Edm.String",
188+
"searchable": True,
189+
"sortable": False,
190+
"facetable": False,
191+
"filterable": False,
192+
"analyzer": f"{language}.lucene" if language else None,
193+
},
185194
{
186195
"name": "title",
187196
"type": "Edm.String",
@@ -452,7 +461,7 @@ def valid_range(n):
452461
parser.add_argument("--config", type=str, help="Path to config file containing settings for data preparation")
453462
parser.add_argument("--form-rec-resource", type=str, help="Name of your Form Recognizer resource to use for PDF cracking.")
454463
parser.add_argument("--form-rec-key", type=str, help="Key for your Form Recognizer resource to use for PDF cracking.")
455-
parser.add_argument("--form-rec-use-layout", default=True, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
464+
parser.add_argument("--form-rec-use-layout", default=False, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
456465
parser.add_argument("--njobs", type=valid_range, default=4, help="Number of jobs to run (between 1 and 32). Default=4")
457466
parser.add_argument("--embedding-model-endpoint", type=str, help="Endpoint for the embedding model to use for vector search. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<Ada deployment name>/embeddings?api-version=2024-03-01-Preview'")
458467
parser.add_argument("--embedding-model-key", type=str, help="Key for the embedding model to use for vector search.")

scripts/data_utils.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -266,6 +266,7 @@ class Document(object):
266266
metadata: Optional[Dict] = None
267267
contentVector: Optional[List[float]] = None
268268
image_mapping: Optional[Dict] = None
269+
full_content: Optional[str] = None
269270

270271
def cleanup_content(content: str) -> str:
271272
"""Cleans up the given content using regexes
@@ -884,7 +885,8 @@ def chunk_content(
884885
url=url,
885886
contentVector=doc.contentVector,
886887
metadata=doc.metadata,
887-
image_mapping=doc.image_mapping
888+
image_mapping=doc.image_mapping,
889+
full_content=content
888890
)
889891
)
890892
else:

0 commit comments

Comments
 (0)