Update scripts to support full content: add a `full_content` field to the search index and to chunked Documents

adesousa_microsoft · adesousa_microsoft · commit f29a5771fd3f · 2024-08-12T19:36:33.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -10,4 +10,7 @@ data
 static
 
 scripts/config.json
-venv
+venv
+
+grant_data
+promissory_note_data
diff --git a/scripts/data_preparation.py b/scripts/data_preparation.py
@@ -182,6 +182,15 @@ def create_or_update_search_index(
                 "filterable": False,
                 "analyzer": f"{language}.lucene" if language else None,
             },
+            {
+                "name": "full_content",
+                "type": "Edm.String",
+                "searchable": True,
+                "sortable": False,
+                "facetable": False,
+                "filterable": False,
+                "analyzer": f"{language}.lucene" if language else None,
+            },
             {
                 "name": "title",
                 "type": "Edm.String",
@@ -452,7 +461,7 @@ def valid_range(n):
     parser.add_argument("--config", type=str, help="Path to config file containing settings for data preparation")
     parser.add_argument("--form-rec-resource", type=str, help="Name of your Form Recognizer resource to use for PDF cracking.")
     parser.add_argument("--form-rec-key", type=str, help="Key for your Form Recognizer resource to use for PDF cracking.")
-    parser.add_argument("--form-rec-use-layout", default=True, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
+    parser.add_argument("--form-rec-use-layout", default=False, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
     parser.add_argument("--njobs", type=valid_range, default=4, help="Number of jobs to run (between 1 and 32). Default=4")
     parser.add_argument("--embedding-model-endpoint", type=str, help="Endpoint for the embedding model to use for vector search. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<Ada deployment name>/embeddings?api-version=2024-03-01-Preview'")
     parser.add_argument("--embedding-model-key", type=str, help="Key for the embedding model to use for vector search.")
diff --git a/scripts/data_utils.py b/scripts/data_utils.py
@@ -266,6 +266,7 @@ class Document(object):
     metadata: Optional[Dict] = None
     contentVector: Optional[List[float]] = None
     image_mapping: Optional[Dict] = None
+    full_content: Optional[str] = None
 
 def cleanup_content(content: str) -> str:
     """Cleans up the given content using regexes
@@ -884,7 +885,8 @@ def chunk_content(
                         url=url,
                         contentVector=doc.contentVector,
                         metadata=doc.metadata,
-                        image_mapping=doc.image_mapping
+                        image_mapping=doc.image_mapping,
+                        full_content=content
                     )
                 )
             else: