Merge pull request #159 from VineetBala-AOT/main

don-aot · web-flow · commit 0a41bb6d74bc · 2026-03-09T12:17:31.000-06:00
condition parser changes to use azure model
diff --git a/.gitignore b/.gitignore
@@ -123,6 +123,7 @@ celerybeat.pid
 
 # Environments
 .env
+deploy.env
 .venv
 env/
 venv/
diff --git a/condition-parser/document_classifier.py b/condition-parser/document_classifier.py
@@ -1,15 +1,16 @@
 import os
 import json
 
-from colorama import Fore, Style
+from colorama import Fore
 from openai import OpenAI
 from dotenv import load_dotenv
 
 load_dotenv()
 
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-
-MODEL = "gpt-4o-2024-05-13"
+client = OpenAI(
+    api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
+    base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
+)
 
 
 def classify_document(file_text):
@@ -50,9 +51,11 @@ def classify_document(file_text):
                             "type": "array",
                             "items": {"type": "string"},
                             "description": (
-                                "The main section or topic headers found in the document that group conditions/commitments. "
-                                "E.g., ['Environmental Management', 'Acid Rock Drainage Prevention', 'Monitoring', 'Fish and Aquatic Resources']. "
-                                "Empty array if conditions are not grouped by sections."
+                                "The main topic or subject-matter headers that group conditions/commitments — NOT preamble or structural headings. "
+                                "EXCLUDE generic document sections such as 'Definitions', 'Acronyms', 'Conditions', 'Introduction', 'Background', 'Purpose', 'Scope', 'General', 'Schedule'. "
+                                "INCLUDE only substantive environmental or project topic headings that categorise the actual conditions, "
+                                "e.g., ['Environmental Management', 'Acid Rock Drainage Prevention', 'Fish and Aquatic Resources', 'Air Quality', 'Wildlife']. "
+                                "Empty array if conditions are not grouped by topic sections."
                             ),
                         },
                         "estimated_item_count": {
@@ -91,7 +94,7 @@ def classify_document(file_text):
 
     try:
         completion = client.chat.completions.create(
-            model=MODEL,
+            model="gpt-4o-mini",
             messages=messages,
             tools=tools,
             temperature=0.0,
diff --git a/condition-parser/extract_first_nations.py b/condition-parser/extract_first_nations.py
@@ -7,7 +7,10 @@
 import json
 
 from openai import OpenAI
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+client = OpenAI(
+    api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
+    base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
+)
 
 def extract_first_nation_from_pdf(pdf_file_path):
     with open(pdf_file_path, "rb") as f:
diff --git a/condition-parser/extract_management_plans.py b/condition-parser/extract_management_plans.py
@@ -14,7 +14,10 @@
 from openai import OpenAI
 import json
 
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+client = OpenAI(
+    api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
+    base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
+)
 
 def management_plan_required(input_condition_text):
    
diff --git a/condition-parser/gpt.py b/condition-parser/gpt.py
@@ -1,10 +1,7 @@
 import os
 import json
-import aiohttp
-import asyncio
 
-import colorama
-from colorama import Fore, Back, Style
+from colorama import Fore
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -13,8 +10,10 @@
 from document_classifier import classify_document
 from openai import OpenAI
 
-# Get OPENAI_API_KEY from environment variables
-client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+client = OpenAI(
+    api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
+    base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
+)
 
 MODEL = "gpt-4o-2024-05-13"
 SCHEMAS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas")
diff --git a/condition-parser/gradio_ui.py b/condition-parser/gradio_ui.py
@@ -1,7 +1,6 @@
 import gradio as gr
 from gpt import classify_and_count, extract_and_enrich_all
 from extract_first_nations import process_single_pdf
-
 import json
 import os
 
@@ -36,28 +35,24 @@ def classify_document_ui(file_input):
 
 
 def extract_and_enrich_ui(file_input, classification):
-    """Run the full extraction + enrichment pipeline."""
+    """Run extraction + enrichment + first nations."""
     if not classification:
-        return {"error": "Please classify the document first."}
+        return json.dumps({"error": "Please classify the document first."}, indent=4)
     result = extract_and_enrich_all(file_input, classification)
-    return result
-
+    if result and "conditions" in result:
+        file_path = file_input.name if hasattr(file_input, "name") else file_input
+        if isinstance(file_path, str) and file_path.endswith(".pdf"):
+            result = process_single_pdf(file_path, result)
+    return json.dumps(result, indent=4)
 
-def add_first_nations_ui(file_input, enriched_json):
-    """Add first nations info to the enriched JSON."""
-    if not enriched_json or "conditions" not in enriched_json:
-        return enriched_json
-    file_path = file_input.name if hasattr(file_input, "name") else file_input
-    if file_path.endswith(".pdf"):
-        result = process_single_pdf(file_path, enriched_json)
-        return result
-    return enriched_json
 
-
-def send_to_json_editor(json_data):
-    if isinstance(json_data, str):
-        json_data = json.loads(json_data)
-    return json.dumps(json_data, indent=4), json_data
+def send_to_json_editor(json_str):
+    if isinstance(json_str, dict):
+        json_data = json_str
+    else:
+        json_data = json.loads(json_str) if json_str else {}
+    formatted = json.dumps(json_data, indent=4)
+    return formatted, formatted
 
 
 def save_json(content, project_id, document_id, project_name, project_type,
@@ -93,9 +88,9 @@ def save_json(content, project_id, document_id, project_name, project_type,
         with open(output_path, "w") as f:
             json.dump(content_dict, f, indent=4)
 
-        return f"Saved to {output_path}", json.dumps(content_dict, indent=4), content_dict
+        return f"Saved to {output_path}", json.dumps(content_dict, indent=4), json.dumps(content_dict, indent=4)
     except Exception as e:
-        return f"Save failed: {str(e)}", content, None
+        return f"Save failed: {str(e)}", content, content if isinstance(content, str) else json.dumps(content, indent=4)
 
 
 # ---------------------------------------------------------------------------
@@ -129,8 +124,7 @@ def save_json(content, project_id, document_id, project_name, project_type,
         # --- Extraction section ---
         with gr.Column():
             submit_button = gr.Button("Extract & Enrich Conditions", variant="primary")
-            extracted_conditions = gr.JSON(label="Extracted & Enriched Conditions")
-            first_nations_result = gr.JSON(label="With First Nations")
+            final_result = gr.Code(language="json", label="Extracted Conditions", interactive=False)
 
     with gr.Tab("JSON Editor"):
         # --- Metadata inputs ---
@@ -153,21 +147,17 @@ def save_json(content, project_id, document_id, project_name, project_type,
         status_output = gr.Textbox(label="Status", lines=1, interactive=False)
 
         with gr.Row():
-            json_viewer = gr.JSON(label="JSON Viewer")
+            json_viewer = gr.Code(language="json", label="JSON Viewer", interactive=False)
             json_editor = gr.Textbox(label="JSON Content Editor", lines=500)
 
-    # --- Pipeline: Classify -> Extract & Enrich -> First Nations -> Editor ---
+    # --- Pipeline: Extract & Enrich (+ First Nations) -> Editor ---
     submit_button.click(
         fn=extract_and_enrich_ui,
         inputs=[file_input, classification_state],
-        outputs=[extracted_conditions]
-    ).then(
-        fn=add_first_nations_ui,
-        inputs=[file_input, extracted_conditions],
-        outputs=[first_nations_result]
+        outputs=[final_result]
     ).then(
         fn=send_to_json_editor,
-        inputs=[first_nations_result],
+        inputs=[final_result],
         outputs=[json_editor, json_viewer]
     )
 
diff --git a/condition-parser/sample.env b/condition-parser/sample.env
@@ -1 +1,9 @@
-OPENAI_API_KEY=
+# condition-parser environment variables.
+# Copy this file to .env and fill in your values.
+# .env is gitignored — never commit it.
+
+# ── Azure Extractor Proxy (recommended) ───────────────────────────────────────
+# Route all AI calls through the condition-extractor Azure App Service.
+# Leave EXTRACTOR_API_URL blank to call OpenAI directly; in that case also leave EXTRACTOR_API_KEY empty and set OPENAI_API_KEY.
+EXTRACTOR_API_URL=
+EXTRACTOR_API_KEY=your-extractor-api-key-here