Skip to content

Commit 0a41bb6

Browse files
authored
Merge pull request #159 from VineetBala-AOT/main
condition parser changes to use azure model
2 parents 71d728e + 21efbd4 commit 0a41bb6

File tree

7 files changed

+55
-48
lines changed

7 files changed

+55
-48
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ celerybeat.pid
123123

124124
# Environments
125125
.env
126+
deploy.env
126127
.venv
127128
env/
128129
venv/

condition-parser/document_classifier.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
import os
22
import json
33

4-
from colorama import Fore, Style
4+
from colorama import Fore
55
from openai import OpenAI
66
from dotenv import load_dotenv
77

88
load_dotenv()
99

10-
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11-
12-
MODEL = "gpt-4o-2024-05-13"
10+
client = OpenAI(
11+
api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
12+
base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
13+
)
1314

1415

1516
def classify_document(file_text):
@@ -50,9 +51,11 @@ def classify_document(file_text):
5051
"type": "array",
5152
"items": {"type": "string"},
5253
"description": (
53-
"The main section or topic headers found in the document that group conditions/commitments. "
54-
"E.g., ['Environmental Management', 'Acid Rock Drainage Prevention', 'Monitoring', 'Fish and Aquatic Resources']. "
55-
"Empty array if conditions are not grouped by sections."
54+
"The main topic or subject-matter headers that group conditions/commitments — NOT preamble or structural headings. "
55+
"EXCLUDE generic document sections such as 'Definitions', 'Acronyms', 'Conditions', 'Introduction', 'Background', 'Purpose', 'Scope', 'General', 'Schedule'. "
56+
"INCLUDE only substantive environmental or project topic headings that categorise the actual conditions, "
57+
"e.g., ['Environmental Management', 'Acid Rock Drainage Prevention', 'Fish and Aquatic Resources', 'Air Quality', 'Wildlife']. "
58+
"Empty array if conditions are not grouped by topic sections."
5659
),
5760
},
5861
"estimated_item_count": {
@@ -91,7 +94,7 @@ def classify_document(file_text):
9194

9295
try:
9396
completion = client.chat.completions.create(
94-
model=MODEL,
97+
model="gpt-4o-mini",
9598
messages=messages,
9699
tools=tools,
97100
temperature=0.0,

condition-parser/extract_first_nations.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
import json
88

99
from openai import OpenAI
10-
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
10+
client = OpenAI(
11+
api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
12+
base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
13+
)
1114

1215
def extract_first_nation_from_pdf(pdf_file_path):
1316
with open(pdf_file_path, "rb") as f:

condition-parser/extract_management_plans.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
from openai import OpenAI
1515
import json
1616

17-
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
17+
client = OpenAI(
18+
api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
19+
base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
20+
)
1821

1922
def management_plan_required(input_condition_text):
2023

condition-parser/gpt.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
import os
22
import json
3-
import aiohttp
4-
import asyncio
53

6-
import colorama
7-
from colorama import Fore, Back, Style
4+
from colorama import Fore
85

96
from dotenv import load_dotenv
107
load_dotenv()
@@ -13,8 +10,10 @@
1310
from document_classifier import classify_document
1411
from openai import OpenAI
1512

16-
# Get OPENAI_API_KEY from environment variables
17-
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
13+
client = OpenAI(
14+
api_key=os.getenv("EXTRACTOR_API_KEY") or os.getenv("OPENAI_API_KEY") or "not-set",
15+
base_url=f"{os.getenv('EXTRACTOR_API_URL', '').rstrip('/')}/v1" if os.getenv("EXTRACTOR_API_URL") else None,
16+
)
1817

1918
MODEL = "gpt-4o-2024-05-13"
2019
SCHEMAS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas")

condition-parser/gradio_ui.py

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import gradio as gr
22
from gpt import classify_and_count, extract_and_enrich_all
33
from extract_first_nations import process_single_pdf
4-
54
import json
65
import os
76

@@ -36,28 +35,24 @@ def classify_document_ui(file_input):
3635

3736

3837
def extract_and_enrich_ui(file_input, classification):
39-
"""Run the full extraction + enrichment pipeline."""
38+
"""Run extraction + enrichment + first nations."""
4039
if not classification:
41-
return {"error": "Please classify the document first."}
40+
return json.dumps({"error": "Please classify the document first."}, indent=4)
4241
result = extract_and_enrich_all(file_input, classification)
43-
return result
44-
42+
if result and "conditions" in result:
43+
file_path = file_input.name if hasattr(file_input, "name") else file_input
44+
if isinstance(file_path, str) and file_path.endswith(".pdf"):
45+
result = process_single_pdf(file_path, result)
46+
return json.dumps(result, indent=4)
4547

46-
def add_first_nations_ui(file_input, enriched_json):
47-
"""Add first nations info to the enriched JSON."""
48-
if not enriched_json or "conditions" not in enriched_json:
49-
return enriched_json
50-
file_path = file_input.name if hasattr(file_input, "name") else file_input
51-
if file_path.endswith(".pdf"):
52-
result = process_single_pdf(file_path, enriched_json)
53-
return result
54-
return enriched_json
5548

56-
57-
def send_to_json_editor(json_data):
58-
if isinstance(json_data, str):
59-
json_data = json.loads(json_data)
60-
return json.dumps(json_data, indent=4), json_data
49+
def send_to_json_editor(json_str):
50+
if isinstance(json_str, dict):
51+
json_data = json_str
52+
else:
53+
json_data = json.loads(json_str) if json_str else {}
54+
formatted = json.dumps(json_data, indent=4)
55+
return formatted, formatted
6156

6257

6358
def save_json(content, project_id, document_id, project_name, project_type,
@@ -93,9 +88,9 @@ def save_json(content, project_id, document_id, project_name, project_type,
9388
with open(output_path, "w") as f:
9489
json.dump(content_dict, f, indent=4)
9590

96-
return f"Saved to {output_path}", json.dumps(content_dict, indent=4), content_dict
91+
return f"Saved to {output_path}", json.dumps(content_dict, indent=4), json.dumps(content_dict, indent=4)
9792
except Exception as e:
98-
return f"Save failed: {str(e)}", content, None
93+
return f"Save failed: {str(e)}", content, content if isinstance(content, str) else json.dumps(content, indent=4)
9994

10095

10196
# ---------------------------------------------------------------------------
@@ -129,8 +124,7 @@ def save_json(content, project_id, document_id, project_name, project_type,
129124
# --- Extraction section ---
130125
with gr.Column():
131126
submit_button = gr.Button("Extract & Enrich Conditions", variant="primary")
132-
extracted_conditions = gr.JSON(label="Extracted & Enriched Conditions")
133-
first_nations_result = gr.JSON(label="With First Nations")
127+
final_result = gr.Code(language="json", label="Extracted Conditions", interactive=False)
134128

135129
with gr.Tab("JSON Editor"):
136130
# --- Metadata inputs ---
@@ -153,21 +147,17 @@ def save_json(content, project_id, document_id, project_name, project_type,
153147
status_output = gr.Textbox(label="Status", lines=1, interactive=False)
154148

155149
with gr.Row():
156-
json_viewer = gr.JSON(label="JSON Viewer")
150+
json_viewer = gr.Code(language="json", label="JSON Viewer", interactive=False)
157151
json_editor = gr.Textbox(label="JSON Content Editor", lines=500)
158152

159-
# --- Pipeline: Classify -> Extract & Enrich -> First Nations -> Editor ---
153+
# --- Pipeline: Extract & Enrich (+ First Nations) -> Editor ---
160154
submit_button.click(
161155
fn=extract_and_enrich_ui,
162156
inputs=[file_input, classification_state],
163-
outputs=[extracted_conditions]
164-
).then(
165-
fn=add_first_nations_ui,
166-
inputs=[file_input, extracted_conditions],
167-
outputs=[first_nations_result]
157+
outputs=[final_result]
168158
).then(
169159
fn=send_to_json_editor,
170-
inputs=[first_nations_result],
160+
inputs=[final_result],
171161
outputs=[json_editor, json_viewer]
172162
)
173163

condition-parser/sample.env

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,9 @@
1-
OPENAI_API_KEY=
1+
# condition-parser environment variables.
2+
# Copy this file to .env and fill in your values.
3+
# .env is gitignored — never commit it.
4+
5+
# ── Azure Extractor Proxy (recommended) ───────────────────────────────────────
6+
# Route all AI calls through the condition-extractor Azure App Service.
7+
# Leave EXTRACTOR_API_URL blank to call OpenAI directly instead; in that case the API key is read from EXTRACTOR_API_KEY if set, otherwise from OPENAI_API_KEY.
8+
EXTRACTOR_API_URL=
9+
EXTRACTOR_API_KEY=your-extractor-api-key-here

0 commit comments

Comments
 (0)