Skip to content

Commit bf41dd4

Browse files
committed
refactor: Clean up legacy code and fix provider CSV handling
1 parent be44ff9 commit bf41dd4

File tree

1 file changed

+77
-84
lines changed

1 file changed

+77
-84
lines changed

poligrapher/gradio_app/app.py

Lines changed: 77 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import gradio as gr
55
import logging
66
import pandas as pd
7+
from datetime import date
78

89
from poligrapher.gradio_app import functions
910
from poligrapher.gradio_app.policy_analysis import (
@@ -13,37 +14,12 @@
1314
PolicyDocumentInfo,
1415
PolicyDocumentProvider,
1516
)
16-
# (Legacy direct script imports removed; generation orchestrated through functions.generate_graph)
1717

1818
logger = logging.getLogger(__name__)
1919
logger.setLevel(logging.INFO)
2020
# Global in‑memory provider registry
2121
providers: list[PolicyDocumentProvider] = []
22-
## (Legacy CSV Status augmentation code omitted for clarity)
23-
24-
# def get_analysis_results():
25-
# try:
26-
# df = get_company_df()
27-
# results = []
28-
# return results
29-
# except Exception as e:
30-
# logger.error("Error loading companies from CSV: %s", e)
31-
# return [PolicyAnalysisResult(company_name="error", privacy_policy_url="", score=None, kind="auto", has_name=False, has_score=False)]
32-
33-
# TODO: Modify to use PolicyAnalysisResult.get_graph_image_path()
34-
# def get_png_for_company(selected_row):
35-
# if selected_row is None or not isinstance(selected_row, list) or len(selected_row) == 0:
36-
# return None
37-
# idx = selected_row[1]
38-
# df = get_analysis_results()
39-
# if idx >= len(df):
40-
# return None
41-
# domain = df.iloc[idx]["Domain Name"]
42-
# png_path = f"./output/{domain}/knowledge_graph.png"
43-
# if os.path.exists(png_path):
44-
# return png_path
45-
# return None
46-
22+
CSV_PATH = "./poligrapher/gradio_app/policy_list.csv"
4723

4824
def add_provider(name: str, industry: str):
4925
provider = PolicyDocumentProvider(name=name, industry=industry)
@@ -94,55 +70,6 @@ def add_result_to_provider(
9470
):
9571
provider.add_result(PolicyAnalysisResult(document=document, score=score, kind=kind))
9672

97-
# def analyze_url(policy: PolicyAnalysisResult):
98-
# try:
99-
# logger.info("API triggered: analyze_url for company: %s, URL: %s", policy.company_name, policy.privacy_policy_url)
100-
# if getattr(policy, "has_graph", False):
101-
# logger.info(
102-
# "Existing graph detected for %s; skipping regeneration and only scoring.",
103-
# policy.company_name,
104-
# )
105-
# output_info = score_existing_policy(policy)
106-
# else:
107-
# output_info = process_policy_url(policy)
108-
109-
# if (output_info is None) or (not output_info.get("success", True)):
110-
# logger.error("Error processing policy URL: %s", output_info.get('message', 'Unknown error'))
111-
# return {"error": output_info.get("message", "Unknown error")}
112-
113-
# # output_info follows shape { success: True, message: ..., result: { ... } }
114-
# result_payload = output_info.get("result", {})
115-
# total_score = result_payload.get("total_score")
116-
# grade = result_payload.get("grade")
117-
# category_scores = result_payload.get("category_scores")
118-
# feedback = result_payload.get("feedback")
119-
# graph_json_path = result_payload.get("graph_json_path")
120-
# structured = result_payload.get("structured")
121-
122-
# logger.info(
123-
# "API analyze_url completed: %s",
124-
# {
125-
# "company": policy.company_name,
126-
# "score": total_score,
127-
# "grade": grade,
128-
# "has_graph": getattr(policy, "has_graph", False),
129-
# },
130-
# )
131-
132-
# return {
133-
# "total_score": total_score,
134-
# "grade": grade,
135-
# "category_scores": category_scores,
136-
# "feedback": feedback,
137-
# "graph_json_path": graph_json_path,
138-
# "structured": structured,
139-
# }
140-
141-
# except Exception as e:
142-
# logger.error("Error in analyze_url: %s", e)
143-
# return {"error": str(e)}
144-
145-
14673
def get_providers(csv_file: str):
14774
# Reset existing providers to avoid duplicates on repeated calls
14875
providers.clear()
@@ -195,6 +122,65 @@ def _safe_enum_from_value(val) -> DocumentCaptureSource:
195122
providers.append(provider)
196123

197124

125+
def _providers_to_dataframe() -> pd.DataFrame:
    """Flatten the in-memory provider registry into a table.

    Emits one row per (provider, document) pair. The first entry in
    ``provider.results`` whose ``document`` equals the current document
    supplies the ``Score`` and ``Graph Kind`` cells; both are ``None`` when
    no such result exists.

    Returns:
        A DataFrame with the columns Provider, Policy URL, Industry, Source,
        Date, Status, Score and Graph Kind, in that order. With no
        documents registered the frame has these columns and zero rows.
    """
    column_order = [
        "Provider",
        "Policy URL",
        "Industry",
        "Source",
        "Date",
        "Status",
        "Score",
        "Graph Kind",
    ]

    records = []
    for provider in providers:
        for document in provider.documents:
            # Locate the first result attached to this document, if any.
            match = None
            for candidate in provider.results:
                if candidate.document == document:
                    match = candidate
                    break

            record = {
                "Provider": provider.name,
                "Policy URL": document.path,
                "Industry": provider.industry,
                # Enum-like sources expose ``.value``; anything else is
                # stored as-is.
                "Source": getattr(document.source, "value", document.source),
                "Date": document.capture_date,
                "Status": bool(document.has_results),
                "Score": getattr(match, "score", None),
            }
            if match and match.kind:
                record["Graph Kind"] = getattr(match.kind, "value", None)
            else:
                record["Graph Kind"] = None
            records.append(record)

    return pd.DataFrame(records, columns=column_order)
159+
160+
161+
def _save_providers_to_csv(path: str = CSV_PATH, allow_empty: bool = False):
    """Persist the in-memory providers to a CSV file (best effort).

    The provider registry is flattened with ``_providers_to_dataframe`` and
    written to ``path`` without the index column.

    Protection: previously this function was invoked before any load, causing
    an existing populated CSV to be overwritten by an empty header line.
    The write is therefore skipped when the flattened dataset is empty and a
    file already exists at ``path``, unless explicitly forced
    (allow_empty=True).

    Args:
        path: Destination CSV file. Defaults to ``CSV_PATH``.
        allow_empty: When True, write even if there are no rows, replacing
            any existing file with a header-only CSV.

    Returns:
        None. Any exception raised while building or writing the data is
        logged and swallowed rather than propagated to the caller.
    """
    try:
        df = _providers_to_dataframe()
        if df.empty and not allow_empty and os.path.exists(path):
            logger.debug(
                "Skip saving providers: would overwrite existing non-empty CSV with empty dataset (%s)",
                path,
            )
            return
        # A bare filename has no directory component, and os.makedirs("")
        # raises FileNotFoundError, which used to abort the save silently
        # via the except clause below. Only create a parent when one exists.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(path, index=False)
        logger.debug("Providers persisted to %s (rows=%d)", path, len(df))
    except Exception as e:
        logger.error("Failed to persist providers to CSV: %s", e)
182+
183+
198184
with gr.Blocks() as block1:
199185
gr.Markdown("#### PoliGraph-er Demo")
200186
company_name_input = gr.Textbox(label="Company Name")
@@ -219,8 +205,6 @@ def on_submit_click(company_name, privacy_policy_url, kind):
219205
gr.Markdown("#### Company Privacy Policy List")
220206
# Lazy load: summary placeholder (populated on .load())
221207
status_md = gr.Markdown("")
222-
# Enable the button for demonstration and add a progress bar
223-
score_btn = gr.Button("Score All", interactive=True)
224208
# Show only relevant columns, including Status
225209
display_cols = [
226210
"Status",
@@ -357,8 +341,6 @@ def _build_policies_df(provider_filter: str | None = None):
357341
)
358342

359343
# Policies UI will be added after company_info & png_image definitions
360-
361-
# ----- Add Provider Modal UI -----
362344
add_provider_btn = gr.Button("Add Provider")
363345
with gr.Group(visible=False, elem_id="add-provider-modal") as add_provider_modal:
364346
with gr.Column(elem_classes="modal-card"):
@@ -448,11 +430,12 @@ def _save_new_provider(name: str, industry: str):
448430
file_types=[".pdf", ".html", ".htm"],
449431
visible=False,
450432
)
451-
new_policy_date = gr.Textbox(label="Capture Date (YYYY-MM-DD)")
433+
with gr.Row():
434+
new_policy_date = gr.Textbox(label="Capture Date (YYYY-MM-DD)")
435+
new_policy_today = gr.Button("Today")
452436
with gr.Row():
453437
save_new_policy = gr.Button("Save", variant="primary")
454438
cancel_new_policy = gr.Button("Cancel")
455-
scoring_output = gr.Textbox(label="Scoring Results", interactive=False)
456439

457440
def _show_add_policy_modal(provider_name: str):
458441
if not provider_name:
@@ -551,7 +534,8 @@ def _save_new_policy(
551534
capture_date=capture_date,
552535
has_results=False,
553536
)
554-
537+
# Persist after adding new document
538+
_save_providers_to_csv()
555539
return (
556540
_build_display_df(), # updated company (providers) table including status
557541
_build_policies_df(provider_name), # updated policies list
@@ -584,6 +568,14 @@ def _on_policy_source_change(source_val: str):
584568
outputs=[new_policy_file, new_policy_url],
585569
)
586570

571+
def _set_new_policy_date_today():
    """Return a Gradio update carrying today's date as YYYY-MM-DD."""
    today_iso = date.today().isoformat()
    return gr.update(value=today_iso)
574+
575+
new_policy_today.click(
576+
_set_new_policy_date_today, inputs=[], outputs=[new_policy_date]
577+
)
578+
587579
# Auto-adjust source dropdown when a file is uploaded
588580
def _on_policy_file_change(uploaded_file, current_source):
589581
if uploaded_file is None:
@@ -655,6 +647,8 @@ def _ensure_graph_assets(doc: PolicyDocumentInfo, force: bool = False) -> bool:
655647
# Final status evaluation
656648
success = doc.has_graph() and doc.has_image()
657649
doc.has_results = success
650+
# Persist the updated status unconditionally (whether artifacts are complete or not)
651+
_save_providers_to_csv()
658652
if not success:
659653
logger.debug("Artifacts incomplete for %s (graph=%s, image=%s)", doc.path, doc.has_graph(), doc.has_image())
660654
except BaseException as e:
@@ -759,11 +753,10 @@ def score_all(progress=gr.Progress()):
759753
policies_accordion,
760754
],
761755
)
762-
score_btn.click(score_all, inputs=[], outputs=scoring_output, show_progress="full")
763756

764757
# initial load (after client connects)
765758
def _initial_load():
766-
get_providers("./poligrapher/gradio_app/policy_list.csv")
759+
get_providers(CSV_PATH)
767760
df = _build_display_df()
768761
num_success = sum(
769762
1 for p in providers if p.documents and p.documents[0].has_results

0 commit comments

Comments
 (0)