Skip to content

Commit edf7b88

Browse files
frabonifaceMNIKIEMAEdouardCallet123
authored
Merge clustering improvements code (#49)
* add datasets for loading from HF * add datasets for loading from HF * add KMeans clustering * Effort to make TFIDF works * use clustering config in bertopic * reorg import * additions_clustering test ed * add clustering helper functions * add label in result * add text field * cleaned clustering scipts, added cluster naming and join of results and input * fix deps * fix rebase quirks --------- Co-authored-by: MNIKIEMA <nikiemamahamadi01@gmail.com> Co-authored-by: Edouard Callet <callet.edouard@gmail.com>
1 parent 30a6621 commit edf7b88

17 files changed

+309844
-1405
lines changed

policy_analysis/cluster.png

163 KB
Loading

policy_analysis/old/notebooks/BERTopic.ipynb

Lines changed: 89 additions & 443 deletions
Large diffs are not rendered by default.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import numpy as np
from datasets import load_dataset

# --- Configuration ---
# Hugging Face dataset id holding the policy documents (the "train" split is used).
HF_DATASET_ID = "EdouardCallet/wsl-policy-10k"
# Path to the .npy array of cluster labels produced by the clustering experiment.
LABELS_PATH = 'src/policy_analysis/policies_clustering/results/clustering_experiment/labels.npy'
# Local CSV the merged (documents + labels) table is written to.
OUTPUT_FILENAME = "wsl_policies_with_clusters.csv"
8+
9+
def main():
    """Attach cluster labels to the HF policy dataset and save the result as CSV.

    Loads the configured dataset split and the .npy label vector, checks that
    the two have the same length (a 1:1 row/label correspondence is required),
    then merges and writes OUTPUT_FILENAME locally.
    """
    print(f"Loading dataset: {HF_DATASET_ID}...")
    dataset = load_dataset(HF_DATASET_ID, split="train")

    print(f"Loading labels from: {LABELS_PATH}...")
    try:
        cluster_labels = np.load(LABELS_PATH, allow_pickle=True)
    except Exception as e:
        print(f"❌ Error loading .npy file: {e}")
        return

    # --- Verification: each document needs exactly one label ---
    print("\n--- Checking Dimensions ---")
    doc_count, label_count = len(dataset), len(cluster_labels)
    print(f"Dataset rows: {doc_count}")
    print(f"Labels count: {label_count}")

    if doc_count != label_count:
        print("\n⚠️ MISMATCH DETECTED ⚠️")
        print(f"You have {doc_count} documents but {label_count} labels.")
        print("Reason: Your clustering likely ran on 'chunks' (sentences) rather than full documents.")
        print("To fix this, we need to map the chunks back to the parent documents.")
        return

    # --- Merging: add the label vector as a new column ---
    print("\nMerging data...")
    merged = dataset.to_pandas()
    merged['cluster_label'] = cluster_labels

    # --- Inspecting Result ---
    print("\nTop 5 rows with new labels:")
    print(merged[['cluster_label']].head(5))

    # --- Saving ---
    print(f"\nSaving to {OUTPUT_FILENAME}...")
    merged.to_csv(OUTPUT_FILENAME, index=False)
    print("✅ Success! File saved locally.")

    # Optional: Push to Hub
    # print("Pushing to Hub...")
    # new_dataset = Dataset.from_pandas(df)
    # new_dataset.push_to_hub("EdouardCallet/wsl_10k_policy_and_taxonomy_clustered")
53+
# --- Script entry point ---
if __name__ == "__main__":
    print("Script started...")
    main()
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import pandas as pd
import openai
from tqdm import tqdm
import os
from dotenv import load_dotenv

load_dotenv()  # Load environment variables (expects OPENAI_API_KEY in .env or the env)

# --- Configuration ---
# Output of the linking script; must contain a 'cluster_label' column.
INPUT_CSV = "wsl_policies_with_clusters.csv"
# Where the table with the added 'cluster_name' column is written.
OUTPUT_CSV = "wsl_policies_clustered_and_named.csv"
# Chat model used to generate cluster names.
MODEL_NAME = "gpt-4o-mini"
# Max number of text excerpts sent to the LLM per cluster.
SAMPLES_PER_CLUSTER = 40

# NOTE: client is created at import time (module-level side effect); an
# unset OPENAI_API_KEY only surfaces later, when the first request is made.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
16+
17+
def get_cluster_name(cluster_id, texts):
    """Ask the LLM for a short, professional category name for one cluster.

    Parameters
    ----------
    cluster_id : hashable
        Cluster identifier; used in the prompt and in the fallback name.
    texts : list[str]
        Sample of policy excerpts that belong to this cluster.

    Returns
    -------
    str
        The model-proposed name, or ``"Cluster {cluster_id}"`` when ``texts``
        is empty or the API call fails (best-effort — never raises).
    """
    # Guard: nothing to summarize, so skip the API call entirely.
    if not texts:
        return f"Cluster {cluster_id}"

    # Render every excerpt as a "- " bullet. (The previous join missed the
    # bullet on the first item, yielding an inconsistent list in the prompt.)
    text_preview = "- " + "\n- ".join(texts)

    prompt = (
        f"You are a policy analyst. Below is a list of policy excerpts that belong to the same cluster (Cluster ID: {cluster_id}).\n\n"
        f"POLICIES:\n{text_preview}\n\n"
        f"TASK: Provide a short, specific, and professional name for this cluster (max 5-7 words). "
        f"Do not use quotes. Just the name."
    )

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful taxonomist for public policy."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0,  # deterministic names across reruns
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: log and fall back so one failure doesn't abort the run.
        print(f"Error naming cluster {cluster_id}: {e}")
        return f"Cluster {cluster_id}"
41+
42+
def main():
    """Generate an LLM name for every cluster in INPUT_CSV and save OUTPUT_CSV.

    Reads the linked CSV, samples up to SAMPLES_PER_CLUSTER texts per cluster,
    asks the LLM for a name via get_cluster_name, maps the names back onto the
    full table as a 'cluster_name' column, and writes the result.
    """
    if not os.path.exists(INPUT_CSV):
        print(f"❌ Could not find {INPUT_CSV}. Run the linking script first!")
        return

    print(f"Loading {INPUT_CSV}...")
    df = pd.read_csv(INPUT_CSV)

    # The linking script must have produced this column; fail with a clear
    # message instead of an uncaught KeyError further down.
    if 'cluster_label' not in df.columns:
        print("❌ Column 'cluster_label' not found in the input CSV. Run the linking script first!")
        return

    # Pick the column holding the policy text; fall back to the first
    # string-typed column when the expected one is absent.
    text_col = 'single_policy_item'
    if text_col not in df.columns:
        possible_cols = [c for c in df.columns if df[c].dtype == 'object']
        if not possible_cols:
            # Previously this crashed with IndexError when no text-like column existed.
            print(f"❌ Column '{text_col}' not found and no text-like column is available.")
            return
        print(f"⚠️ Column '{text_col}' not found. Using '{possible_cols[0]}' instead.")
        text_col = possible_cols[0]

    unique_clusters = df['cluster_label'].unique()
    unique_clusters = sorted([c for c in unique_clusters if pd.notna(c)])

    print(f"Found {len(unique_clusters)} unique clusters.")
    print("Generating names (this may take a moment)...")

    cluster_names_map = {}

    for cluster_id in tqdm(unique_clusters, desc="Naming Clusters"):
        # Filter for current cluster
        cluster_data = df[df['cluster_label'] == cluster_id]

        # Get valid texts only
        valid_texts = cluster_data[text_col].dropna()

        # Skip if no text available
        if len(valid_texts) == 0:
            cluster_names_map[cluster_id] = f"Cluster {cluster_id}"
            continue

        # Calculate safe sample size based on VALID texts count
        n_samples = min(SAMPLES_PER_CLUSTER, len(valid_texts))

        # Fixed seed so reruns sample the same excerpts per cluster.
        sample_texts = valid_texts.sample(n=n_samples, random_state=42).tolist()

        # Call LLM
        name = get_cluster_name(cluster_id, sample_texts)
        cluster_names_map[cluster_id] = name

    print("\nApplying names to dataset...")
    df['cluster_name'] = df['cluster_label'].map(cluster_names_map)

    print("\n--- Sample of Generated Names ---")
    print(df[['cluster_label', 'cluster_name']].drop_duplicates().head(10))

    print(f"\nSaving to {OUTPUT_CSV}...")
    df.to_csv(OUTPUT_CSV, index=False)
    print("✅ Done!")
95+
96+
# Script entry point.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)