-AI-Driven-ESG-Risk-Assessment/esg_pipeline.py at master · shhreya13/-AI-Driven-ESG-Risk-Assessment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import pandas as pd
from transformers import pipeline
import spacy
from risk_terms import risk_terms, weight_map
from load_texts import load_cleaned_texts


# ---------------- Setup ----------------
OUTPUT_FILE = "esg_risk_output.csv"

# Hugging Face sentiment model (DistilBERT fine-tuned on SST-2)
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# spaCy NER model
nlp = spacy.load("en_core_web_sm")


# ---------------- Helpers ----------------
def analyze_sentiment(sentences):
    """
    Use Hugging Face transformer to get sentiment polarity for a list of sentences.
    Positive → +score, Negative → -score
    """
    sentiments = []
    for s in sentences:
        try:
            res = sentiment_model(s[:512])[0]  # limit to 512 tokens
            polarity = res["score"] if res["label"] == "POSITIVE" else -res["score"]
            sentiments.append(polarity)
        except Exception:
            sentiments.append(0)
    return sum(sentiments) / len(sentiments) if sentiments else 0


def extract_risks(company, text):
    """
    Extract ESG risks from text:
    - Lexicon-based keyword detection
    - Transformer-based sentiment
    - spaCy NER for entities (e.g., regulators, laws, countries)
    """
    results = []
    text_lower = text.lower()

    # --- ESG Keyword Search ---
    for category, terms in risk_terms.items():
        for term in terms:
            count = text_lower.count(term.lower())
            if count > 0:
                # Find sentences with the term
                sentences = [s.strip() for s in text.split(".") if term in s]
                sentiment = analyze_sentiment(sentences) if sentences else 0

                risk_weight = weight_map.get(term, 1)
                weighted_score = count * risk_weight

                results.append({
                    "company": company,
                    "category": category,
                    "term": term,
                    "count": count,
                    "sentiment": round(sentiment, 2),
                    "risk_weight": risk_weight,
                    "weighted_score": weighted_score
                })

    # --- Entity Detection (Regulators, Laws, etc.) ---
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["ORG", "LAW", "GPE"]:  # SEC, EPA, EU, Supreme Court, etc.
            results.append({
                "company": company,
                "category": "Governance",
                "term": f"Entity: {ent.text}",
                "count": 1,
                "sentiment": 0,
                "risk_weight": 1,
                "weighted_score": 1
            })

    return results


# ---------------- Main ----------------
def main():
    company_texts = load_cleaned_texts()
    all_results = []

    for company, text in company_texts.items():
        try:
            rows = extract_risks(company, text)
            all_results.extend(rows)
            print(f"✅ Processed {company}: {len(rows)} risks flagged")
        except Exception as e:
            print(f"❌ Error processing {company}: {e}")

    if all_results:
        df = pd.DataFrame(all_results)
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"\n✅ ESG risk extraction complete. Results saved to {OUTPUT_FILE}")
    else:
        print("⚠️ No risks detected in any company reports.")


if __name__ == "__main__":
    main()