Merged
Commits (30)
a82e872
Created new augmented_model_suggester and corresponding utils.
grace-sng7 May 19, 2025
63e6d82
Updated dependencies according to augmented_model_suggester and corre…
grace-sng7 May 19, 2025
b81e676
Updated LLM query prompt.
grace-sng7 May 19, 2025
2af6350
Minor fixes after testing AugmentedModelSuggester.
grace-sng7 May 20, 2025
e349bf5
Edited CauseNet search function.
grace-sng7 May 25, 2025
633d76d
Updated README.md to include augmented_model_suggester
grace-sng7 May 25, 2025
a474e5d
Update README.md
grace-sng7 May 27, 2025
d31fc79
Merge pull request #53 from grace-sng7/creating_augmented_suggester
grace-sng7 May 27, 2025
6b71deb
Update README.md
grace-sng7 May 27, 2025
d763517
Added augmented model suggester examples notebook
grace-sng7 May 27, 2025
e2f57e4
Merge pull request #55 from grace-sng7/creating_augmented_suggester
grace-sng7 May 27, 2025
072adc4
Uploaded augmented model suggester examples notebook again.
grace-sng7 May 27, 2025
8d82bd9
Merge branch 'py-why:creating_augmented_suggester' into creating_augm…
grace-sng7 May 27, 2025
fd17322
Merge pull request #56 from grace-sng7/creating_augmented_suggester
grace-sng7 May 27, 2025
05d9aa9
Set to ignore notebook testing for augmented model suggester examples
grace-sng7 May 27, 2025
0d1c2b5
Merge pull request #57 from grace-sng7/augmented_suggester
grace-sng7 May 27, 2025
00b61a2
Updated augmented_model_suggester_examples notebooks, docstrings, and…
grace-sng7 Jun 9, 2025
83a968f
Merge pull request #58 from grace-sng7/augmented_suggester
grace-sng7 Jun 9, 2025
2c6c7c2
Updated citations
grace-sng7 Jun 11, 2025
bfde305
Merge pull request #59 from grace-sng7/augmented_suggester
grace-sng7 Jun 11, 2025
9148c2a
Edited augmented model suggester llm_query method
grace-sng7 Jun 27, 2025
e868bb2
Merge pull request #60 from grace-sng7/augmented_suggester
grace-sng7 Jun 27, 2025
1e19398
Updated ignore_notebooks in tests
grace-sng7 Jun 27, 2025
ad04e9b
Merge pull request #61 from grace-sng7/augmented_suggester
grace-sng7 Jun 27, 2025
a90e0e4
Added onxruntime dependency
grace-sng7 Jun 28, 2025
59c6012
Merge pull request #62 from grace-sng7/augmented_suggester
grace-sng7 Jun 28, 2025
a68930b
onnxruntime dependency
grace-sng7 Jun 28, 2025
0f769a0
Merge pull request #63 from grace-sng7/augmented_suggester
grace-sng7 Jun 28, 2025
2e1ca67
Removed onnxruntime-silicon
grace-sng7 Jun 28, 2025
de4c85f
Merge pull request #64 from grace-sng7/augmented_suggester
grace-sng7 Jun 28, 2025
14 changes: 13 additions & 1 deletion README.md
@@ -28,6 +28,7 @@ PyWhy-LLM seamlessly integrates into your existing causal inference process. Imp
from pywhyllm.suggesters.model_suggester import ModelSuggester
from pywhyllm.suggesters.identification_suggester import IdentificationSuggester
from pywhyllm.suggesters.validation_suggester import ValidationSuggester
from pywhyllm.suggesters.augmented_model_suggester import AugmentedModelSuggester
from pywhyllm import RelationshipStrategy

```
@@ -49,11 +50,22 @@ domain_expertises = modeler.suggest_domain_expertises(all_factors)
# Suggest a set of potential confounders
suggested_confounders = modeler.suggest_confounders(treatment, outcome, all_factors, domain_expertises)

# Suggest pair-wise relationship between variables
# Suggest pair-wise relationships between variables
suggested_dag = modeler.suggest_relationships(treatment, outcome, all_factors, domain_expertises, RelationshipStrategy.Pairwise)
```

### Retrieval-Augmented Generation (RAG)-based Modeler

```python
# Create instance of Modeler
modeler = AugmentedModelSuggester('gpt-4')

treatment = "smoking"
outcome = "lung cancer"

# Suggest the pairwise relationship between two given variables, using CauseNet to retrieval-augment the LLM
suggested_relationship = modeler.suggest_pairwise_relationship(treatment, outcome)
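
# The call returns [cause, effect, reasoning] ([None, None, reasoning] when neither
# variable causes the other); a sketch of unpacking the result
cause, effect, reasoning = suggested_relationship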
```

### Identifier

136 changes: 136 additions & 0 deletions docs/notebooks/augmented_model_suggester_examples.ipynb
@@ -0,0 +1,136 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"pip install dotenv"
],
"metadata": {
"id": "cmZerbMu6Uk4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "EulKv3Km4nMa"
},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"import os\n",
"\n",
"load_dotenv()\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = '' # specify your key here"
]
},
{
"cell_type": "code",
"source": [
"pip install pywhyllm"
],
"metadata": {
"collapsed": true,
"id": "83sxVcP97xlH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from pywhyllm.suggesters.augmented_model_suggester import AugmentedModelSuggester\n",
"\n",
"model = AugmentedModelSuggester('gpt-4')"
],
"metadata": {
"id": "VdfEKuDLEYcU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = model.suggest_pairwise_relationship(\"smoking\", \"lung cancer\")"
],
"metadata": {
"id": "D85ec6Pk5JzA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result"
],
"metadata": {
"id": "W3bFehXh5SQl"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = model.suggest_pairwise_relationship(\"income\", \"exercise level\")"
],
"metadata": {
"id": "odFkp921hQsX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result"
],
"metadata": {
"id": "ZIeStj9OwIPe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = model.suggest_pairwise_relationship(\"flooding\", \"rain\")"
],
"metadata": {
"id": "Fm5XCFrRwKsV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result"
],
"metadata": {
"id": "HDo098ICwzi7"
},
"execution_count": null,
"outputs": []
}
]
}
7,725 changes: 5,846 additions & 1,879 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion pyproject.toml
@@ -55,6 +55,14 @@ networkx = "<=3.2.1"
guidance = ">=0.2"
openai = ">=1.70"
pydantic = ">=2.11"
langchain = ">=0.3.25"
langchain-chroma = ">=0.2.4"
langchain-community = ">=0.3.24"
langchain-core = ">=0.3.60"
langchain-huggingface = ">=0.2.0"
langchain-openai = ">=0.3.17"
rank-bm25 = ">=0.2.2"
sentence-transformers = ">=4.1.0"

[tool.poetry.group.dev.dependencies]
poethepoet = "^0.33.0"
@@ -110,7 +118,7 @@ _isort_check = 'isort --check .'

# testing tasks
test = "pytest -v -m 'not advanced' --durations=0 --durations-min=60.0"
test_no_notebooks= "pytest -v -m 'not advanced and not notebook' --durations=0 --durations-min=60.0"
test_no_notebooks = "pytest -v -m 'not advanced and not notebook' --durations=0 --durations-min=60.0"
test_durations = "poetry run poe test --store-durations"
test_advanced = "pytest -v"
test_focused = "pytest -v -m 'focused'"
45 changes: 45 additions & 0 deletions pywhyllm/suggesters/augmented_model_suggester.py
@@ -0,0 +1,45 @@
import logging
import re

from .simple_model_suggester import SimpleModelSuggester
from pywhyllm.utils.data_loader import *
from pywhyllm.utils.augmented_model_suggester_utils import *


class AugmentedModelSuggester(SimpleModelSuggester):
    def __init__(self, llm, file_path: str = 'data/causenet-precision.jsonl.bz2'):
        super().__init__(llm)
        self.file_path = file_path
        self.causenet_dict = None  # populated only if the download succeeds

        logging.basicConfig(level=logging.INFO)
        url = "https://groups.uni-paderborn.de/wdqa/causenet/causality-graphs/causenet-precision.jsonl.bz2"
        success = download_causenet(url, file_path)

        if success:
            print(f"File downloaded to {file_path}")
            json_data = load_causenet_json(file_path)
            self.causenet_dict = create_causenet_dict(json_data)
        else:
            print("Download failed; falling back to plain LLM queries without CauseNet context")

    def suggest_pairwise_relationship(self, variable1: str, variable2: str):
        # Look up a matching cause-effect pair in CauseNet; fall back to a plain LLM query
        result = find_top_match_in_causenet(self.causenet_dict, variable1, variable2) if self.causenet_dict else None
        if result:
            source_text = get_source_text(result)
            retriever = split_data_and_create_vectorstore_retriever(source_text)
            response = query_llm(variable1, variable2, source_text, retriever)
        else:
            response = query_llm(variable1, variable2)

        # Extract the verdict letter from the <answer> tags in the response
        answer = re.findall(r'<answer>(.*?)</answer>', response)
        answer = [ans.strip() for ans in answer]
        answer_str = "".join(answer)

        if answer_str == "A":
            return [variable1, variable2, response]
        elif answer_str == "B":
            return [variable2, variable1, response]
        elif answer_str == "C":
            return [None, None, response]
        else:
            assert False, "Invalid answer from LLM: " + answer_str
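
For quick orientation, a minimal usage sketch of the class added above (hedged: it assumes an OpenAI key is configured and that the CauseNet download succeeds; the variable pair is illustrative):

```python
from pywhyllm.suggesters.augmented_model_suggester import AugmentedModelSuggester

# Build the suggester; this downloads and indexes CauseNet on construction
modeler = AugmentedModelSuggester('gpt-4')

# Returns [cause, effect, full LLM response], or [None, None, response] for answer C
cause, effect, reasoning = modeler.suggest_pairwise_relationship("smoking", "lung cancer")
print(f"{cause} -> {effect}")
```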
6 changes: 3 additions & 3 deletions pywhyllm/suggesters/simple_model_suggester.py
@@ -55,11 +55,11 @@ def suggest_pairwise_relationship(self, variable1: str, variable2: str):
        answer = [ans.strip() for ans in answer]
        answer_str = "".join(answer)

        if (answer_str == "A"):
        if answer_str == "A":
            return [variable1, variable2, description]
        elif (answer_str == "B"):
        elif answer_str == "B":
            return [variable2, variable1, description]
        elif (answer_str == "C"):
        elif answer_str == "C":
            return [None, None, description]  # maybe we want to save the description in this case too
        else:
            assert False, "Invalid answer from LLM: " + answer_str
Empty file added pywhyllm/utils/__init__.py
143 changes: 143 additions & 0 deletions pywhyllm/utils/augmented_model_suggester_utils.py
@@ -0,0 +1,143 @@
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util


def find_top_match_in_causenet(causenet_dict, variable1, variable2, threshold=0.7):
    # Build "cause-effect" strings for every relation in the CauseNet dictionary
    pair_strings = [
        f"{causenet_dict[key]['causal_relation']['cause']}-{causenet_dict[key]['causal_relation']['effect']}"
        for key in causenet_dict]

    # Tokenize for BM25
    tokenized_pairs = [text.split() for text in pair_strings]
    bm25 = BM25Okapi(tokenized_pairs)

    # Original and reverse queries
    query = variable1 + "-" + variable2
    reverse_query = variable2 + "-" + variable1
    tokenized_query = query.split()
    tokenized_reverse_query = reverse_query.split()

    # Combine tokens from both queries (remove duplicates)
    combined_query = list(set(tokenized_query + tokenized_reverse_query))

    # Get top-k candidates using BM25 with combined query
    k = 5
    scores = bm25.get_scores(combined_query)
    top_k_indices = np.argsort(scores)[::-1][:k]
    candidate_pairs = [pair_strings[i] for i in top_k_indices]

    # Apply SBERT to candidates
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query, convert_to_tensor=True)
    reverse_query_embedding = model.encode(reverse_query, convert_to_tensor=True)
    candidate_embeddings = model.encode(candidate_pairs, convert_to_tensor=True)

    # Compute similarities for both original and reverse queries
    similarities = util.cos_sim(query_embedding, candidate_embeddings).flatten()
    reverse_similarities = util.cos_sim(reverse_query_embedding, candidate_embeddings).flatten()

    # Take the maximum similarity for each candidate (original or reverse)
    max_similarities = np.maximum(similarities, reverse_similarities)

    # Get the top match and its similarity score
    top_idx = np.argmax(max_similarities)
    top_similarity = max_similarities[top_idx]
    top_pair = candidate_pairs[top_idx]

    # Check if the top similarity meets the threshold
    if top_similarity >= threshold:
        print(f"Best match: {top_pair} (Similarity: {top_similarity:.4f})")
        return causenet_dict[top_pair]
    else:
        print(f"No match found with similarity above {threshold} (Best similarity: {top_similarity:.4f})")
        return None
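
To make the two-stage matching above concrete, here is a small self-contained sketch of the same BM25-then-SBERT rerank pattern on a toy corpus (the pair strings and query are invented for illustration):

```python
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util

# Toy "cause-effect" strings standing in for the CauseNet pairs
corpus = ["smoking-lung cancer", "rain-flooding", "exercise-heart disease"]
bm25 = BM25Okapi([pair.split() for pair in corpus])

query = "smoking-lung cancer"
# Stage 1: cheap lexical recall with BM25
top_k = np.argsort(bm25.get_scores(query.split()))[::-1][:2]
candidates = [corpus[i] for i in top_k]

# Stage 2: semantic rerank of the surviving candidates with SBERT
model = SentenceTransformer("all-MiniLM-L6-v2")
sims = util.cos_sim(model.encode(query, convert_to_tensor=True),
                    model.encode(candidates, convert_to_tensor=True)).flatten()
print(candidates[int(sims.argmax())])  # expected: "smoking-lung cancer"
```

BM25 keeps the scan over the full pair list cheap, while the SBERT rerank catches paraphrases a purely lexical match would miss.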


def get_source_text(causenet_query_result):
    source_text = ""
    if causenet_query_result:
        for item in causenet_query_result["sources"]:
            if item["type"] == 'wikipedia_sentence' or item["type"] == 'clueweb12_sentence':
                source_text += item["payload"]["sentence"] + " "

    return source_text


def split_data_and_create_vectorstore_retriever(source_text):
    document = Document(page_content=source_text)

    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,  # Adjust chunk size as needed
        chunk_overlap=20  # Overlap for context
    )
    # Split the documents
    splits = text_splitter.split_documents([document])

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create a vector store from the document splits
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory="./chroma_db"  # Optional: Save to disk for reuse
    )

    # Create a retriever from the vector store
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}  # Retrieve top 5 relevant chunks
    )

    return retriever
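
A quick way to sanity-check the retriever in isolation (a sketch: the source sentence and query are made up, and `retriever.invoke` is the standard LangChain retriever entry point):

```python
# Hypothetical standalone check of the retriever
retriever = split_data_and_create_vectorstore_retriever(
    "Smoking causes lung cancer. Tar from cigarette smoke damages lung tissue.")
docs = retriever.invoke("What causes lung cancer?")
print([doc.page_content for doc in docs])
```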


def query_llm(variable1, variable2, source_text=None, retriever=None):
    # Initialize the language model
    llm = ChatOpenAI(model="gpt-4")

    if source_text:
        system_prompt = """You are a helpful assistant for causal reasoning.

Context: {context}
"""
    else:
        system_prompt = """You are a helpful assistant for causal reasoning.
"""

    # Prompt template
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{input}")
    ])

    query = f"""Which cause-and-effect relationship is more likely? Provide reasoning and you must give your final answer (A, B, or C) in <answer> </answer> tags with the letter only.
A. {variable1} causes {variable2} B. {variable2} causes {variable1} C. neither {variable1} nor {variable2} causes the other."""

    # Build the chain: retrieval-augmented if source text is available, plain LLM otherwise
    if source_text:
        # Create a document chain to combine retrieved documents
        question_answer_chain = create_stuff_documents_chain(llm, prompt)

        # Create the RAG chain
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        response = rag_chain.invoke({"input": query})
        return response['answer']
    else:
        default_chain = prompt | llm
        response = default_chain.invoke({"input": query})
        return response.content
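
Putting the utilities together, a minimal end-to-end sketch of the RAG path (assumes OPENAI_API_KEY is set; the source sentence is invented):

```python
# Hypothetical end-to-end run of the RAG path
source = "Decades of epidemiological studies link smoking to lung cancer."
retriever = split_data_and_create_vectorstore_retriever(source)
response = query_llm("smoking", "lung cancer", source_text=source, retriever=retriever)
print(response)  # expected to end with the verdict in tags, e.g. <answer>A</answer>
```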