Merged
Commits (30)
a82e872
Created new augmented_model_suggester and corresponding utils.
grace-sng7 May 19, 2025
63e6d82
Updated dependencies according to augmented_model_suggester and corre…
grace-sng7 May 19, 2025
b81e676
Updated LLM query prompt.
grace-sng7 May 19, 2025
2af6350
Minor fixes after testing AugmentedModelSuggester.
grace-sng7 May 20, 2025
e349bf5
Edited CauseNet search function.
grace-sng7 May 25, 2025
633d76d
Updated README.md to include augmented_model_suggester
grace-sng7 May 25, 2025
a474e5d
Update README.md
grace-sng7 May 27, 2025
d31fc79
Merge pull request #53 from grace-sng7/creating_augmented_suggester
grace-sng7 May 27, 2025
6b71deb
Update README.md
grace-sng7 May 27, 2025
d763517
Added augmented model suggester examples notebook
grace-sng7 May 27, 2025
e2f57e4
Merge pull request #55 from grace-sng7/creating_augmented_suggester
grace-sng7 May 27, 2025
072adc4
Uploaded augmented model suggester examples notebook again.
grace-sng7 May 27, 2025
8d82bd9
Merge branch 'py-why:creating_augmented_suggester' into creating_augm…
grace-sng7 May 27, 2025
fd17322
Merge pull request #56 from grace-sng7/creating_augmented_suggester
grace-sng7 May 27, 2025
05d9aa9
Set to ignore notebook testing for augmented model suggester examples
grace-sng7 May 27, 2025
0d1c2b5
Merge pull request #57 from grace-sng7/augmented_suggester
grace-sng7 May 27, 2025
00b61a2
Updated augmented_model_suggester_examples notebooks, docstrings, and…
grace-sng7 Jun 9, 2025
83a968f
Merge pull request #58 from grace-sng7/augmented_suggester
grace-sng7 Jun 9, 2025
2c6c7c2
Updated citations
grace-sng7 Jun 11, 2025
bfde305
Merge pull request #59 from grace-sng7/augmented_suggester
grace-sng7 Jun 11, 2025
9148c2a
Edited augmented model suggester llm_query method
grace-sng7 Jun 27, 2025
e868bb2
Merge pull request #60 from grace-sng7/augmented_suggester
grace-sng7 Jun 27, 2025
1e19398
Updated ignore_notebooks in tests
grace-sng7 Jun 27, 2025
ad04e9b
Merge pull request #61 from grace-sng7/augmented_suggester
grace-sng7 Jun 27, 2025
a90e0e4
Added onxruntime dependency
grace-sng7 Jun 28, 2025
59c6012
Merge pull request #62 from grace-sng7/augmented_suggester
grace-sng7 Jun 28, 2025
a68930b
onnxruntime dependency
grace-sng7 Jun 28, 2025
0f769a0
Merge pull request #63 from grace-sng7/augmented_suggester
grace-sng7 Jun 28, 2025
2e1ca67
Removed onnxruntime-silicon
grace-sng7 Jun 28, 2025
de4c85f
Merge pull request #64 from grace-sng7/augmented_suggester
grace-sng7 Jun 28, 2025
14 changes: 13 additions & 1 deletion README.md
@@ -28,6 +28,7 @@ PyWhy-LLM seamlessly integrates into your existing causal inference process. Imp
from pywhyllm.suggesters.model_suggester import ModelSuggester
from pywhyllm.suggesters.identification_suggester import IdentificationSuggester
from pywhyllm.suggesters.validation_suggester import ValidationSuggester
from pywhyllm.suggesters.augmented_model_suggester import AugmentedModelSuggester
from pywhyllm import RelationshipStrategy

```
@@ -49,11 +50,22 @@ domain_expertises = modeler.suggest_domain_expertises(all_factors)
# Suggest a set of potential confounders
suggested_confounders = modeler.suggest_confounders(treatment, outcome, all_factors, domain_expertises)

# Suggest pair-wise relationship between variables
# Suggest pair-wise relationships between variables
suggested_dag = modeler.suggest_relationships(treatment, outcome, all_factors, domain_expertises, RelationshipStrategy.Pairwise)
```

### Retrieval-Augmented Generation (RAG)-based Modeler

```python
# Create instance of Modeler
modeler = AugmentedModelSuggester('gpt-4')

treatment = "smoking"
outcome = "lung cancer"

# Suggest the pairwise relationship between two given variables, using CauseNet to retrieval-augment the LLM
suggested_relationship = modeler.suggest_pairwise_relationship(treatment, outcome)
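
# The call returns [cause, effect, reasoning] ([None, None, reasoning] when neither
# variable causes the other); a sketch of unpacking the result
cause, effect, reasoning = suggested_relationship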
```

### Identifier

136 changes: 136 additions & 0 deletions docs/notebooks/augmented_model_suggester_examples.ipynb
@@ -0,0 +1,136 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"pip install dotenv"
],
"metadata": {
"id": "cmZerbMu6Uk4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "EulKv3Km4nMa"
},
"outputs": [],
"source": [
"from dotenv import load_dotenv\n",
"import os\n",
"\n",
"load_dotenv()\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = '' # specify your key here"
]
},
{
"cell_type": "code",
"source": [
"pip install pywhyllm"
],
"metadata": {
"collapsed": true,
"id": "83sxVcP97xlH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from pywhyllm.suggesters.augmented_model_suggester import AugmentedModelSuggester\n",
"\n",
"model = AugmentedModelSuggester('gpt-4')"
],
"metadata": {
"id": "VdfEKuDLEYcU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = model.suggest_pairwise_relationship(\"smoking\", \"lung cancer\")"
],
"metadata": {
"id": "D85ec6Pk5JzA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result"
],
"metadata": {
"id": "W3bFehXh5SQl"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = model.suggest_pairwise_relationship(\"income\", \"exercise level\")"
],
"metadata": {
"id": "odFkp921hQsX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result"
],
"metadata": {
"id": "ZIeStj9OwIPe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result = model.suggest_pairwise_relationship(\"flooding\", \"rain\")"
],
"metadata": {
"id": "Fm5XCFrRwKsV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"result"
],
"metadata": {
"id": "HDo098ICwzi7"
},
"execution_count": null,
"outputs": []
}
]
}
7,725 changes: 5,846 additions & 1,879 deletions poetry.lock

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion pyproject.toml
@@ -55,6 +55,14 @@ networkx = "<=3.2.1"
guidance = ">=0.2"
openai = ">=1.70"
pydantic = ">=2.11"
langchain = ">=0.3.25"
langchain-chroma = ">=0.2.4"
langchain-community = ">=0.3.24"
langchain-core = ">=0.3.60"
langchain-huggingface = ">=0.2.0"
langchain-openai = ">=0.3.17"
rank-bm25 = ">=0.2.2"
sentence-transformers = ">=4.1.0"

[tool.poetry.group.dev.dependencies]
poethepoet = "^0.33.0"
@@ -110,7 +118,7 @@ _isort_check = 'isort --check .'

# testing tasks
test = "pytest -v -m 'not advanced' --durations=0 --durations-min=60.0"
test_no_notebooks= "pytest -v -m 'not advanced and not notebook' --durations=0 --durations-min=60.0"
test_no_notebooks = "pytest -v -m 'not advanced and not notebook' --durations=0 --durations-min=60.0"
test_durations = "poetry run poe test --store-durations"
test_advanced = "pytest -v"
test_focused = "pytest -v -m 'focused'"
45 changes: 45 additions & 0 deletions pywhyllm/suggesters/augmented_model_suggester.py
@@ -0,0 +1,45 @@
import logging
import re

from .simple_model_suggester import SimpleModelSuggester
from pywhyllm.utils.data_loader import *
from pywhyllm.utils.augmented_model_suggester_utils import *


class AugmentedModelSuggester(SimpleModelSuggester):
    def __init__(self, llm, file_path: str = 'data/causenet-precision.jsonl.bz2'):
        super().__init__(llm)
        self.file_path = file_path
        self.causenet_dict = None  # populated only if the download succeeds

        logging.basicConfig(level=logging.INFO)
        url = "https://groups.uni-paderborn.de/wdqa/causenet/causality-graphs/causenet-precision.jsonl.bz2"
        success = download_causenet(url, file_path)

        if success:
            print(f"File downloaded to {file_path}")
            json_data = load_causenet_json(file_path)
            self.causenet_dict = create_causenet_dict(json_data)
        else:
            print("Download failed; falling back to plain LLM queries without CauseNet context")

    def suggest_pairwise_relationship(self, variable1: str, variable2: str):
        # Look up a matching cause-effect pair in CauseNet; fall back to a plain LLM query
        result = find_top_match_in_causenet(self.causenet_dict, variable1, variable2) if self.causenet_dict else None
        if result:
            source_text = get_source_text(result)
            retriever = split_data_and_create_vectorstore_retriever(source_text)
            response = query_llm(variable1, variable2, source_text, retriever)
        else:
            response = query_llm(variable1, variable2)

        # Extract the verdict letter from the <answer> tags in the response
        answer = re.findall(r'<answer>(.*?)</answer>', response)
        answer = [ans.strip() for ans in answer]
        answer_str = "".join(answer)

        if answer_str == "A":
            return [variable1, variable2, response]
        elif answer_str == "B":
            return [variable2, variable1, response]
        elif answer_str == "C":
            return [None, None, response]
        else:
            assert False, "Invalid answer from LLM: " + answer_str
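
For quick orientation, a minimal usage sketch of the class added above (hedged: it assumes an OpenAI key is configured and that the CauseNet download succeeds; the variable pair is illustrative):

```python
from pywhyllm.suggesters.augmented_model_suggester import AugmentedModelSuggester

# Build the suggester; this downloads and indexes CauseNet on construction
modeler = AugmentedModelSuggester('gpt-4')

# Returns [cause, effect, full LLM response], or [None, None, response] for answer C
cause, effect, reasoning = modeler.suggest_pairwise_relationship("smoking", "lung cancer")
print(f"{cause} -> {effect}")
```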
6 changes: 3 additions & 3 deletions pywhyllm/suggesters/simple_model_suggester.py
@@ -55,11 +55,11 @@ def suggest_pairwise_relationship(self, variable1: str, variable2: str):
        answer = [ans.strip() for ans in answer]
        answer_str = "".join(answer)

        if (answer_str == "A"):
        if answer_str == "A":
            return [variable1, variable2, description]
        elif (answer_str == "B"):
        elif answer_str == "B":
            return [variable2, variable1, description]
        elif (answer_str == "C"):
        elif answer_str == "C":
            return [None, None, description]  # maybe we want to save the description in this case too
        else:
            assert False, "Invalid answer from LLM: " + answer_str
Empty file added pywhyllm/utils/__init__.py
143 changes: 143 additions & 0 deletions pywhyllm/utils/augmented_model_suggester_utils.py
@@ -0,0 +1,143 @@
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util


def find_top_match_in_causenet(causenet_dict, variable1, variable2, threshold=0.7):
    # Build "cause-effect" strings for every relation in the CauseNet dictionary
    pair_strings = [
        f"{causenet_dict[key]['causal_relation']['cause']}-{causenet_dict[key]['causal_relation']['effect']}"
        for key in causenet_dict]

    # Tokenize for BM25
    tokenized_pairs = [text.split() for text in pair_strings]
    bm25 = BM25Okapi(tokenized_pairs)

    # Original and reverse queries
    query = variable1 + "-" + variable2
    reverse_query = variable2 + "-" + variable1
    tokenized_query = query.split()
    tokenized_reverse_query = reverse_query.split()

    # Combine tokens from both queries (remove duplicates)
    combined_query = list(set(tokenized_query + tokenized_reverse_query))

    # Get top-k candidates using BM25 with combined query
    k = 5
    scores = bm25.get_scores(combined_query)
    top_k_indices = np.argsort(scores)[::-1][:k]
    candidate_pairs = [pair_strings[i] for i in top_k_indices]

    # Apply SBERT to candidates
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query, convert_to_tensor=True)
    reverse_query_embedding = model.encode(reverse_query, convert_to_tensor=True)
    candidate_embeddings = model.encode(candidate_pairs, convert_to_tensor=True)

    # Compute similarities for both original and reverse queries
    similarities = util.cos_sim(query_embedding, candidate_embeddings).flatten()
    reverse_similarities = util.cos_sim(reverse_query_embedding, candidate_embeddings).flatten()

    # Take the maximum similarity for each candidate (original or reverse)
    max_similarities = np.maximum(similarities, reverse_similarities)

    # Get the top match and its similarity score
    top_idx = np.argmax(max_similarities)
    top_similarity = max_similarities[top_idx]
    top_pair = candidate_pairs[top_idx]

    # Check if the top similarity meets the threshold
    if top_similarity >= threshold:
        print(f"Best match: {top_pair} (Similarity: {top_similarity:.4f})")
        return causenet_dict[top_pair]
    else:
        print(f"No match found with similarity above {threshold} (Best similarity: {top_similarity:.4f})")
        return None
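
To make the two-stage matching above concrete, here is a small self-contained sketch of the same BM25-then-SBERT rerank pattern on a toy corpus (the pair strings and query are invented for illustration):

```python
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util

# Toy "cause-effect" strings standing in for the CauseNet pairs
corpus = ["smoking-lung cancer", "rain-flooding", "exercise-heart disease"]
bm25 = BM25Okapi([pair.split() for pair in corpus])

query = "smoking-lung cancer"
# Stage 1: cheap lexical recall with BM25
top_k = np.argsort(bm25.get_scores(query.split()))[::-1][:2]
candidates = [corpus[i] for i in top_k]

# Stage 2: semantic rerank of the surviving candidates with SBERT
model = SentenceTransformer("all-MiniLM-L6-v2")
sims = util.cos_sim(model.encode(query, convert_to_tensor=True),
                    model.encode(candidates, convert_to_tensor=True)).flatten()
print(candidates[int(sims.argmax())])  # expected: "smoking-lung cancer"
```

BM25 keeps the scan over the full pair list cheap, while the SBERT rerank catches paraphrases a purely lexical match would miss.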


def get_source_text(causenet_query_result):
    source_text = ""
    if causenet_query_result:
        for item in causenet_query_result["sources"]:
            if item["type"] == 'wikipedia_sentence' or item["type"] == 'clueweb12_sentence':
                source_text += item["payload"]["sentence"] + " "

    return source_text


def split_data_and_create_vectorstore_retriever(source_text):
    document = Document(page_content=source_text)

    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,  # Adjust chunk size as needed
        chunk_overlap=20  # Overlap for context
    )
    # Split the documents
    splits = text_splitter.split_documents([document])

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create a vector store from the document splits
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory="./chroma_db"  # Optional: Save to disk for reuse
    )

    # Create a retriever from the vector store
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5}  # Retrieve top 5 relevant chunks
    )

    return retriever
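
A quick way to sanity-check the retriever in isolation (a sketch: the source sentence and query are made up, and `retriever.invoke` is the standard LangChain retriever entry point):

```python
# Hypothetical standalone check of the retriever
retriever = split_data_and_create_vectorstore_retriever(
    "Smoking causes lung cancer. Tar from cigarette smoke damages lung tissue.")
docs = retriever.invoke("What causes lung cancer?")
print([doc.page_content for doc in docs])
```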


def query_llm(variable1, variable2, source_text=None, retriever=None):
    # Initialize the language model
    llm = ChatOpenAI(model="gpt-4")

    if source_text:
        system_prompt = """You are a helpful assistant for causal reasoning.

Context: {context}
"""
    else:
        system_prompt = """You are a helpful assistant for causal reasoning.
"""

    # Prompt template
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{input}")
    ])

    query = f"""Which cause-and-effect relationship is more likely? Provide reasoning and you must give your final answer (A, B, or C) in <answer> </answer> tags with the letter only.
A. {variable1} causes {variable2} B. {variable2} causes {variable1} C. neither {variable1} nor {variable2} causes the other."""

    # Build the chain: retrieval-augmented if source text is available, plain LLM otherwise
    if source_text:
        # Create a document chain to combine retrieved documents
        question_answer_chain = create_stuff_documents_chain(llm, prompt)

        # Create the RAG chain
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        response = rag_chain.invoke({"input": query})
        return response['answer']
    else:
        default_chain = prompt | llm
        response = default_chain.invoke({"input": query})
        return response.content
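
Putting the utilities together, a minimal end-to-end sketch of the RAG path (assumes OPENAI_API_KEY is set; the source sentence is invented):

```python
# Hypothetical end-to-end run of the RAG path
source = "Decades of epidemiological studies link smoking to lung cancer."
retriever = split_data_and_create_vectorstore_retriever(source)
response = query_llm("smoking", "lung cancer", source_text=source, retriever=retriever)
print(response)  # expected to end with the verdict in tags, e.g. <answer>A</answer>
```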