Merged

Changes from all commits
48 commits
39ef5e8
tests
Mar 28, 2025
5a375b3
adding pipeline data type
Apr 2, 2025
f3f26c7
datatype added to evals
Apr 2, 2025
4b420f3
global visited_list
Apr 2, 2025
ce026cd
pipeline cli
Apr 2, 2025
446c15f
adding model_storage
Apr 4, 2025
dc448a9
PosixPath
Apr 4, 2025
6507c8d
lint
Apr 4, 2025
8e0e77c
refactoring var names
Apr 7, 2025
bc9b125
names refactoring
Apr 7, 2025
8296b73
dev merged
Apr 7, 2025
077131b
merge dev
Apr 7, 2025
dc9202c
formatting
Apr 7, 2025
3c904da
var names refactoring, docstrings
Apr 9, 2025
e6b7dc6
cli tests
Apr 10, 2025
d0bd849
windsurf refactoring
Apr 11, 2025
0cc59de
dev merged
Apr 11, 2025
6c20651
nb cleaned
Apr 11, 2025
7251122
nb cleaned
Apr 11, 2025
4277528
cli args tests
Apr 14, 2025
57d0524
Feat/report exporter (#45)
NotBioWaste905 Apr 14, 2025
8d35ca4
three_stages refactoring
Apr 14, 2025
af3ca0f
llm keys tests off
Apr 15, 2025
91c8007
checking model parameters
Apr 15, 2025
e6b31f8
logger added
Apr 16, 2025
269cc10
logger added
Apr 16, 2025
29dc3ea
reports deleted
Apr 16, 2025
49df8e5
model_name
Apr 16, 2025
00dcab5
dev merged
Apr 16, 2025
830f078
tests moved
Apr 16, 2025
61c1d6c
poetry lock
Apr 16, 2025
56ab2c5
tests run
Apr 16, 2025
af5d402
tests run
Apr 16, 2025
1998811
dev merged
Apr 16, 2025
5c2cf42
dev merged
Apr 16, 2025
6730cc5
getModelInstance
Apr 17, 2025
ebb6244
closure in class for model_type
Apr 17, 2025
127d428
ms save fixed
Apr 18, 2025
2cb0260
ms save fixed
Apr 18, 2025
d100bf9
ms save fixed
Apr 18, 2025
d0d3245
light format
Apr 18, 2025
8d7b827
get_dialogue_doublets removed
Apr 20, 2025
8bd89ad
dev merged
Apr 20, 2025
0d89aa6
overwright detailed in ms.add
Apr 20, 2025
f8fbf38
overwright warninings changed
Apr 21, 2025
c9895c3
Merge remote-tracking branch 'origin/dev' into feat/model_validation
NotBioWaste905 Apr 24, 2025
75536de
format, ad missing os import
NotBioWaste905 Apr 24, 2025
947db1a
update userguides
NotBioWaste905 Apr 24, 2025
6 changes: 3 additions & 3 deletions dialogue2graph/datasets/complex_dialogues/generation.py
@@ -5,7 +5,6 @@
 The module provides graph generator capable of creating complex validated graphs.
 """
 
-import logging
 import os
 from enum import Enum
 from typing import Optional, Dict, Any, Union
@@ -37,8 +36,9 @@
 )
 
 # Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+from dialogue2graph.utils.logger import Logger
+
+logger = Logger(__file__)
 
 
 class ErrorType(str, Enum):
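The hunk above (and several like it below) replaces per-module `logging` setup with a shared `dialogue2graph.utils.logger.Logger`. The diff never shows `utils/logger.py` itself, so the following is only a minimal sketch consistent with the visible call sites (`Logger(__file__)` yielding an object with `.debug(...)`); the factory-function shape and the file-stem naming are assumptions, not the PR's actual implementation.

import logging
from pathlib import Path


def Logger(file: str) -> logging.Logger:
    # Hypothetical reconstruction: configure a basic handler once and
    # name the logger after the calling module's file stem.
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger(Path(file).stem)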
6 changes: 3 additions & 3 deletions dialogue2graph/metrics/llm_metrics/metrics.py
@@ -5,7 +5,6 @@
 The module contains functions that checks Graphs and Dialogues for various metrics using LLM calls.
 """
 
-import logging
 import json
 from typing import List, TypedDict, Union
 from pydantic import BaseModel, Field
@@ -22,8 +21,9 @@
 from langchain.chat_models import ChatOpenAI
 from langchain.schema import HumanMessage
 
-# Set up logging
-logging.basicConfig(level=logging.INFO)
+from dialogue2graph.utils.logger import Logger
+
+logger = Logger(__file__)
 
 
 class InvalidTransition(TypedDict):
18 changes: 13 additions & 5 deletions dialogue2graph/metrics/llm_validators/validators.py
@@ -13,7 +13,9 @@
 from dialogue2graph.pipelines.model_storage import ModelStorage
 from dialogue2graph.metrics.similarity import compare_strings
 
-from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_openai import ChatOpenAI
+from langchain_core.language_models import BaseChatModel
+from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain.output_parsers import PydanticOutputParser
 
@@ -125,7 +127,10 @@ def is_greeting_repeated_emb_llm(
     starts = START_TURNS
 
     if model_storage.storage.get(embedder_name):
-        if not model_storage.storage.get(embedder_name).model_type == "emb":
+        if (
+            not model_storage.storage.get(embedder_name).model_type
+            == HuggingFaceEmbeddings
+        ):
             raise TypeError(f"The {embedder_name} model is not an embedder")
         embedder_model = model_storage.storage[embedder_name].model
     else:
@@ -134,7 +139,7 @@
         )
 
     if model_storage.storage.get(llm_name):
-        if not model_storage.storage.get(llm_name).model_type == "llm":
+        if not model_storage.storage.get(llm_name).model_type == ChatOpenAI:
             raise TypeError(f"The {llm_name} model is not an LLM")
         llm_model = model_storage.storage[llm_name].model
     else:
@@ -183,7 +188,10 @@ def is_dialog_closed_too_early_emb_llm(
     ends = END_TURNS
 
     if model_storage.storage.get(embedder_name):
-        if not model_storage.storage.get(embedder_name).model_type == "emb":
+        if (
+            not model_storage.storage.get(embedder_name).model_type
+            == HuggingFaceEmbeddings
+        ):
             raise TypeError(f"The {embedder_name} model is not an embedder")
         embedder_model = model_storage.storage[embedder_name].model
     else:
@@ -192,7 +200,7 @@
         )
 
     if model_storage.storage.get(llm_name):
-        if not model_storage.storage.get(llm_name).model_type == "llm":
+        if not model_storage.storage.get(llm_name).model_type == ChatOpenAI:
             raise TypeError(f"The {llm_name} model is not an LLM")
         llm_model = model_storage.storage[llm_name].model
     else:
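These validators now compare a stored model's `model_type` against the langchain classes (`HuggingFaceEmbeddings`, `ChatOpenAI`) instead of the old "emb"/"llm" string tags. The repeated lookup-and-check pattern could be expressed once as below; this is only a sketch, and everything about `ModelStorage` beyond the `storage[key].model` and `.model_type` attributes visible above is an assumption.

def require_model(model_storage, key: str, expected_type: type, label: str):
    # Hypothetical helper mirroring the checks above: fetch a storage
    # entry and fail loudly when its registered type does not match.
    entry = model_storage.storage.get(key)
    if entry is None:
        raise KeyError(f"No model registered under {key!r}")
    if entry.model_type is not expected_type:
        raise TypeError(f"The {key} model is not a {label}")
    return entry.model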
3 changes: 3 additions & 0 deletions dialogue2graph/metrics/no_llm_metrics/metrics.py
@@ -13,6 +13,9 @@
 
 from dialogue2graph.pipelines.core.graph import BaseGraph
 from dialogue2graph.pipelines.core.dialogue import Dialogue
+from dialogue2graph.utils.logger import Logger
+
+logger = Logger(__file__)
 
 
 logging.basicConfig(level=logging.INFO)
42 changes: 19 additions & 23 deletions dialogue2graph/pipelines/core/dialogue_sampling.py
@@ -6,7 +6,6 @@
 """
 
 import itertools
-import logging
 from typing import Literal
 import pandas as pd
 from dialogue2graph.pipelines.core.graph import BaseGraph
@@ -19,8 +18,9 @@
 from dialogue2graph.pipelines.helpers.find_cycle_ends import find_cycle_ends
 from langchain_core.language_models.chat_models import BaseChatModel
 
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+from dialogue2graph.utils.logger import Logger
+
+logger = Logger(__file__)
 
 
 class _DialogPathsCounter:
@@ -189,23 +189,21 @@ def remove_duplicated_paths(node_paths: list[list[int]]) -> list[list[int]]:
     return res
 
 
-def get_dialogue_doublets(seq: list[list[dict]]) -> set[tuple[str]]:
-    """Find all dialogue doublets with (edge, target) utterances
-
-    Args:
-        seq: sequence of dialogs
-
-    Returns:
-        Set of (user_utterance, assistant_utterance)
-    """
-    doublets = set()
-    for dialogue in seq:
-        user_texts = [d["text"] for d in dialogue if d["participant"] == "user"]
-        assist_texts = [d["text"] for d in dialogue if d["participant"] == "assistant"]
-        if len(assist_texts) > len(user_texts):
-            user_texts += [""]
-        doublets.update(zip(user_texts, assist_texts))
-    return doublets
+# def get_dialogue_doublets(seq: list[list[dict]]) -> set[tuple[str]]:
+#     """Find all dialogue doublets with (edge, target) utterances
+#     Args:
+#         seq: sequence of dialogs
+#     Returns:
+#         Set of (user_utterance, assistant_utterance)
+#     """
+#     doublets = set()
+#     for dialogue in seq:
+#         user_texts = [d["text"] for d in dialogue if d["participant"] == "user"]
+#         assist_texts = [d["text"] for d in dialogue if d["participant"] == "assistant"]
+#         if len(assist_texts) > len(user_texts):
+#             user_texts += [""]
+#         doublets.update(zip(user_texts, assist_texts))
+#     return doublets
 
 
 def get_dialogue_triplets(seq: list[list[dict]]) -> set[tuple[str]]:
@@ -239,9 +237,7 @@ def remove_duplicated_dialogues(seq: list[list[dict]]) -> list[list[dict]]:
         return []
     uniq_seq = [non_empty_seq[0]]
    for s in non_empty_seq[1:]:
-        if not get_dialogue_doublets([s]).issubset(
-            get_dialogue_doublets(uniq_seq)
-        ) or not get_dialogue_triplets([s]).issubset(get_dialogue_triplets(uniq_seq)):
+        if not get_dialogue_triplets([s]).issubset(get_dialogue_triplets(uniq_seq)):
             uniq_seq.append(s)
     return uniq_seq
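With the doublets helper commented out, `remove_duplicated_dialogues` now keeps a dialogue only when it contributes new utterance triplets. The body of `get_dialogue_triplets` is elided from this diff, so the following is only a guess at its shape, extrapolated from the doublets helper above; the exact windowing is an assumption.

# Hypothetical sketch; the real get_dialogue_triplets is not shown in this
# diff. Extrapolating from get_dialogue_doublets, it plausibly collects
# consecutive three-turn windows so that dialogues sharing every such
# window are treated as duplicates.
def get_dialogue_triplets_sketch(seq: list[list[dict]]) -> set[tuple[str, str, str]]:
    triplets = set()
    for dialogue in seq:
        texts = [d["text"] for d in dialogue]
        for a, b, c in zip(texts, texts[1:], texts[2:]):
            triplets.add((a, b, c))
    return triplets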
9 changes: 5 additions & 4 deletions dialogue2graph/pipelines/core/graph.py
@@ -10,9 +10,10 @@
 from typing import Optional, Any
 import matplotlib.pyplot as plt
 import abc
-import logging
 
-logger = logging.getLogger(__name__)
+from dialogue2graph.utils.logger import Logger
+
+logger = Logger(__file__)
 
 
 class BaseGraph(BaseModel, abc.ABC):
@@ -140,13 +141,13 @@ def load_graph(self):
         """
         self.graph = nx.DiGraph()
         nodes = sorted([v["id"] for v in self.graph_dict["nodes"]])
-        logging.debug(f"Nodes: {nodes}")
+        logger.debug(f"Nodes: {nodes}")
 
         self.node_mapping = {}
         renumber_flg = nodes != list(range(1, len(nodes) + 1))
         if renumber_flg:
             self.node_mapping = {node_id: idx + 1 for idx, node_id in enumerate(nodes)}
-        logging.debug(f"Renumber flag: {renumber_flg}")
+        logger.debug(f"Renumber flag: {renumber_flg}")
 
         for node in self.graph_dict["nodes"]:
             cur_node_id = node["id"]
34 changes: 34 additions & 0 deletions dialogue2graph/pipelines/d2g_extender/pipeline.py
@@ -7,9 +7,12 @@
 
 from typing import Callable
 from dotenv import load_dotenv
+
 from dialogue2graph.pipelines.core.pipeline import BasePipeline
 from dialogue2graph.pipelines.model_storage import ModelStorage
 from dialogue2graph.pipelines.d2g_extender.three_stages_extender import LLMGraphExtender
+from langchain_openai import ChatOpenAI
+from langchain_huggingface import HuggingFaceEmbeddings
 
 load_dotenv()
 
@@ -32,6 +35,37 @@ def __init__(
         end_evals: list[Callable] = None,
         step: int = 2,
     ):
+        # if the model is not in the model storage, put the default model there
+        model_storage.add(
+            key=extending_llm,
+            config={"model_name": "chatgpt-4o-latest", "temperature": 0},
+            model_type=ChatOpenAI,
+        )
+
+        model_storage.add(
+            key=filling_llm,
+            config={"model_name": "o3-mini", "temperature": 1},
+            model_type=ChatOpenAI,
+        )
+
+        model_storage.add(
+            key=formatting_llm,
+            config={"model_name": "gpt-4o-mini", "temperature": 0},
+            model_type=ChatOpenAI,
+        )
+
+        model_storage.add(
+            key=dialog_llm,
+            config={"model_name": "o3-mini", "temperature": 1},
+            model_type=ChatOpenAI,
+        )
+
+        model_storage.add(
+            key=sim_model,
+            config={"model_name": "BAAI/bge-m3", "model_kwargs": {"device": "cpu"}},
+            model_type=HuggingFaceEmbeddings,
+        )
+
         super().__init__(
             name=name,
             steps=[
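The "if the model is not in the model storage" comment implies that `ModelStorage.add` is a no-op for keys that are already registered, so user-supplied models win over these defaults. `model_storage.py` is not part of this diff; the sketch below only illustrates that assumed add-if-missing behavior, with a hypothetical `_Entry` record standing in for whatever the real storage keeps.

from dataclasses import dataclass
from typing import Any


@dataclass
class _Entry:
    # Hypothetical stand-in for the real storage record, which exposes
    # at least .model and .model_type (see the validators above).
    model: Any
    model_type: type


class ModelStorageSketch:
    def __init__(self) -> None:
        self.storage: dict[str, _Entry] = {}

    def add(self, key: str, config: dict, model_type: type) -> None:
        # Assumed add-if-missing semantics: an existing registration
        # under the same key is kept, so defaults never clobber it.
        if key in self.storage:
            return
        self.storage[key] = _Entry(model=model_type(**config), model_type=model_type)

On that reading, registering a model under a pipeline's default key before constructing the pipeline effectively overrides the default configuration shown above.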
(file name not captured in this view)
@@ -13,6 +13,7 @@
 from langchain.schema import HumanMessage
 from langchain.prompts import PromptTemplate
 
+from dialogue2graph.utils.logger import Logger
 from dialogue2graph import metrics
 from dialogue2graph.pipelines.core.dialogue_sampling import RecursiveDialogueSampler
 from dialogue2graph.pipelines.d2g_light.three_stages_light import LightGraphGenerator
@@ -44,6 +45,8 @@ class DialogueNodes(BaseModel):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logging.getLogger("langchain_core.vectorstores.base").setLevel(logging.ERROR)
+logger = Logger(__file__)
+
 dialogue_sampler = RecursiveDialogueSampler()
18 changes: 18 additions & 0 deletions dialogue2graph/pipelines/d2g_light/pipeline.py
@@ -10,6 +10,8 @@
 from dialogue2graph.pipelines.core.pipeline import BasePipeline
 from dialogue2graph.pipelines.d2g_light.three_stages_light import LightGraphGenerator
 from dialogue2graph.pipelines.model_storage import ModelStorage
+from langchain_openai import ChatOpenAI
+from langchain_huggingface import HuggingFaceEmbeddings
 
 load_dotenv()
 
@@ -27,6 +29,22 @@ def __init__(
         step2_evals: list[Callable] = None,
         end_evals: list[Callable] = None,
     ):
+        # if the model is not in the model storage, put the default model there
+        model_storage.add(
+            key=filling_llm,
+            config={"model_name": "chatgpt-4o-latest", "temperature": 0},
+            model_type=ChatOpenAI,
+        )
+        model_storage.add(
+            key=formatting_llm,
+            config={"model_name": "gpt-4o-mini", "temperature": 0},
+            model_type=ChatOpenAI,
+        )
+        model_storage.add(
+            key=sim_model,
+            config={"model_name": "BAAI/bge-m3", "model_kwargs": {"device": "cpu"}},
+            model_type=HuggingFaceEmbeddings,
+        )
         super().__init__(
             name=name,
             steps=[
1 change: 0 additions & 1 deletion dialogue2graph/pipelines/d2g_light/three_stages_light.py
@@ -69,7 +69,6 @@ def __init__(
         filling_llm: str = "three_stages_light_filling_llm:v1",
         formatting_llm: str = "three_stages_light_formatting_llm:v1",
         sim_model: str = "three_stages_light_sim_model:v1",
-
         step2_evals: list[Callable] | None = [],
         end_evals: list[Callable] | None = [],
     ):
24 changes: 24 additions & 0 deletions dialogue2graph/pipelines/d2g_llm/pipeline.py
@@ -11,6 +11,8 @@
 from dialogue2graph.pipelines.core.pipeline import BasePipeline
 from dialogue2graph.pipelines.model_storage import ModelStorage
 
+from langchain_openai import ChatOpenAI
+from langchain_huggingface import HuggingFaceEmbeddings
 from dialogue2graph.pipelines.d2g_llm.three_stages_llm import LLMGraphGenerator
 
 load_dotenv()
@@ -42,6 +44,28 @@ def __init__(
         step2_evals: list[Callable] = None,
         end_evals: list[Callable] = None,
     ):
+        # if the model is not in the model storage, put the default model there
+        model_storage.add(
+            key=grouping_llm,
+            config={"model_name": "chatgpt-4o-latest", "temperature": 0},
+            model_type=ChatOpenAI,
+        )
+        model_storage.add(
+            key=filling_llm,
+            config={"model_name": "o3-mini", "temperature": 1},
+            model_type=ChatOpenAI,
+        )
+        model_storage.add(
+            key=formatting_llm,
+            config={"model_name": "gpt-4o-mini", "temperature": 0},
+            model_type=ChatOpenAI,
+        )
+        model_storage.add(
+            key=sim_model,
+            config={"model_name": "BAAI/bge-m3", "model_kwargs": {"device": "cpu"}},
+            model_type=HuggingFaceEmbeddings,
+        )
+
         super().__init__(
             name=name,
             steps=[
8 changes: 2 additions & 6 deletions dialogue2graph/pipelines/d2g_llm/three_stages_llm.py
@@ -19,8 +19,7 @@
 from dialogue2graph.pipelines.core.graph import BaseGraph
 from dialogue2graph.pipelines.core.schemas import ReasonGraph, Node
 from dialogue2graph.pipelines.model_storage import ModelStorage
-
-
+from dialogue2graph.utils.logger import Logger
 from dialogue2graph.utils.dg_helper import connect_nodes, get_helpers
 from dialogue2graph.pipelines.helpers.parse_data import PipelineDataType
 from dialogue2graph.pipelines.helpers.prompts.missing_edges_prompt import (
@@ -33,9 +32,6 @@
     grouping_prompt_2,
 )
 
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
 
 class DialogueNodes(BaseModel):
     """Class for dialog nodes"""
@@ -45,6 +41,7 @@ class DialogueNodes(BaseModel):
 
 
 logging.getLogger("langchain_core.vectorstores.base").setLevel(logging.ERROR)
+logger = Logger(__file__)
 
 
 class LLMGraphGenerator(GraphGenerator):
@@ -97,7 +94,6 @@ def __init__(
         filling_llm: str = "three_stages_filling_llm:v1",
         formatting_llm: str = "three_stages_formatting_llm:v1",
         sim_model: str = "three_stages_sim_model:v1",
-
        step2_evals: list[Callable] | None = None,
        end_evals: list[Callable] | None = None,
    ):