Skip to content

Commit 256ac1f

Browse files
Dialogue Augmentation (#49)
* Add DialogueAugmenter for creating synonymical dialogues from original dialogue
1 parent 061695a commit 256ac1f

File tree

17 files changed

+1720200
-4
lines changed

17 files changed

+1720200
-4
lines changed

dialogue2graph/datasets/augment_dialogues/__init__.py

Whitespace-only changes.
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import logging
2+
from typing import Union
3+
from pydantic import BaseModel, Field, ValidationError
4+
from langchain.prompts import PromptTemplate
5+
from langchain_core.output_parsers import JsonOutputParser
6+
from langchain.output_parsers import OutputFixingParser
7+
8+
from dialogue2graph.pipelines.core.algorithms import DialogAugmentation
9+
from dialogue2graph.pipelines.core.dialogue import Dialogue
10+
from dialogue2graph.pipelines.model_storage import ModelStorage
11+
from dialogue2graph.metrics.no_llm_metrics.metrics import (
12+
is_correct_length, match_roles
13+
)
14+
15+
# Raise the threshold of langchain's vector-store base logger to ERROR so that
# only errors (not warnings/info) from that logger are emitted while this
# module is in use. NOTE(review): presumably added to silence noisy warnings.
logging.getLogger("langchain_core.vectorstores.base").setLevel(logging.ERROR)
16+
17+
class AugmentedTurn(BaseModel):
    # One dialogue turn as returned by the augmentation LLM: the speaker plus
    # every rephrasing generated for that turn.
    # (A `#` comment is used instead of a docstring on purpose: pydantic copies
    # class docstrings into the model's JSON schema.)

    # Speaker of the turn (the prompts ask for 'user' or 'assistant').
    participant: str
    # Alternative phrasings of the same utterance; index i of every turn
    # together forms the i-th augmented dialogue.
    text: list[str] = Field(
        ...,
        description="List of utterance variations for this turn",
    )
20+
21+
class DialogueSequence(BaseModel):
    # Whole augmentation result: the ordered list of augmented turns.
    # (A `#` comment is used instead of a docstring on purpose: pydantic copies
    # class docstrings into the model's JSON schema.)

    # Turns in conversation order, one entry per original message.
    result: list[AugmentedTurn] = Field(
        ...,
        description="Sequence of augmented turns",
    )
23+
24+
25+
class DialogueAugmenter(DialogAugmentation):
    """Augments dialogues while preserving structure and conversation flow by rephrasing original dialogue lines.

    The generation LLM is asked for several variations of every utterance; the
    i-th variation of each turn is then stitched together into the i-th
    augmented dialogue. Failures are reported as error strings rather than
    raised, so callers must check ``isinstance(result, str)``.
    """

    model_storage: ModelStorage = Field(..., description="Model storage instance")
    generation_llm: str = Field(..., description="Key for generation LLM in storage")
    formatting_llm: str = Field(..., description="Key for formatting LLM in storage")

    def invoke(
        self,
        dialogue: Dialogue,
        prompt: str,
        topic: str = "",
    ) -> Union[list[Dialogue], str]:
        """Augments dialogue while preserving conversation structure.

        Args:
            dialogue: Input Dialogue object to augment
            prompt: Required augmentation prompt template; it is rendered with
                the ``topic`` and ``dialogue`` variables
            topic: Contextual topic for augmentation (default: empty)

        Returns:
            List of augmented Dialogue objects, or an error message string
            describing which stage failed (pre-processing, LLM call,
            post-processing, or an unexpected critical error).
        """
        if not prompt:
            return "Preprocessing failed: prompt should be a valid instruction for LLM"

        # Single source of truth for the retry budget of the LLM call.
        max_attempts = 3

        try:
            message_dicts = [msg.model_dump() for msg in dialogue.messages]
            if not message_dicts:
                return "Preprocessing failed: no messages found in the dialogue"

            augmentation_prompt = PromptTemplate.from_template(prompt)
            parser = JsonOutputParser(pydantic_object=DialogueSequence)
            # The formatting LLM is only used to repair output that the JSON
            # parser could not read.
            fixed_parser = OutputFixingParser.from_llm(
                parser=parser,
                llm=self._get_llm(self.formatting_llm),
            )
            chain = augmentation_prompt | self._get_llm(self.generation_llm) | fixed_parser

            for attempt in range(1, max_attempts + 1):
                # Only the LLM call is retried; keep the try body minimal so
                # post-processing errors are not mistaken for call failures.
                try:
                    result = chain.invoke({"topic": topic, "dialogue": message_dicts})
                except ValidationError as ve:
                    logging.warning(f"Validation error attempt {attempt}: {ve}")
                    continue
                except Exception as e:
                    logging.error(f"Unexpected error: {e}")
                    if attempt == max_attempts:
                        return f"Augmentation failed: {e}"
                    continue

                # A post-processing failure is final: it is reported
                # immediately and not retried.
                try:
                    return self._create_dialogues(result)
                except Exception as e:
                    logging.error(f"Error creating dialogues: {e}")
                    return f"Post-processing failed: {e}"

            return f"Augmentation failed after {max_attempts} attempts"

        except Exception as e:
            logging.exception("Critical error in augmentation pipeline")
            return f"Critical error: {e}"

    async def ainvoke(self, *args, **kwargs):
        """Async version of invoke.

        NOTE: this delegates to the synchronous ``invoke`` without awaiting
        anything, so the event loop is blocked while the LLM calls run.
        """
        return self.invoke(*args, **kwargs)

    async def evaluate(
        self,
        dialogue: Dialogue,
        prompt: str,
        topic: str = "",
    ) -> dict:
        """Evaluates augmentation quality with dictionary report format.

        Args:
            dialogue: Original dialogue to augment and compare against
            prompt: Augmentation prompt template passed to ``invoke``
            topic: Contextual topic passed to ``invoke``

        Returns:
            ``{"error": message}`` when augmentation failed; otherwise a dict
            keyed ``augmented_dialogue_<i>`` holding the ``match_roles`` and
            ``correct_length`` metric values. A dialogue whose metrics raise
            is logged and left out of the report.
        """
        result = self.invoke(dialogue, prompt, topic)

        if isinstance(result, str):
            return {"error": result}

        report = {}
        for i, augmented_dialogue in enumerate(result):
            try:
                report[f"augmented_dialogue_{i}"] = {
                    "match_roles": match_roles(dialogue, augmented_dialogue),
                    "correct_length": is_correct_length(dialogue, augmented_dialogue),
                }
            except Exception as e:
                # Best effort: one failing dialogue must not discard the rest.
                logging.error(f"Error while calculating metrics: {e}")
        return report

    def _get_llm(self, llm_key: str):
        """Safe LLM retrieval with error handling.

        Raises:
            ValueError: if ``llm_key`` is not present in the model storage.
        """
        if llm_key not in self.model_storage.storage:
            raise ValueError(f"LLM key '{llm_key}' not found in model storage")
        return self.model_storage.storage[llm_key].model

    def _combine_one_dialogue(self, augmentation_result: DialogueSequence, i: int) -> dict:
        """Builds the i-th augmented dialogue from the i-th variation of every turn.

        Args:
            augmentation_result: Validated augmentation output
            i: Index of the variation to take from each turn's ``text`` list

        Returns:
            ``{"messages": [{"participant": ..., "text": ...}, ...]}`` in the
            original turn order.
        """
        return {
            "messages": [
                {"participant": turn.participant, "text": turn.text[i]}
                for turn in augmentation_result.result
            ]
        }

    def _create_dialogues(self, result: Union[list, dict]) -> Union[list[Dialogue], str]:
        """Creates a list of Dialogue objects from the parsed LLM output.

        Args:
            result: Parsed JSON from the chain: either the bare list of turns
                or an object wrapping that list under the ``"result"`` key

        Returns:
            One Dialogue per variation index, or an error message string when
            the payload does not validate as a ``DialogueSequence``. Only as
            many dialogues are produced as the turn with the fewest variations
            allows; surplus variations of other turns are dropped.

        Raises:
            ValueError: if the validated result contains no turns at all.
        """
        # NOTE(review): the parser is configured with DialogueSequence, so the
        # repaired output presumably may arrive in that wrapped object shape;
        # unwrap it instead of failing validation on an otherwise valid payload.
        if isinstance(result, dict) and "result" in result:
            payload = result["result"]
        else:
            payload = result

        try:
            augmentation_result = DialogueSequence(result=payload)
        except Exception as e:
            logging.error(f"Wrong type of augmentation result: {e}")
            return f"Creating a list of Dialogue objects failed: {e}"

        variation_counts = [len(turn.text) for turn in augmentation_result.result]
        if not variation_counts:
            # Explicit message instead of leaking "min() arg is an empty sequence".
            raise ValueError("augmentation result contains no turns")

        return [
            Dialogue.from_list(self._combine_one_dialogue(augmentation_result, i)["messages"])
            for i in range(min(variation_counts))
        ]
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Prompt template asking the LLM for 2-5 rephrasings of every utterance.
# Placeholders: {topic} (theme of the dialogue) and {dialogue} (the messages to
# augment). Literal JSON braces are doubled so the template can be rendered.
# The 'text' field is described as a list of strings, matching the output
# format requested at the end of the prompt.
augmentation_prompt_from2to5_vars = """
You are tasked with augmenting a dialogue by adding variations to existing utterances while maintaining the original dialogue flow and intent.

INSTRUCTION:
1. For each message in the dialogue:
- Create 2-5 variations of the 'text' field that:
* Express the same meaning/intent
* Use different wording and phrasing
* Match the given theme
* Sound natural and conversational

2. Ensure all utterance variations:
- Do not repeat each other word for word
- Are appropriate for the theme
- Maintain consistency in tone and style
- Make sense in the conversation flow

3. Make sure that all the utterances in the dialogue are different from each other.

4. The output must be a list of dictionaries, where each dictionary has:
- 'participant': either 'user' or 'assistant'
- 'text': list of strings (the utterance variations for this message)

Below are EXAMPLES of original phrases and their corresponding augmented phrases.

**EXAMPLE 1**
ORIGINAL PHRASE: "I've checked and the camera is not blocked"
AUGMENTED PHRASES: ["I've ensured that there's nothing in front of the camera", "I've made sure the camera is clear of any obstructions."]

**EXAMPLE 2**
ORIGINAL PHRASE: 'Alright, if you need any further assistance, feel free to reach out. Have a great day!'
AUGMENTED PHRASES: ["Okay, if you ever need more help, don't hesitate to ask. Have a wonderful day!", "No problem! If you need any more help later on, don't hesitate to get in touch. Have a wonderful day!"]

**EXAMPLE 3**
ORIGINAL PHRASE: "I'm curious about the pricing for eco-friendly packaging."
AUGMENTED PHRASES: ['Can you tell me about the expenses associated with eco-friendly packaging?', 'I want to know about the costs of eco-friendly packaging.']

Now you will be provided with INPUT THEME and INPUT DIALOGUE. Return ONLY a valid JSON array containing the augmented dialogue messages. Each message should be in this exact format:
For assistant messages: {{"participant": "assistant", "text": [list of utterance variations]}}
For user messages: {{"participant": "user", "text": [list of utterance variations]}}

INPUT THEME: {topic}

INPUT DIALOGUE:
{dialogue}
"""
47+
48+
# Prompt template asking the LLM for exactly 3 rephrasings of every utterance.
# Placeholders: {topic} (theme of the dialogue) and {dialogue} (the messages to
# augment). Literal JSON braces are doubled so the template can be rendered.
# The 'text' field is described as a list of strings, matching the output
# format requested at the end of the prompt.
augmentation_prompt_3_vars = """
You are tasked with augmenting a dialogue by adding variations to existing utterances while maintaining the original dialogue flow and intent.

INSTRUCTION:
1. For each message in the dialogue:
- Create 3 variations of the 'text' field that:
* Express the same meaning/intent
* Use different wording and phrasing
* Match the given theme
* Sound natural and conversational

2. Ensure all utterance variations:
- Do not repeat each other word for word
- Are appropriate for the theme
- Maintain consistency in tone and style
- Make sense in the conversation flow

3. Make sure that all the utterances in the dialogue are different from each other.

4. The output must be a list of dictionaries, where each dictionary has:
- 'participant': either 'user' or 'assistant'
- 'text': list of strings (the utterance variations for this message)

Below are EXAMPLES of original phrases and their corresponding augmented phrases.

**EXAMPLE 1**
ORIGINAL PHRASE: "I've checked and the camera is not blocked"
AUGMENTED PHRASES: ["I've ensured that there's nothing in front of the camera", "I've made sure the camera is clear of any obstructions."]

**EXAMPLE 2**
ORIGINAL PHRASE: 'Alright, if you need any further assistance, feel free to reach out. Have a great day!'
AUGMENTED PHRASES: ["Okay, if you ever need more help, don't hesitate to ask. Have a wonderful day!", "No problem! If you need any more help later on, don't hesitate to get in touch. Have a wonderful day!"]

**EXAMPLE 3**
ORIGINAL PHRASE: "I'm curious about the pricing for eco-friendly packaging."
AUGMENTED PHRASES: ['Can you tell me about the expenses associated with eco-friendly packaging?', 'I want to know about the costs of eco-friendly packaging.']

Now you will be provided with INPUT THEME and INPUT DIALOGUE. Return ONLY a valid JSON array containing the augmented dialogue messages. Each message should be in this exact format:
For assistant messages: {{"participant": "assistant", "text": [list of utterance variations]}}
For user messages: {{"participant": "user", "text": [list of utterance variations]}}

INPUT THEME: {topic}

INPUT DIALOGUE:
{dialogue}
"""

dialogue2graph/pipelines/core/algorithms.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,12 @@ class DialogAugmentation(BaseAlgorithm):
6161
:param topic: The topic to guide the augmentation process (optional).
6262
"""
6363

64-
def __init__(self) -> None:
65-
super().__init__()
66-
6764
def invoke(self, dialogue: Dialogue, topic: str = "") -> Dialogue:
6865
raise NotImplementedError
6966

67+
async def ainvoke(self, dialogue: Dialogue, topic: str = "") -> Dialogue:
68+
raise NotImplementedError
69+
7070

7171
class GraphAugmentation(BaseAlgorithm):
7272
"""Graph generator that works only with topics."""

dialogue2graph/pipelines/model_storage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from pathlib import Path
66
from pydantic import BaseModel, Field, model_validator
77

8-
from langchain_openai import ChatOpenAI
8+
from langchain_community.chat_models import ChatOpenAI
99
from langchain_core.language_models import BaseChatModel
1010
from langchain_huggingface import HuggingFaceEmbeddings
1111

experiments/exp2025_04_09_augment_metrics_and_class/README.md

Whitespace-only changes.

0 commit comments

Comments
 (0)