diff --git a/dataflow/operators/conversations/generate/consistent_chat_generator.py b/dataflow/operators/conversations/generate/consistent_chat_generator.py
index 7d256f6f..3576f131 100644
--- a/dataflow/operators/conversations/generate/consistent_chat_generator.py
+++ b/dataflow/operators/conversations/generate/consistent_chat_generator.py
@@ -6,25 +6,24 @@
 from dataflow.utils.storage import DataFlowStorage
 import pandas as pd
 from dataflow.core import LLMServingABC
-from dataflow.prompts.general_text import ConsistentQueryPrompt, ConsistentResponsePrompt
-from dataflow.core.prompt import prompt_restrict
+from dataflow.prompts.general_text import ConsistentChatPrompt
+from dataflow.core.prompt import DIYPromptABC, prompt_restrict
+from typing import Union
 
 @prompt_restrict(
-    ConsistentQueryPrompt,
-    ConsistentResponsePrompt
+    ConsistentChatPrompt
 )
 
 @OPERATOR_REGISTRY.register()
 class ConsistentChatGenerator(OperatorABC):
-    def __init__(self, llm_serving: LLMServingABC = None, num_dialogs_per_intent = 20, num_turns_per_dialog = 6, temperature = 0.9):
+    def __init__(self, llm_serving: LLMServingABC = None, num_dialogs_per_intent = 20, num_turns_per_dialog = 6, temperature = 0.9, prompt_template: Union[ConsistentChatPrompt, DIYPromptABC] = None):
         self.logger = get_logger()
         self.logger.info(f'Initializing {self.__class__.__name__}...')
         self.llm_serving = llm_serving
         self.num_dialogs_per_intent = num_dialogs_per_intent # Based on the topic_dict in the existing prompt, it is recommended to set the value to below 1000 (which can generate 9000 conversation data). Otherwise, it is recommended to add more topic_dict in dataflow.prompts.general_text.ConsistentChatPrompt to increase data richness
         self.num_turns_per_dialog = num_turns_per_dialog
         self.temperature = temperature
-        self.query_prompt = ConsistentQueryPrompt()
-        self.response_prompt = ConsistentResponsePrompt()
+        self.prompt_template = prompt_template if prompt_template is not None else ConsistentChatPrompt()
         self.logger.info(f'{self.__class__.__name__} initialized.')
 
     @staticmethod
@@ -37,6 +36,7 @@ def get_desc(lang: str = "zh"):
             "- num_dialogs_per_intent:每个意图生成的对话数量,默认20\n"
             "- num_turns_per_dialog:每个对话的轮次数量,默认6\n"
             "- temperature:生成温度,控制输出随机性,默认0.9\n"
+            "- prompt_template:提示词模板对象,用于定义提示结构\n"
             "输出参数:\n"
             "- 包含category和conversation字段的DataFrame,其中conversation为多轮对话列表"
         )
@@ -48,6 +48,7 @@ def get_desc(lang: str = "zh"):
             "- num_dialogs_per_intent: Number of dialogs generated per intent, default 20\n"
             "- num_turns_per_dialog: Number of turns per dialog, default 6\n"
             "- temperature: Sampling temperature for generation, default 0.9\n"
+            "- prompt_template: Prompt template object, for defining the prompt structure\n"
             "Output Parameters:\n"
             "- DataFrame containing 'category' and 'conversation' fields, where conversation is a list of multi-turn dialogues"
         )
@@ -57,7 +58,7 @@ def get_desc(lang: str = "zh"):
 
     def run(self, storage: DataFlowStorage):
         # Step 1: Generate all queries using LLM
-        all_query_prompts = self.query_prompt.build_prompt(num_dialogs_per_intent=self.num_dialogs_per_intent)
+        all_query_prompts = self.prompt_template.build_prompt(mode="query", num_dialogs_per_intent=self.num_dialogs_per_intent)
         # Step 2: Generate queries by calling llm_serving once
         self.logger.info("Generating queries...")
         queries_list = self.llm_serving.generate_from_input(user_inputs=all_query_prompts)
@@ -78,7 +79,7 @@ def run(self, storage: DataFlowStorage):
         for queries in valid_queries:
             category = queries.get("category")
             turns = queries.get("turns")
-            all_response_prompts.append(self.response_prompt.build_prompt(topic=category, queries=turns))
+            all_response_prompts.append(self.prompt_template.build_prompt(mode="response", topic=category, queries=turns))
 
         self.logger.info("Generating responses...")
         responses_list = self.llm_serving.generate_from_input(user_inputs=all_response_prompts)
diff --git a/dataflow/operators/text_sft/generate/condor_generator.py b/dataflow/operators/text_sft/generate/condor_generator.py
index f0d66cbb..672315c4 100644
--- a/dataflow/operators/text_sft/generate/condor_generator.py
+++ b/dataflow/operators/text_sft/generate/condor_generator.py
@@ -7,7 +7,8 @@
 import pandas as pd
 from dataflow.core import LLMServingABC
 from dataflow.prompts.general_text import CondorQuestionPrompt
-from dataflow.core.prompt import prompt_restrict
+from dataflow.core.prompt import DIYPromptABC, prompt_restrict
+from typing import Union
 
 @prompt_restrict(
     CondorQuestionPrompt
@@ -15,13 +16,13 @@
 
 @OPERATOR_REGISTRY.register()
 class CondorGenerator(OperatorABC):
-    def __init__(self, llm_serving: LLMServingABC = None, num_samples=15, use_task_diversity=True):
+    def __init__(self, llm_serving: LLMServingABC = None, num_samples=15, use_task_diversity=True, prompt_template: Union[CondorQuestionPrompt, DIYPromptABC] = None):
         # Based on the existing topics, it is recommended to set num_samples below 5000. Otherwise, it is recommended to add topics in dataflow.prompts.general_text.CondorPrompt on your own to increase data richness
         self.logger = get_logger()
         self.logger.info(f'Initializing {self.__class__.__name__}...')
         self.llm_serving = llm_serving
         self.num_questions = num_samples // 3  # 每个prompt生成3个难度的问题
-        self.prompt = CondorQuestionPrompt()
+        self.prompt = prompt_template if prompt_template is not None else CondorQuestionPrompt()
         self.use_task_diversity = use_task_diversity  # 是否使用任务场景增强多样性
         self.logger.info(f'{self.__class__.__name__} initialized.')
 
@@ -33,6 +34,7 @@ def get_desc(lang: str = "zh"):
             "输入参数:\n"
             "- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
             "- num_samples:生成样本总数,建议小于5000,默认值为15\n"
+            "- prompt_template:提示词模板对象,用于定义提示结构\n"
             "输出参数:\n"
             "- 包含'difficulty'、'instruction'和'output'字段的DataFrame\n"
             "- 返回生成的DataFrame用于后续处理"
@@ -44,6 +46,7 @@ def get_desc(lang: str = "zh"):
             "Input Parameters:\n"
             "- llm_serving: LLM serving object implementing LLMServingABC interface\n"
-            "- num_samples: Total number of samples to generate, recommended to be less than 5000, default is 15\n\n"
+            "- num_samples: Total number of samples to generate, recommended to be less than 5000, default is 15\n"
+            "- prompt_template: Prompt template object, for defining the prompt structure\n"
             "Output Parameters:\n"
             "- DataFrame containing 'difficulty', 'instruction', and 'output' fields\n"
             "- Returns generated DataFrame for subsequent processing"
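Both generators above now take an explicit `prompt_template`, falling back to the previous hard-coded prompt class when it is omitted. A minimal wiring sketch, not part of the patch (the serving constructor arguments are abbreviated; see the pipeline diffs further down for the full calls):

```python
from dataflow.operators.conversations import ConsistentChatGenerator
from dataflow.operators.text_sft import CondorGenerator
from dataflow.prompts.general_text import ConsistentChatPrompt, CondorQuestionPrompt
from dataflow.serving import APILLMServing_request

# Abbreviated constructor call; any LLMServingABC implementation works here.
llm_serving = APILLMServing_request(model_name="gpt-4o", max_workers=100)

chat_generator = ConsistentChatGenerator(
    llm_serving=llm_serving,
    num_dialogs_per_intent=5,
    prompt_template=ConsistentChatPrompt(),  # also the default when omitted
)
sft_generator = CondorGenerator(
    llm_serving=llm_serving,
    num_samples=15,
    prompt_template=CondorQuestionPrompt(),
)
```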
diff --git a/dataflow/operators/text_sft/refine/condor_refiner.py b/dataflow/operators/text_sft/refine/condor_refiner.py
index b2d0e0b0..e848513c 100644
--- a/dataflow/operators/text_sft/refine/condor_refiner.py
+++ b/dataflow/operators/text_sft/refine/condor_refiner.py
@@ -6,22 +6,21 @@
 from dataflow.utils.storage import DataFlowStorage
 import pandas as pd
 from dataflow.core import LLMServingABC
-from dataflow.prompts.general_text import CondorCritiquePrompt, CondorRefinePrompt
-from dataflow.core.prompt import prompt_restrict
+from dataflow.prompts.general_text import CondorRefinePrompt
+from dataflow.core.prompt import prompt_restrict, DIYPromptABC
+from typing import Union
 
 @prompt_restrict(
-    CondorCritiquePrompt,
     CondorRefinePrompt
 )
 
 @OPERATOR_REGISTRY.register()
 class CondorRefiner(OperatorABC):
-    def __init__(self, llm_serving: LLMServingABC = None):
+    def __init__(self, llm_serving: LLMServingABC = None, prompt_template: Union[CondorRefinePrompt, DIYPromptABC] = None):
         self.logger = get_logger()
         self.logger.info(f'Initializing {self.__class__.__name__}...')
         self.llm_serving = llm_serving
-        self.critique_prompt = CondorCritiquePrompt()  # 创建 CondorPrompt 类的实例
-        self.refine_prompt = CondorRefinePrompt()
+        self.prompt_template = prompt_template if prompt_template is not None else CondorRefinePrompt()
         self.logger.info(f'{self.__class__.__name__} initialized.')
 
     @staticmethod
@@ -33,6 +32,7 @@ def get_desc(lang: str = "zh"):
             "- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
             "- input_instruction_key:输入指令字段名,默认为'instruction'\n"
             "- input_output_key:输入回复字段名,默认为'output'\n"
+            "- prompt_template:提示词模板对象,用于定义提示结构\n"
             "输出参数:\n"
             "- 包含优化后回复的DataFrame\n"
             "- 返回包含优化后回复字段名的列表,用于后续算子引用"
@@ -44,7 +44,8 @@ def get_desc(lang: str = "zh"):
             "Input Parameters:\n"
             "- llm_serving: LLM serving object implementing LLMServingABC interface\n"
             "- input_instruction_key: Field name for input instructions, default is 'instruction'\n"
-            "- input_output_key: Field name for input responses, default is 'output'\n\n"
+            "- input_output_key: Field name for input responses, default is 'output'\n"
+            "- prompt_template: Prompt template object, for defining the prompt structure\n"
             "Output Parameters:\n"
             "- DataFrame containing refined responses\n"
             "- List containing refined response field name for subsequent operator reference"
@@ -56,13 +57,13 @@
 
     def generate_critique(self, question, answer):
         # 批量生成 Critique
-        critique_prompts = [self.critique_prompt.build_prompt(q, a) for q, a in zip(question, answer)]
+        critique_prompts = [self.prompt_template.build_prompt(mode="critique", question=q, answer=a) for q, a in zip(question, answer)]
         critique_responses = self.llm_serving.generate_from_input(critique_prompts)
         return critique_responses
 
     def generate_refined_answer(self, question, answer, critique):
         # 批量生成修改后的答案
-        refine_prompts = [self.refine_prompt.build_prompt(q, a, c) for q, a, c in zip(question, answer, critique)]
+        refine_prompts = [self.prompt_template.build_prompt(mode="refine", question=q, answer=a, critique=c) for q, a, c in zip(question, answer, critique)]
         refined_answers = self.llm_serving.generate_from_input(refine_prompts)
         refined_answers = [answer.replace('[Improved Answer Start]', '').replace('[Improved Answer End]', '').strip() for answer in refined_answers]
         return refined_answers
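`CondorRefiner` now routes both the critique call and the refine call through a single template, dispatching on `mode`, so any custom template passed in must honor the same two-mode contract. A hypothetical sketch, assuming `DIYPromptABC` only requires a `build_prompt()` method (check the real base class in `dataflow.core.prompt`):

```python
# Hypothetical DIY template for CondorRefiner; names below are illustrative.
from dataflow.core.prompt import DIYPromptABC

class MyRefinePrompt(DIYPromptABC):
    def build_prompt(self, mode, question=None, answer=None, critique=None):
        if mode == "critique":
            # Mirrors the critique contract used by generate_critique().
            return f"Write a critique of this answer.\nQuestion: {question}\nAnswer: {answer}"
        elif mode == "refine":
            # Keep the markers: generate_refined_answer() strips them afterwards.
            return (
                "Improve the answer using the feedback. Wrap the result in "
                "[Improved Answer Start]...[Improved Answer End].\n"
                f"Question: {question}\nAnswer: {answer}\nFeedback: {critique}"
            )
        raise ValueError(f"Unsupported mode: {mode}")
```

Note that a refine-mode prompt should instruct the model to keep the `[Improved Answer Start]`/`[Improved Answer End]` markers, because `generate_refined_answer` strips exactly those markers from the model output.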
diff --git a/dataflow/prompts/general_text.py b/dataflow/prompts/general_text.py
index e80664ab..53257013 100644
--- a/dataflow/prompts/general_text.py
+++ b/dataflow/prompts/general_text.py
@@ -1,4 +1,5 @@
 import random
+from typing import Literal
 from dataflow.utils.registry import PROMPT_REGISTRY
 from dataflow.core.prompt import PromptABC
 '''
@@ -196,9 +197,9 @@ def build_prompt(self):
 
 @PROMPT_REGISTRY.register()
-class ConsistentQueryPrompt(PromptABC):
+class ConsistentChatPrompt(PromptABC):
     def __init__(self):
-        self.intent_categories = {
+        self.query_intent_categories = {
             "Problem Solving Interaction": [
                 "From Problem Diagnosis to Solution Optimization"
             ],
@@ -235,7 +236,7 @@ def __init__(self):
                 "From Problem Diagnosis to Solution Optimization"
             ]
         }
-        self.topic_dict = {
+        self.query_topic_dict = {
             "Problem Solving Interaction": [
                 "Technical support for computer hardware issues",
                 "Home repair advice for plumbing problems",
@@ -701,90 +702,93 @@ def __init__(self):
             ]
         }
 
-    def build_prompt(self, num_dialogs_per_intent):
-        prompt = """
-        Task Description and Rules
-        1. Generate multiple rounds of realistic user questions based on the provided topic:
-        - Based on a single core topic (provided directly by the user), generate multiple rounds of realistic user questions, comprising 6-8 turns in total.
-        - The questions should match the characteristics of real users in natural communication: sometimes simple, sometimes vague, or including contextual backgrounds, and should reflect the language style of daily communication.
-        - Note: Avoid directly including the exact expression of the input topic in the questions. Instead, abstract it with natural and conversational language in practical scenarios.
-
-        2. Dynamic Dialogue Information Flow in Conversations: Below are the relevant steps of the information flow: {info_flow}
-
-        The dialogue style should adhere to the following requirements:
-        - Utilize natural phrasing and vivid language, avoiding overly mechanical responses.
-        - Favor shorter sentences in questions, with occasional subject omission allowed.
-        - Ensure smooth and logical transitions through lighthearted or entertaining interjections.
-        - Permit the expression of specific personality traits and individualized tones.
-        - Proactively introduce new topics when appropriate, ensuring relevance to the current theme.
-
-        The dialogue should comply with the following generation rules:
-        - For each round of dialogue, only simulate user questions without providing answers.
-        - Ensure the conversation flows naturally and reflects realistic interactive thinking.
-        - Avoid overly polished or templated content, ensuring the questions feel authentic and relatable in life scenarios.
+    def build_prompt(self, mode: Literal["query", "response"], num_dialogs_per_intent: int = None, topic: str = None, queries: list[str] = None):
+        if mode == "query":
+            if topic is not None or queries is not None:
+                raise ValueError("Topic and queries should be None when mode is query")
+            if num_dialogs_per_intent is None:
+                raise ValueError("num_dialogs_per_intent should be provided when mode is query")
+            prompt = """
+            Task Description and Rules
+            1. Generate multiple rounds of realistic user questions based on the provided topic:
+            - Based on a single core topic (provided directly by the user), generate multiple rounds of realistic user questions, comprising 6-8 turns in total.
+            - The questions should match the characteristics of real users in natural communication: sometimes simple, sometimes vague, or including contextual backgrounds, and should reflect the language style of daily communication.
+            - Note: Avoid directly including the exact expression of the input topic in the questions. Instead, abstract it with natural and conversational language in practical scenarios.
+
+            2. Dynamic Dialogue Information Flow in Conversations: Below are the relevant steps of the information flow: {info_flow}
+
+            The dialogue style should adhere to the following requirements:
+            - Utilize natural phrasing and vivid language, avoiding overly mechanical responses.
+            - Favor shorter sentences in questions, with occasional subject omission allowed.
+            - Ensure smooth and logical transitions through lighthearted or entertaining interjections.
+            - Permit the expression of specific personality traits and individualized tones.
+            - Proactively introduce new topics when appropriate, ensuring relevance to the current theme.
+
+            The dialogue should comply with the following generation rules:
+            - For each round of dialogue, only simulate user questions without providing answers.
+            - Ensure the conversation flows naturally and reflects realistic interactive thinking.
+            - Avoid overly polished or templated content, ensuring the questions feel authentic and relatable in life scenarios.
+
+            Output Format:
+            Multi-turn Questions in JSON Format:
+            "category": "",
+            "turns": ["", "", "", "..."]
+            To generate multi-turn queries with high topic consistency, please think step-by-step.
+            The input core topic for this task is: {topic}
+            """
+            all_query_prompts = []
+            for intent, info_flows in self.query_intent_categories.items():
+                for _ in range(num_dialogs_per_intent):
+                    info_flow = random.choice(info_flows)
+                    chosen_topic = random.choice(self.query_topic_dict[intent])
+                    query_prompt = prompt.format(info_flow=info_flow, topic=chosen_topic)
+                    all_query_prompts.append(query_prompt)
+            return all_query_prompts
 
-        Output Format:
-        Multi-turn Questions in JSON Format:
-        "category": "",
-        "turns": ["", "", "", "..."]
-        To generate multi-turn queries with high topic consistency, please think step-by-step.
-        The input core topic for this task is: {topic}
-        """
-        all_query_prompts = []
-        for intent, info_flows in self.intent_categories.items():
-            for _ in range(num_dialogs_per_intent):
-                info_flow = random.choice(info_flows)
-                topic = random.choice(self.topic_dict[intent])
-                query_prompt = prompt.format(info_flow=info_flow, topic=topic)
-                all_query_prompts.append(query_prompt)
-        return all_query_prompts
-
+        elif mode == "response":
+            if topic is None or queries is None:
+                raise ValueError("Topic and queries should be provided when mode is response")
+            if num_dialogs_per_intent is not None:
+                raise ValueError("num_dialogs_per_intent should be None when mode is response")
+            prompt = f"""
+            Your task is to simulate a multi-turn conversation where you progressively answer a series of user questions provided under a given topic category. For each answer, focus on delivering a natural, contextually relevant, and actionable response while considering both the current question and future questions in the sequence. The goal is to ensure consistency and logical progression throughout the dialogue and to avoid unnecessary follow-up questions in the responses simultaneously. To generate multi-turn responses with high topic consistency, think step-by-step. Key Dialogue Style Requirements are as follows:
+            Content and Structure:
+            1. Directly Answer the Current Question:
+            - Provide a complete, useful response to the current question without posing additional questions unless they are directly relevant to future queries.
+            - If clarification or additional steps are needed, frame these as suggestions or explanations rather than questions.
+            2. Be Context-Aware:
+            - Always tailor each response to the current question while remaining mindful of the context provided by prior and future questions.
+            - Avoid prematurely addressing future queries but create subtle links where necessary to ensure smooth progression.
+            3. Clear, Action-Oriented Responses:
+            - Focus on providing actionable advice, logical explanations, or troubleshooting steps rather than speculative or rhetorical remarks.
+            - Avoid long or overly complex explanations; aim for clarity and efficiency.
+            Tone and Style:
+            1. Conversational and Supportive:
+            - Use a natural, empathetic tone that simulates real-life problem-solving interactions.
+            - Avoid mechanical or overly formal responses.
+            2. Economical with Words:
+            - Keep responses concise but informative. Minimize extraneous content while ensuring answers have enough detail to be helpful.
+            3. No Unnecessary Questions:
+            - Limit unnecessary questions in the responses and focus instead on providing actionable steps or solutions directly. Avoid follow-up questions that don’t align with the next user query.
+            Turn-by-Turn Instructions:
+            1. Answer Exclusively for the Current Question:
+            - For each turn, generate an answer that directly addresses the immediate question. Avoid revisiting past details unnecessarily unless they are highly relevant.
+            - While you shouldn’t anticipate or directly answer future queries, your response should create natural openings for upcoming questions if applicable.
+            2. Avoid Irrelevant Follow-Up Questions:
+            - If the immediate question doesn’t require clarification, frame your response as a statement or suggestion rather than a question.
+            - Maintain alignment with the logical flow of dialogue to ensure each turn is coherent.
+            3. Proactively Provide Scenarios or Steps:
+            - Where appropriate, guide the user with specific recommendations, troubleshooting actions, or observations they can make without requiring back-and-forth clarification.
+            Output Requirements:
+            The output must simulate the conversation by only providing responses (one per turn) in a sequential manner. The final format must strictly adhere to valid JSON and include the required structure.
+
+            The input core topic and questions-only turns for this task are:
+            core topic: {topic}
+            queries:
+            {', '.join([f'User query: {query}' for query in queries])}
+            """
+            return prompt
 
-@PROMPT_REGISTRY.register()
-class ConsistentResponsePrompt(PromptABC):
-
-    def __init__(self):
-        pass
-
-    def build_prompt(self, topic, queries):
-        prompt = f"""
-        Your task is to simulate a multi-turn conversation where you progressively answer a series of user questions provided under a given topic category. For each answer, focus on delivering a natural, contextually relevant, and actionable response while considering both the current question and future questions in the sequence. The goal is to ensure consistency and logical progression throughout the dialogue and to avoid unnecessary follow-up questions in the responses simultaneously. To generate multi-turn responses with high topic consistency, think step-by-step. Key Dialogue Style Requirements are as follows:
-        Content and Structure:
-        1. Directly Answer the Current Question:
-        - Provide a complete, useful response to the current question without posing additional questions unless they are directly relevant to future queries.
-        - If clarification or additional steps are needed, frame these as suggestions or explanations rather than questions.
-        2. Be Context-Aware:
-        - Always tailor each response to the current question while remaining mindful of the context provided by prior and future questions.
-        - Avoid prematurely addressing future queries but create subtle links where necessary to ensure smooth progression.
-        3. Clear, Action-Oriented Responses:
-        - Focus on providing actionable advice, logical explanations, or troubleshooting steps rather than speculative or rhetorical remarks.
-        - Avoid long or overly complex explanations; aim for clarity and efficiency.
-        Tone and Style:
-        1. Conversational and Supportive:
-        - Use a natural, empathetic tone that simulates real-life problem-solving interactions.
-        - Avoid mechanical or overly formal responses.
-        2. Economical with Words:
-        - Keep responses concise but informative. Minimize extraneous content while ensuring answers have enough detail to be helpful.
-        3. No Unnecessary Questions:
-        - Limit unnecessary questions in the responses and focus instead on providing actionable steps or solutions directly. Avoid follow-up questions that don’t align with the next user query.
-        Turn-by-Turn Instructions:
-        1. Answer Exclusively for the Current Question:
-        - For each turn, generate an answer that directly addresses the immediate question. Avoid revisiting past details unnecessarily unless they are highly relevant.
-        - While you shouldn’t anticipate or directly answer future queries, your response should create natural openings for upcoming questions if applicable.
-        2. Avoid Irrelevant Follow-Up Questions:
-        - If the immediate question doesn’t require clarification, frame your response as a statement or suggestion rather than a question.
-        - Maintain alignment with the logical flow of dialogue to ensure each turn is coherent.
-        3. Proactively Provide Scenarios or Steps:
-        - Where appropriate, guide the user with specific recommendations, troubleshooting actions, or observations they can make without requiring back-and-forth clarification.
-        Output Requirements:
-        The output must simulate the conversation by only providing responses (one per turn) in a sequential manner. The final format must strictly adhere to valid JSON and include the required structure.
-
-        The input core topic and questions-only turns for this task is:
-        core topic: {topic}
-        queries:
-        {', '.join([f'User query: {query}' for query in queries])}
-        """
-        return prompt
 
 @PROMPT_REGISTRY.register()
 class CondorQuestionPrompt(PromptABC):
@@ -1201,14 +1205,20 @@ def build_prompt(self, theme, domain):
 
 @PROMPT_REGISTRY.register()
-class CondorCritiquePrompt(PromptABC):
+class CondorRefinePrompt(PromptABC):
 
     def __init__(self):
         pass
 
-    def build_prompt(self, question, answer):
-        dialogue = [question, answer]
-        base_critique_prompt = f"""
+    def build_prompt(self, mode: Literal["critique", "refine"], question: str = None, answer: str = None, critique: str = None):
+
+        if mode == "critique":
+            if question is None or answer is None:
+                raise ValueError("Question and answer should be provided when mode is critique")
+            if critique is not None:
+                raise ValueError("Critique should be None when mode is critique")
+            dialogue = [question, answer]
+            base_critique_prompt = f"""
 
             There is now a user’s question and a model’s response. You need to write a critique for this
             response, pointing out the strengths and weaknesses of the model’s answer to help the model
             improve its response.
@@ -1228,16 +1238,11 @@
 
             Now it’s your turn. Please provide your Critique as required:
         """
-        return base_critique_prompt
-
-@PROMPT_REGISTRY.register()
-class CondorRefinePrompt(PromptABC):
-
-    def __init__(self):
-        pass
-
-    def build_prompt(self, question, answer, critique):
-        base_refine_prompt = """
+            return base_critique_prompt
+        elif mode == "refine":
+            if question is None or answer is None or critique is None:
+                raise ValueError("Question, answer and critique should be provided when mode is refine")
+            base_refine_prompt = """
             Now there is a user's question, a model's answer, and the user's feedback. Please help modify the model's answer based on the user's feedback to make it better.
             Your improved answer must strictly adhere to the following format:
 
@@ -1250,7 +1255,7 @@ def build_prompt(self, question, answer):
 
             Now it's your turn, please provide your improved answer as required:
         """
-        return base_refine_prompt.format(question=question, answer=answer, critique=critique)
+            return base_refine_prompt.format(question=question, answer=answer, critique=critique)
 
 @PROMPT_REGISTRY.register()
 class LanguageFilterPrompt(PromptABC):
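For reference, the merged `ConsistentChatPrompt` is now driven through two mutually exclusive modes; a minimal sketch (topic and queries are illustrative, the topic string is taken from `query_topic_dict` above):

```python
from dataflow.prompts.general_text import ConsistentChatPrompt

template = ConsistentChatPrompt()

# "query" mode: returns a list with one prompt per (intent, dialog) pair.
query_prompts = template.build_prompt(mode="query", num_dialogs_per_intent=2)

# "response" mode: returns a single prompt for one generated dialog.
response_prompt = template.build_prompt(
    mode="response",
    topic="Technical support for computer hardware issues",
    queries=["My laptop suddenly won't boot.", "The fan spins but the screen stays black."],
)
```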
diff --git a/dataflow/statics/pipelines/api_pipelines/text_conversation_synthesis_pipeline.py b/dataflow/statics/pipelines/api_pipelines/text_conversation_synthesis_pipeline.py
index c9e6d959..a823e9ac 100644
--- a/dataflow/statics/pipelines/api_pipelines/text_conversation_synthesis_pipeline.py
+++ b/dataflow/statics/pipelines/api_pipelines/text_conversation_synthesis_pipeline.py
@@ -1,4 +1,5 @@
 from dataflow.operators.conversations import ConsistentChatGenerator
+from dataflow.prompts.general_text import ConsistentChatPrompt
 from dataflow.utils.storage import FileStorage
 from dataflow.serving import APILLMServing_request
 
@@ -16,7 +17,7 @@ def __init__(self):
             max_workers=100
         )
         self.model_cache_dir = './dataflow_cache'
-        self.processor = ConsistentChatGenerator(llm_serving=self.llm_serving, num_dialogs_per_intent=5)
+        self.processor = ConsistentChatGenerator(llm_serving=self.llm_serving, num_dialogs_per_intent=5, prompt_template=ConsistentChatPrompt())
 
     def forward(self):
         self.processor.run(
diff --git a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py
index 0a0866da..417da44e 100644
--- a/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py
+++ b/dataflow/statics/pipelines/api_pipelines/text_sft_synthesis_pipeline.py
@@ -4,6 +4,7 @@
 from dataflow.operators.text_sft import CondorRefiner
 from dataflow.utils.storage import FileStorage
 from dataflow.serving import APILLMServing_request
+from dataflow.prompts.general_text import CondorQuestionPrompt, CondorRefinePrompt
 
 class TextSFTSynthesis_APIPipeline():
     def __init__(self):
@@ -20,8 +21,8 @@ def __init__(self):
             model_name="gpt-4o",
             max_workers=100
         )
-        self.generator = CondorGenerator(llm_serving=self.llm_serving, num_samples=self.num_generated_samples)
-        self.refiner = CondorRefiner(llm_serving=self.llm_serving)
+        self.generator = CondorGenerator(llm_serving=self.llm_serving, num_samples=self.num_generated_samples, prompt_template=CondorQuestionPrompt())
+        self.refiner = CondorRefiner(llm_serving=self.llm_serving, prompt_template=CondorRefinePrompt())
         self.alpagasus_filter = AlpagasusFilter(min_score=3,max_score=5,llm_serving=self.llm_serving)
 
     def forward(self):
diff --git a/dataflow/statics/playground/playground/text_conversation_synthesis_pipeline.py b/dataflow/statics/playground/playground/text_conversation_synthesis_pipeline.py
index fe9e1b18..a3302ce9 100644
--- a/dataflow/statics/playground/playground/text_conversation_synthesis_pipeline.py
+++ b/dataflow/statics/playground/playground/text_conversation_synthesis_pipeline.py
@@ -1,6 +1,7 @@
 from dataflow.operators.conversations import ConsistentChatGenerator
 from dataflow.utils.storage import FileStorage
 from dataflow.serving import APILLMServing_request
+from dataflow.prompts.general_text import ConsistentChatPrompt
 
 class TextPipeline():
     def __init__(self):
@@ -16,7 +17,7 @@ def __init__(self):
             max_workers=100
         )
         self.model_cache_dir = './dataflow_cache'
-        self.processor = ConsistentChatGenerator(llm_serving=serving, num_dialogs_per_intent=5)
+        self.processor = ConsistentChatGenerator(llm_serving=serving, num_dialogs_per_intent=5, prompt_template=ConsistentChatPrompt())
 
     def forward(self):
         self.processor.run(
diff --git a/dataflow/statics/playground/playground/text_sft_synthesis_from_scratch.py b/dataflow/statics/playground/playground/text_sft_synthesis_from_scratch.py
index 8568f2d1..b20a9d31 100644
--- a/dataflow/statics/playground/playground/text_sft_synthesis_from_scratch.py
+++ b/dataflow/statics/playground/playground/text_sft_synthesis_from_scratch.py
@@ -2,6 +2,7 @@
 from dataflow.operators.text_sft import AlpagasusFilter
 from dataflow.operators.text_sft import CondorGenerator
 from dataflow.operators.text_sft import CondorRefiner
+from dataflow.prompts.general_text import CondorQuestionPrompt, CondorRefinePrompt
 from dataflow.utils.storage import FileStorage
 from dataflow.serving import APILLMServing_request
 
@@ -20,8 +21,8 @@ def __init__(self):
             model_name="gpt-4o",
             max_workers=100
         )
-        self.generator = CondorGenerator(llm_serving=llm_serving, num_samples=self.num_generated_samples)
-        self.refiner = CondorRefiner(llm_serving=llm_serving)
+        self.generator = CondorGenerator(llm_serving=llm_serving, num_samples=self.num_generated_samples, prompt_template=CondorQuestionPrompt())
+        self.refiner = CondorRefiner(llm_serving=llm_serving, prompt_template=CondorRefinePrompt())
         self.alpagasus_filter = AlpagasusFilter(min_score=3,max_score=5,llm_serving=llm_serving)
 
     def forward(self):
diff --git a/dataflow/statics/playground/simple_text_pipelines/sft_from_scratch.py b/dataflow/statics/playground/simple_text_pipelines/sft_from_scratch.py
index d951dc30..28de19b4 100644
--- a/dataflow/statics/playground/simple_text_pipelines/sft_from_scratch.py
+++ b/dataflow/statics/playground/simple_text_pipelines/sft_from_scratch.py
@@ -1,5 +1,6 @@
 
 from dataflow.operators.text_sft import CondorGenerator
+from dataflow.prompts.general_text import CondorQuestionPrompt
 from dataflow.operators.core_text import PromptedGenerator,FormatStrPromptedGenerator
 from dataflow.operators.core_text import GeneralFilter
 from dataflow.utils.storage import FileStorage
@@ -40,7 +41,7 @@ def __init__(self):
             model_name="gpt-4o",
             max_workers=100
         )
-        self.instruction_generator = CondorGenerator(llm_serving=self.llm_serving, num_samples=self.num_generated_samples)
+        self.instruction_generator = CondorGenerator(llm_serving=self.llm_serving, num_samples=self.num_generated_samples, prompt_template=CondorQuestionPrompt())
         self.answer_generator = PromptedGenerator(llm_serving=self.llm_serving, system_prompt="Please answer this question.")
         self.prompt_template = FormatStrPrompt(
             f_str_template="Please rate the following SFT data: instruction: {instruction}, output: {output}?"