2 changes: 1 addition & 1 deletion .gitignore
@@ -90,4 +90,4 @@ src/data_designer/_version.py
 docs/notebooks/
 docs/notebook_source/*.ipynb
 docs/notebook_source/*.csv
-docs/notebook_source/artifacts/
+docs/**/artifacts/
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -6,6 +6,7 @@ repos:
   - id: trailing-whitespace
   - id: end-of-file-fixer
   - id: check-yaml
+    exclude: ^mkdocs\.yml$
Contributor Author: Had to do this because pre-commit (and my IDE) don't like a couple of lines added for more emoji support.

   - id: check-added-large-files
   - id: check-json
   - id: check-toml
318 changes: 318 additions & 0 deletions docs/assets/recipes/code_generation/text_to_python.py
@@ -0,0 +1,318 @@
from pathlib import Path

from data_designer.essentials import (
CategorySamplerParams,
CodeLang,
CodeValidatorParams,
DataDesigner,
DataDesignerConfigBuilder,
LLMCodeColumnConfig,
LLMJudgeColumnConfig,
LLMTextColumnConfig,
SamplerColumnConfig,
SamplerType,
Score,
SubcategorySamplerParams,
ValidationColumnConfig,
ValidatorType,
)
from data_designer.interface.results import DatasetCreationResults


def build_config(model_alias: str) -> DataDesignerConfigBuilder:
config_builder = DataDesignerConfigBuilder()

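    # Sample a top-level industry sector for each record.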
config_builder.add_column(
SamplerColumnConfig(
name="industry_sector",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=[
"Healthcare",
"Finance",
"Technology",
],
),
),
)

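    # Sample a topic within the chosen industry sector.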
config_builder.add_column(
SamplerColumnConfig(
name="topic",
sampler_type=SamplerType.SUBCATEGORY,
params=SubcategorySamplerParams(
category="industry_sector",
values={
"Healthcare": [
"Electronic Health Records (EHR) Systems",
"Telemedicine Platforms",
"AI-Powered Diagnostic Tools",
],
"Finance": [
"Fraud Detection Software",
"Automated Trading Systems",
"Personal Finance Apps",
],
"Technology": [
"Cloud Computing Platforms",
"Artificial Intelligence and Machine Learning Platforms",
"DevOps and CI/CD Tools",
],
},
),
),
)

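    # Sample a difficulty level for the generated code.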
config_builder.add_column(
SamplerColumnConfig(
name="code_complexity",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=[
"Beginner",
"Intermediate",
"Advanced",
],
),
),
)

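    # Sample a programming concept matched to the chosen complexity level.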
config_builder.add_column(
SamplerColumnConfig(
name="code_concept",
sampler_type=SamplerType.SUBCATEGORY,
params=SubcategorySamplerParams(
category="code_complexity",
values={
"Beginner": [
"Variables",
"Data Types",
"Functions",
"Loops",
"Classes",
],
"Intermediate": [
"List Comprehensions",
"Object-oriented programming",
"Lambda Functions",
"Web frameworks",
"Pandas",
],
"Advanced": [
"Multithreading",
"Context Managers",
"Generators",
],
},
),
),
)

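    # Sample an opening phrase to vary how the instructions are worded.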
config_builder.add_column(
SamplerColumnConfig(
name="instruction_phrase",
sampler_type=SamplerType.CATEGORY,
params=CategorySamplerParams(
values=[
"Write a function that",
"Create a class that",
"Implement a script",
"Can you create a function",
"Develop a module that",
],
),
),
)

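    # Generate the natural-language instruction; the {{ ... }} placeholders are filled from the sampler columns above.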
config_builder.add_column(
LLMTextColumnConfig(
name="instruction",
model_alias=model_alias,
system_prompt=("You are an expert at generating clear and specific programming tasks."),
prompt=(
"Generate an instruction to create Python code that solves a specific problem.\n"
"Each instruction should begin with one of the following phrases: {{ instruction_phrase }}.\n\n"
"Important Guidelines:\n"
"* Industry Relevance: Ensure the instruction pertains to the {{ industry_sector }} sector and {{ topic }} topic.\n"
"* Code Complexity: Tailor the instruction to the {{ code_complexity }} level. Utilize relevant {{ code_concept }} where appropriate to match the complexity level.\n"
"* Clarity and Specificity: Make the problem statement clear and unambiguous. Provide sufficient context to understand the requirements without being overly verbose.\n"
"* Response Formatting: Do not include any markers such as ### Response ### in the instruction.\n"
),
)
)

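    # Generate the Python implementation for the instruction above.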
config_builder.add_column(
LLMCodeColumnConfig(
name="code_implementation",
model_alias=model_alias,
code_lang=CodeLang.PYTHON,
system_prompt=(
"You are an expert Python programmer who writes clean, efficient, and well-documented code."
),
prompt=(
"Write Python code for the following instruction:\n"
"Instruction: {{ instruction }}\n\n"
"Important Guidelines:\n"
"* Code Quality: Your code should be clean, complete, self-contained, and accurate.\n"
"* Code Validity: Please ensure that your Python code is executable and does not contain any errors.\n"
"* Packages: Remember to import any necessary libraries, and to use all libraries you import.\n"
"* Complexity & Concepts: The code should be written at a {{ code_complexity }} level, making use of concepts such as {{code_concept}}.\n"
),
)
)

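    # Score the generated code with an LLM judge, using the rubric template and Score list defined below.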
config_builder.add_column(
LLMJudgeColumnConfig(
name="code_judge_result",
model_alias=model_alias,
prompt=TEXT_TO_PYTHON_JUDGE_TEMPLATE,
scores=python_scoring,
)
)

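    # Validate the generated code with the built-in Python code validator.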
config_builder.add_column(
ValidationColumnConfig(
name="code_validity_result",
validator_type=ValidatorType.CODE,
target_columns=["code_implementation"],
validator_params=CodeValidatorParams(
code_lang=CodeLang.PYTHON,
),
batch_size=100,
)
)

return config_builder


def create_dataset(
config_builder: DataDesignerConfigBuilder,
num_records: int,
artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
data_designer = DataDesigner(artifact_path=artifact_path)
results = data_designer.create(config_builder, num_records=num_records)
return results


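# Prompt template for the LLM judge; {{ instruction }} and {{ code_implementation }} are filled in per record.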
TEXT_TO_PYTHON_JUDGE_TEMPLATE = """\
You are an expert in Python programming, with specialized knowledge in software engineering, data science, and algorithmic problem-solving.

You think about potential flaws and errors in the code. You are a tough critic, but a fair one.

Take a deep breath and use the Python Code Quality Rubric below to score the **Generated Python Code** based on the INSTRUCTIONS.

#### INSTRUCTIONS
The Generated Python Code should be a valid response to the Natural Language Prompt below.

Natural Language Prompt:
{{ instruction }}

Generated Python Code:
{{ code_implementation }}
"""


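# Rubric dimensions for the judge; each option maps a 0-4 score to a description.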
python_scoring = [
Score(
name="Relevance",
description="Adherence to INSTRUCTIONS and CONTEXT",
options={
4: "Perfectly meets all specified requirements.",
3: "Meets most requirements with minor deviations.",
2: "Moderate deviation from the instructions.",
1: "Significant deviations from the instructions.",
0: "Does not adhere to the instructions.",
},
),
Score(
name="Pythonic",
description="Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)",
options={
4: "The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.",
3: "The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.",
2: "The code generally follows Python conventions but has room for better alignment with Pythonic practices.",
1: "The code loosely follows Python conventions, with several deviations from best practices.",
0: "The code does not follow Python conventions or best practices, using non-Pythonic approaches.",
},
),
Score(
name="Readability",
description="Readability and Maintainability (Is the Python code easy to understand and maintain?)",
options={
4: (
"The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, "
"ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format."
),
3: "The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.",
2: "The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.",
1: "The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.",
0: "The code is unreadable, with no attempt at formatting or description.",
},
),
Score(
name="Efficiency",
description="Efficiency and Performance (Is the code optimized for performance?)",
options={
4: "The solution is highly efficient, using appropriate data structures and algorithms; avoids unnecessary computations and optimizes for both time and space complexity.",
3: "The solution is efficient, with good use of Python's built-in functions and libraries; minor areas for optimization.",
2: "The solution is moderately efficient, but misses some opportunities for optimization; uses some inefficient patterns.",
1: "The solution shows poor efficiency, with notable performance issues; lacks effective optimization techniques.",
0: "The solution is highly inefficient; overlooks fundamental optimization practices, resulting in significant performance issues.",
},
),
]


if __name__ == "__main__":
from argparse import ArgumentParser

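    # Example: python text_to_python.py --model-alias openai-text --num-records 5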
parser = ArgumentParser()
parser.add_argument("--model-alias", type=str, default="openai-text")
parser.add_argument("--num-records", type=int, default=5)
parser.add_argument("--artifact-path", type=str, default=None)
args = parser.parse_args()

config_builder = build_config(model_alias=args.model_alias)
results = create_dataset(config_builder, num_records=args.num_records, artifact_path=args.artifact_path)

print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

results.load_analysis().to_report()