diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index 791aef839..cec6851fc 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -37,7 +37,7 @@ class Pipeline: """Pipeline optimizer class. - See tutorial on AutoML features of AutoIntent. + See tutorial on AutoML features of AutoIntent in :ref:`user_guides`. """ def __init__( diff --git a/autointent/generation/__init__.py b/autointent/generation/__init__.py index 096e19f7d..ea2d0e446 100644 --- a/autointent/generation/__init__.py +++ b/autointent/generation/__init__.py @@ -1 +1,8 @@ -"""Some generative methods for enriching training datasets.""" +"""Some generative methods for enriching training datasets. + +See :ref:`data-aug-tuts`. +""" + +from ._generator import Generator + +__all__ = ["Generator"] diff --git a/autointent/generation/utterances/generator.py b/autointent/generation/_generator.py similarity index 93% rename from autointent/generation/utterances/generator.py rename to autointent/generation/_generator.py index 744d2765b..a9ba15be2 100644 --- a/autointent/generation/utterances/generator.py +++ b/autointent/generation/_generator.py @@ -16,8 +16,8 @@ class Generator: Args: base_url: HTTP-endpoint for sending API requests to OpenAI API compatible server. - Omit this to infer OPENAI_BASE_URL from environment. - model_name: Name of LLM. Omit this to infer OPENAI_MODEL_NAME from environment. + Omit this to infer ``OPENAI_BASE_URL`` from environment. + model_name: Name of LLM. Omit this to infer ``OPENAI_MODEL_NAME`` from environment. **generation_params: kwargs that will be sent with a request to the endpoint. 
""" @@ -28,7 +28,7 @@ class Generator: "temperature": 0.7, } - def __init__(self, base_url: str | None = None, model_name: str | None = None, **generation_params: Any) -> None: # noqa: ANN401, D107 + def __init__(self, base_url: str | None = None, model_name: str | None = None, **generation_params: Any) -> None: # noqa: ANN401 if not base_url: base_url = os.environ["OPENAI_BASE_URL"] if not model_name: diff --git a/autointent/generation/chat_templates/_base_synthesizer.py b/autointent/generation/chat_templates/_base_synthesizer.py index 406440377..ffb458440 100644 --- a/autointent/generation/chat_templates/_base_synthesizer.py +++ b/autointent/generation/chat_templates/_base_synthesizer.py @@ -6,6 +6,7 @@ from typing import ClassVar from autointent import Dataset +from autointent.custom_types import Split from autointent.schemas import Intent from ._evolution_templates_schemas import Message, Role @@ -38,7 +39,7 @@ class BaseSynthesizerTemplate(BaseChatTemplate): def __init__( self, dataset: Dataset, - split: str, + split: str = Split.TRAIN, extra_instructions: str | None = None, max_sample_utterances: int | None = None, ) -> None: diff --git a/autointent/generation/utterances/__init__.py b/autointent/generation/utterances/__init__.py index 91a66aec7..1c048d260 100644 --- a/autointent/generation/utterances/__init__.py +++ b/autointent/generation/utterances/__init__.py @@ -1,16 +1,10 @@ """Generative methods for enriching dataset with synthetic samples.""" -from .balancer import DatasetBalancer -from .basic import UtteranceGenerator -from .evolution import ( - IncrementalUtteranceEvolver, - UtteranceEvolver, -) -from .generator import Generator +from ._basic import DatasetBalancer, UtteranceGenerator +from ._evolution import IncrementalUtteranceEvolver, UtteranceEvolver __all__ = [ "DatasetBalancer", - "Generator", "IncrementalUtteranceEvolver", "UtteranceEvolver", "UtteranceGenerator", diff --git a/autointent/generation/utterances/_basic/__init__.py 
b/autointent/generation/utterances/_basic/__init__.py new file mode 100644 index 000000000..3c8df69ff --- /dev/null +++ b/autointent/generation/utterances/_basic/__init__.py @@ -0,0 +1,4 @@ +from .balancer import DatasetBalancer +from .utterance_generator import UtteranceGenerator + +__all__ = ["DatasetBalancer", "UtteranceGenerator"] diff --git a/autointent/generation/utterances/balancer.py b/autointent/generation/utterances/_basic/balancer.py similarity index 97% rename from autointent/generation/utterances/balancer.py rename to autointent/generation/utterances/_basic/balancer.py index 13b2036a9..0d5a49ea5 100644 --- a/autointent/generation/utterances/balancer.py +++ b/autointent/generation/utterances/_basic/balancer.py @@ -7,9 +7,10 @@ from autointent import Dataset from autointent.custom_types import Split +from autointent.generation import Generator from autointent.generation.chat_templates import BaseSynthesizerTemplate -from autointent.generation.utterances.basic.utterance_generator import UtteranceGenerator -from autointent.generation.utterances.generator import Generator + +from .utterance_generator import UtteranceGenerator logger = logging.getLogger(__name__) @@ -30,7 +31,7 @@ class DatasetBalancer: Must be a positive integer or None. Defaults to None. 
""" - def __init__( # noqa: D107 + def __init__( self, generator: Generator, prompt_maker: BaseSynthesizerTemplate, diff --git a/autointent/generation/utterances/basic/cli.py b/autointent/generation/utterances/_basic/cli.py similarity index 95% rename from autointent/generation/utterances/basic/cli.py rename to autointent/generation/utterances/_basic/cli.py index 0e56af613..aa4130b02 100644 --- a/autointent/generation/utterances/basic/cli.py +++ b/autointent/generation/utterances/_basic/cli.py @@ -4,8 +4,9 @@ from argparse import ArgumentParser from autointent import load_dataset +from autointent.generation import Generator from autointent.generation.chat_templates import EnglishSynthesizerTemplate, RussianSynthesizerTemplate -from autointent.generation.utterances import Generator, UtteranceGenerator +from autointent.generation.utterances import UtteranceGenerator logging.basicConfig(level="INFO") logger = logging.getLogger(__name__) diff --git a/autointent/generation/utterances/basic/utterance_generator.py b/autointent/generation/utterances/_basic/utterance_generator.py similarity index 89% rename from autointent/generation/utterances/basic/utterance_generator.py rename to autointent/generation/utterances/_basic/utterance_generator.py index 0e6c7b867..28dd8f7fd 100644 --- a/autointent/generation/utterances/basic/utterance_generator.py +++ b/autointent/generation/utterances/_basic/utterance_generator.py @@ -7,8 +7,8 @@ from autointent import Dataset from autointent.custom_types import Split +from autointent.generation import Generator from autointent.generation.chat_templates import BaseSynthesizerTemplate -from autointent.generation.utterances.generator import Generator from autointent.schemas import Intent, Sample @@ -23,9 +23,26 @@ class UtteranceGenerator: generator: Generator instance for generating utterances. prompt_maker: Prompt maker instance for generating prompts. async_mode: Whether to use asynchronous mode for generation. + + Usage + ----- + + .. 
code-block:: python + + from autointent import Dataset + from autointent.generation import Generator + from autointent.generation.utterances import UtteranceGenerator + from autointent.generation.chat_templates import RussianSynthesizerTemplate + + dataset = Dataset.from_json(path_to_json) + generator = Generator() + prompt = RussianSynthesizerTemplate(dataset) + augmenter = UtteranceGenerator(generator, prompt_maker=prompt) + augmenter.augment(dataset) + """ - def __init__(self, generator: Generator, prompt_maker: BaseSynthesizerTemplate, async_mode: bool = False) -> None: # noqa: D107 + def __init__(self, generator: Generator, prompt_maker: BaseSynthesizerTemplate, async_mode: bool = False) -> None: self.generator = generator self.prompt_maker = prompt_maker self.async_mode = async_mode diff --git a/autointent/generation/utterances/_evolution/__init__.py b/autointent/generation/utterances/_evolution/__init__.py new file mode 100644 index 000000000..c62ed0093 --- /dev/null +++ b/autointent/generation/utterances/_evolution/__init__.py @@ -0,0 +1,4 @@ +from .evolver import UtteranceEvolver +from .incremental_evolver import IncrementalUtteranceEvolver + +__all__ = ["IncrementalUtteranceEvolver", "UtteranceEvolver"] diff --git a/autointent/generation/utterances/evolution/cli.py b/autointent/generation/utterances/_evolution/cli.py similarity index 94% rename from autointent/generation/utterances/evolution/cli.py rename to autointent/generation/utterances/_evolution/cli.py index 50aff3788..d47ba1474 100644 --- a/autointent/generation/utterances/evolution/cli.py +++ b/autointent/generation/utterances/_evolution/cli.py @@ -4,12 +4,14 @@ from argparse import ArgumentParser, Namespace from autointent import load_dataset +from autointent.generation import Generator from autointent.generation.chat_templates import ( EVOLUTION_MAPPING, EVOLUTION_NAMES, ) -from autointent.generation.utterances.evolution import IncrementalUtteranceEvolver, UtteranceEvolver -from 
autointent.generation.utterances.generator import Generator + +from .evolver import UtteranceEvolver +from .incremental_evolver import IncrementalUtteranceEvolver logging.basicConfig(level="INFO") logger = logging.getLogger(__name__) diff --git a/autointent/generation/utterances/evolution/dspy_evolver.py b/autointent/generation/utterances/_evolution/dspy_evolver.py similarity index 100% rename from autointent/generation/utterances/evolution/dspy_evolver.py rename to autointent/generation/utterances/_evolution/dspy_evolver.py diff --git a/autointent/generation/utterances/evolution/evolver.py b/autointent/generation/utterances/_evolution/evolver.py similarity index 92% rename from autointent/generation/utterances/evolution/evolver.py rename to autointent/generation/utterances/_evolution/evolver.py index 3e4efc414..df45b5106 100644 --- a/autointent/generation/utterances/evolution/evolver.py +++ b/autointent/generation/utterances/_evolution/evolver.py @@ -12,8 +12,8 @@ from autointent import Dataset from autointent.custom_types import Split +from autointent.generation import Generator from autointent.generation.chat_templates import EvolutionChatTemplate -from autointent.generation.utterances.generator import Generator from autointent.schemas import Intent @@ -28,9 +28,25 @@ class UtteranceEvolver: prompt_makers: List of prompt makers for generating prompts. seed: Random seed for reproducibility. async_mode: Whether to use asynchronous mode for generation. + + Usage + ----- + + .. 
code-block:: python + + from autointent import Dataset + from autointent.generation import Generator + from autointent.generation.utterances import UtteranceEvolver + from autointent.generation.chat_templates import GoofyEvolution, InformalEvolution + + dataset = Dataset.from_json(path_to_json) + generator = Generator() + evolver = UtteranceEvolver(generator, prompt_makers=[GoofyEvolution(), InformalEvolution()]) + evolver.augment(dataset) + """ - def __init__( # noqa: D107 + def __init__( self, generator: Generator, prompt_makers: Sequence[EvolutionChatTemplate], diff --git a/autointent/generation/utterances/evolution/incremental_evolver.py b/autointent/generation/utterances/_evolution/incremental_evolver.py similarity index 95% rename from autointent/generation/utterances/evolution/incremental_evolver.py rename to autointent/generation/utterances/_evolution/incremental_evolver.py index 032c643a5..fb0826795 100644 --- a/autointent/generation/utterances/evolution/incremental_evolver.py +++ b/autointent/generation/utterances/_evolution/incremental_evolver.py @@ -13,9 +13,9 @@ from autointent import Dataset, Pipeline from autointent.custom_types import Split +from autointent.generation import Generator from autointent.generation.chat_templates import EvolutionChatTemplate -from autointent.generation.utterances.evolution.evolver import UtteranceEvolver -from autointent.generation.utterances.generator import Generator +from autointent.generation.utterances._evolution.evolver import UtteranceEvolver SEARCH_SPACE = [ { @@ -53,7 +53,7 @@ class IncrementalUtteranceEvolver(UtteranceEvolver): search_space: Search space for the pipeline optimizer. 
""" - def __init__( # noqa: D107 + def __init__( self, generator: Generator, prompt_makers: Sequence[EvolutionChatTemplate], diff --git a/autointent/generation/utterances/basic/__init__.py b/autointent/generation/utterances/basic/__init__.py deleted file mode 100644 index 5b68373ad..000000000 --- a/autointent/generation/utterances/basic/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .utterance_generator import UtteranceGenerator - -__all__ = ["UtteranceGenerator"] diff --git a/autointent/generation/utterances/evolution/__init__.py b/autointent/generation/utterances/evolution/__init__.py deleted file mode 100644 index b89fbc9dc..000000000 --- a/autointent/generation/utterances/evolution/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .dspy_evolver import DSPYIncrementalUtteranceEvolver -from .evolver import UtteranceEvolver -from .incremental_evolver import IncrementalUtteranceEvolver - -__all__ = ["DSPYIncrementalUtteranceEvolver", "IncrementalUtteranceEvolver", "UtteranceEvolver"] diff --git a/docs/source/augmentation_tutorials/balancer.rst b/docs/source/augmentation_tutorials/balancer.rst index fd9db77f5..14d60d47f 100644 --- a/docs/source/augmentation_tutorials/balancer.rst +++ b/docs/source/augmentation_tutorials/balancer.rst @@ -3,7 +3,7 @@ Balancing Datasets with DatasetBalancer ======================================= -This guide demonstrates how to use the DatasetBalancer class to balance class distribution in your datasets through LLM-based data augmentation. +This guide demonstrates how to use the :py:class:`autointent.generation.utterances.DatasetBalancer` class to balance class distribution in your datasets through LLM-based data augmentation. This method is a wrapper for more simple method :py:class:`autointent.generation.utterances.UtteranceGenerator`. .. contents:: Table of Contents :depth: 2 @@ -57,8 +57,9 @@ Setting up the Generator and Template ------------------------------------ DatasetBalancer requires two main components: -1. 
A Generator - responsible for creating new utterances using an LLM -2. A Template - defines the prompt format sent to the LLM + +1. A :py:class:`autointent.generation.Generator` - responsible for creating new utterances using an LLM +2. A :py:class:`autointent.generation.chat_templates.EnglishSynthesizerTemplate` - defines the prompt format sent to the LLM Let's set up these components: diff --git a/docs/source/augmentation_tutorials/dspy_augmentation.rst b/docs/source/augmentation_tutorials/dspy_augmentation.rst index 6c2d9bf1a..7fd74fd63 100644 --- a/docs/source/augmentation_tutorials/dspy_augmentation.rst +++ b/docs/source/augmentation_tutorials/dspy_augmentation.rst @@ -3,7 +3,7 @@ DSPY Augmentation ################# -This tutorial covers the implementation and usage of an evolutionary strategy to augment utterances using DSPy. It explains how DSPy is used, how the module functions, and how the scoring metric works. +This tutorial covers the implementation and usage of an evolutionary strategy to augment utterances using DSPy. It explains how DSPy is used, how the module functions, and how the scoring metric works. This method is a wrapper around the simpler :py:class:`autointent.generation.utterances.UtteranceEvolver`. .. contents:: Table of Contents :depth: 2 diff --git a/docs/source/augmentation_tutorials/index.rst b/docs/source/augmentation_tutorials/index.rst new file mode 100644 index 000000000..665c45bf7 --- /dev/null +++ b/docs/source/augmentation_tutorials/index.rst @@ -0,0 +1,10 @@ +.. _data-aug-tuts: + +Data augmentation tutorials +--------------------------- + +.. toctree:: + :maxdepth: 1 + + balancer + dspy_augmentation diff --git a/docs/source/user_guides.rst b/docs/source/user_guides.rst index 5e3f8cf2b..2d7cfdf1c 100644 --- a/docs/source/user_guides.rst +++ b/docs/source/user_guides.rst @@ -1,3 +1,5 @@ +.. 
_user_guides: + User Guides ----------- @@ -8,13 +10,4 @@ User Guides user_guides/index_basic_usage user_guides/index_advanced_usage user_guides/index_cli_usage - -Data augmentation tutorials ---------------------------- - -.. toctree:: - :maxdepth: 1 - - augmentation_tutorials/dspy_augmentation - augmentation_tutorials/balancer - + augmentation_tutorials/index diff --git a/tests/generation/utterances/test_balancer.py b/tests/generation/utterances/test_balancer.py index 58d109c3e..1d8f637b3 100644 --- a/tests/generation/utterances/test_balancer.py +++ b/tests/generation/utterances/test_balancer.py @@ -7,8 +7,9 @@ from autointent import Dataset from autointent.custom_types import Split +from autointent.generation import Generator from autointent.generation.chat_templates import EnglishSynthesizerTemplate -from autointent.generation.utterances import DatasetBalancer, Generator +from autointent.generation.utterances import DatasetBalancer logger = logging.getLogger(__name__) diff --git a/tests/generation/utterances/test_generator.py b/tests/generation/utterances/test_generator.py index e7685612d..cc71a896e 100644 --- a/tests/generation/utterances/test_generator.py +++ b/tests/generation/utterances/test_generator.py @@ -2,8 +2,8 @@ import pytest +from autointent.generation import Generator from autointent.generation.chat_templates import Message -from autointent.generation.utterances.generator import Generator @pytest.fixture(autouse=True) diff --git a/user_guides/basic_usage/03_automl.py b/user_guides/basic_usage/03_automl.py index 18734d7ef..43f692eb8 100644 --- a/user_guides/basic_usage/03_automl.py +++ b/user_guides/basic_usage/03_automl.py @@ -1,6 +1,6 @@ # %% [markdown] """ -# Pipeline Auto Configuration (AutoML) +# AutoML """ # %%