Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
4aed585
move `EmbedderConfig` and `CrossEncoderConfig` to `autointent.configs`
voorhs Feb 22, 2025
27d6fa2
remove `_datafiles`
voorhs Feb 22, 2025
8e1ff88
implement search space filtering
voorhs Feb 22, 2025
e2acdc6
add `warning` mode
voorhs Feb 22, 2025
c3998e4
change default model for cross-encoder
voorhs Feb 22, 2025
33bd332
refactor constructor of `SklearnScorer`
voorhs Feb 22, 2025
450c6ff
add the heaviest search space preset
voorhs Feb 22, 2025
e7202a3
add search space validation for `DescriptionScorer`
voorhs Feb 22, 2025
871cb7e
add default value for `ThresholdDecision`
voorhs Feb 22, 2025
9195119
implement default configs for embedder and cross encoder
voorhs Feb 22, 2025
ee69f77
add transformers configs to pipeline
voorhs Feb 22, 2025
3f9b90c
implement two basic presets
voorhs Feb 22, 2025
d54f0ea
fix codestyle
voorhs Feb 22, 2025
d6229ce
fix typing
voorhs Feb 22, 2025
3a3ef99
Update optimizer_config.schema.json
github-actions[bot] Feb 22, 2025
02bf420
bug fix and update test
voorhs Feb 24, 2025
ccc4d57
search space validation bug found
voorhs Feb 24, 2025
2526fff
update unit tests
voorhs Feb 24, 2025
7df5500
update test
voorhs Feb 24, 2025
a676990
improve sklearn test
voorhs Feb 24, 2025
8d5eb6e
refactor sklearn scorer
voorhs Feb 24, 2025
7599116
remove `VectorIndexConfig` entirely from our lib
voorhs Feb 24, 2025
31e9812
try to fix validation errors
voorhs Feb 24, 2025
ef88dc2
remove multiclass/multilabel separation on modules dicts
voorhs Feb 24, 2025
a6a6e85
upd test
voorhs Feb 24, 2025
92456d1
fix codestyle
voorhs Feb 24, 2025
822a5a7
Update optimizer_config.schema.json
github-actions[bot] Feb 24, 2025
b85050a
something's wrong with sklearn scorer again
voorhs Feb 24, 2025
89976e5
remove unnecessary default value from sklearn scorer constructor
voorhs Feb 24, 2025
9b363c7
add default value for `weights` in knn and rerank scorer
voorhs Feb 24, 2025
603fabe
try without search space validation
voorhs Feb 24, 2025
0be875c
Update optimizer_config.schema.json
github-actions[bot] Feb 24, 2025
5128074
foolish bug fix
voorhs Feb 24, 2025
6f0612e
pull dev
voorhs Feb 24, 2025
e8c80ef
Update optimizer_config.schema.json
github-actions[bot] Feb 24, 2025
168cf85
finish implementing presets
voorhs Feb 24, 2025
a57c79d
Update optimizer_config.schema.json
github-actions[bot] Feb 24, 2025
21f9101
update docs
voorhs Feb 25, 2025
d357718
update docs and readme
voorhs Feb 25, 2025
f952a2d
upd ci
voorhs Feb 25, 2025
9dc0e0c
respond to samoed
voorhs Feb 26, 2025
5adde06
update tests
voorhs Feb 26, 2025
fe0f301
fix codestyle
voorhs Feb 26, 2025
e917443
Update optimizer_config.schema.json
github-actions[bot] Feb 26, 2025
00f4bc9
remove sklearn scorer from config for now
voorhs Feb 26, 2025
2f4aabf
bug fix
voorhs Feb 26, 2025
f7441dd
pull dev
voorhs Feb 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/test-presets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# CI workflow that runs the preset test module (tests/pipeline/test_presets.py).
name: test presets

# Triggered by pushes to the `dev` branch and by pull requests.
on:
  push:
    branches:
      - dev
  pull_request:

jobs:
  test:
    # The runner image is taken from the matrix entry of the current job.
    runs-on: ${{ matrix.os }}
    strategy:
      # Do not cancel the remaining matrix jobs when one of them fails.
      fail-fast: false
      matrix:
        os: [ ubuntu-latest ]
        python-version: [ "3.10", "3.11", "3.12" ]
        # Additional Windows job (Python 3.10) on top of the Ubuntu combinations.
        include:
          - os: windows-latest
            python-version: "3.10"

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Install the interpreter selected by the matrix, with pip caching enabled.
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"

      # Install the package itself, then the test tooling.
      - name: Install dependencies
        run: |
          pip install .
          pip install pytest pytest-asyncio

      # Only the preset test file is executed by this workflow.
      - name: Run tests
        run: |
          pytest tests/pipeline/test_presets.py
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Example of building an intent classifier in a couple of lines of code:
from autointent import Pipeline, Dataset

dataset = Dataset.from_json(path_to_json)
pipeline = Pipeline.default_optimizer(multilabel=False)
pipeline = Pipeline.from_preset("light")
pipeline.fit(dataset)
pipeline.predict(["show me my latest recent transactions"])
pipeline.predict(["show me my latest transactions"])
```
2 changes: 2 additions & 0 deletions autointent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ._dataset import Dataset
from ._hash import Hasher
from .context import Context, load_dataset
from ._optimization_config import OptimizationConfig
from ._pipeline import Pipeline


Expand All @@ -15,6 +16,7 @@
"Dataset",
"Embedder",
"Hasher",
"OptimizationConfig",
"Pipeline",
"Ranker",
"VectorIndex",
Expand Down
26 changes: 0 additions & 26 deletions autointent/_datafiles/default-multiclass-config.yaml

This file was deleted.

21 changes: 0 additions & 21 deletions autointent/_datafiles/default-multilabel-config.yaml

This file was deleted.

17 changes: 0 additions & 17 deletions autointent/_datafiles/inference-config-example.yaml

This file was deleted.

21 changes: 21 additions & 0 deletions autointent/_dataset/_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""File with Dataset definition."""

import json
import logging
from collections import defaultdict
from functools import cached_property
from pathlib import Path
Expand All @@ -12,6 +13,8 @@
from autointent.custom_types import LabelWithOOS, Split
from autointent.schemas import Intent, Tag

logger = logging.getLogger(__name__)


class Sample(TypedDict):
"""
Expand All @@ -36,6 +39,7 @@ class Dataset(dict[str, HFDataset]):

label_feature = "label"
utterance_feature = "utterance"
has_descriptions: bool

def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: # noqa: ANN401
"""
Expand All @@ -49,6 +53,8 @@ def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: #

self.intents = intents

self.has_descriptions = self.validate_descriptions()

@property
def multilabel(self) -> bool:
"""
Expand Down Expand Up @@ -197,3 +203,18 @@ def _to_multilabel(self, sample: Sample) -> Sample:
ohe_vector[sample["label"]] = 1
sample["label"] = ohe_vector
return sample

def validate_descriptions(self) -> bool:
    """
    Report whether every intent of the dataset carries a text description.

    A warning is logged when only a part of the intents is described.

    :return: True if no intent lacks a description, False otherwise
    """
    # One flag per intent: True when its description is set.
    described = [intent.description is not None for intent in self.intents]
    complete = all(described)

    # Partial coverage: at least one description exists, but not for all intents.
    if not complete and any(described):
        msg = "Some intents have text descriptions, but some of them not."
        logger.warning(msg)

    return complete
3 changes: 2 additions & 1 deletion autointent/_dump_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from sklearn.base import BaseEstimator

from autointent import Embedder, Ranker, VectorIndex
from autointent.schemas import CrossEncoderConfig, EmbedderConfig, TagsList
from autointent.configs import CrossEncoderConfig, EmbedderConfig
from autointent.schemas import TagsList

ModuleSimpleAttributes = None | str | int | float | bool | list # type: ignore[type-arg]

Expand Down
2 changes: 1 addition & 1 deletion autointent/_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from sentence_transformers import SentenceTransformer

from ._hash import Hasher
from .schemas import EmbedderConfig, TaskTypeEnum
from .configs import EmbedderConfig, TaskTypeEnum


def get_embeddings_path(filename: str) -> Path:
Expand Down
17 changes: 17 additions & 0 deletions autointent/_optimization_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pydantic import BaseModel, PositiveInt

from .configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig
from .custom_types import SamplerType
from .nodes.schemes import OptimizationSearchSpaceConfig


class OptimizationConfig(BaseModel):
    """
    Configuration for the optimization process.

    Groups the search space together with data, logging, embedder and
    cross-encoder settings, a sampler choice and a seed. ``search_space`` is
    the only field declared without a default value.
    """

    # Data settings; defaults to a default-constructed ``DataConfig``.
    data_config: DataConfig = DataConfig()
    # Search space description; no default is declared, so it must be provided.
    search_space: OptimizationSearchSpaceConfig
    # Logging settings; defaults to a default-constructed ``LoggingConfig``.
    logging_config: LoggingConfig = LoggingConfig()
    # Embedder settings; defaults to a default-constructed ``EmbedderConfig``.
    embedder_config: EmbedderConfig = EmbedderConfig()
    # Cross-encoder settings; defaults to a default-constructed ``CrossEncoderConfig``.
    cross_encoder_config: CrossEncoderConfig = CrossEncoderConfig()
    # Sampler identifier; defaults to "brute".
    # NOTE(review): allowed values are defined by ``SamplerType`` elsewhere.
    sampler: SamplerType = "brute"
    # Seed value, constrained to a positive integer; defaults to 42.
    seed: PositiveInt = 42
Loading