Skip to content

Commit bfeb407

Browse files
committed
Merge branch 'dev' into feat/augmentation
2 parents 83c1c76 + a05b530 commit bfeb407

File tree

72 files changed

+2167
-617
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

72 files changed

+2167
-617
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
name: Generate JSON Schema
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
8+
permissions:
9+
contents: write
10+
11+
jobs:
12+
generate-schema:
13+
runs-on: ubuntu-latest
14+
15+
steps:
16+
- name: Checkout repository
17+
uses: actions/checkout@v4
18+
19+
- name: Set up Python
20+
uses: actions/setup-python@v5
21+
with:
22+
python-version: '3.10'
23+
24+
- name: Install dependencies
25+
run: |
26+
pip install .
27+
28+
- name: Generate JSON Schema
29+
run: python scripts/generate_json_schema_config.py
30+
31+
- name: Check for changes
32+
id: check_changes
33+
run: |
34+
git diff --exit-code docs/optimizer_config.schema.json || echo "changed=true" >> $GITHUB_ENV
35+
36+
- name: Commit and push changes
37+
if: env.changed == 'true'
38+
env:
39+
GITHUB_TOKEN: ${{ github.token }}
40+
run: |
41+
git config --global user.name "github-actions[bot]"
42+
git config --global user.email "github-actions[bot]@users.noreply.github.com"
43+
git add docs/optimizer_config.schema.json
44+
git commit -m "Update optimizer_config.schema.json"
45+
git push

.vscode/settings.json

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,11 @@
22
"ruff.configuration": "pyproject.toml",
33
"python.analysis.extraPaths": [
44
"./docs/source"
5-
]
5+
],
6+
"yaml.schemas": {
7+
"./docs/optimizer_config.schema.json": [
8+
"*.yaml",
9+
"!*/.github/*/*.yaml"
10+
]
11+
}
612
}
Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,26 @@
11
# TODO: make up a better and more versatile config
22
- node_type: embedding
3-
metric: retrieval_hit_rate
3+
target_metric: retrieval_hit_rate
44
search_space:
55
- module_name: retrieval
66
k: [10]
77
embedder_name:
88
- avsolatorio/GIST-small-Embedding-v0
9-
- infgrad/stella-base-en-v2
9+
- sergeyzh/rubert-tiny-turbo
1010
- node_type: scoring
11-
metric: scoring_roc_auc
11+
target_metric: scoring_roc_auc
1212
search_space:
1313
- module_name: knn
1414
k: [1, 3, 5, 10]
1515
weights: ["uniform", "distance", "closest"]
1616
- module_name: linear
1717
- module_name: dnnc
1818
cross_encoder_name:
19-
- BAAI/bge-reranker-base
2019
- cross-encoder/ms-marco-MiniLM-L-6-v2
2120
k: [1, 3, 5, 10]
2221
- node_type: decision
23-
metric: decision_accuracy
22+
target_metric: decision_accuracy
2423
search_space:
2524
- module_name: threshold
2625
thresh: [0.5]
27-
- module_name: argmax
26+
- module_name: argmax
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
# TODO: make up a better and more versatile config
22
- node_type: embedding
3-
metric: retrieval_hit_rate_intersecting
3+
target_metric: retrieval_hit_rate_intersecting
44
search_space:
55
- module_name: retrieval
66
k: [10]
77
embedder_name:
88
- deepvk/USER-bge-m3
99
- node_type: scoring
10-
metric: scoring_roc_auc
10+
target_metric: scoring_roc_auc
1111
search_space:
1212
- module_name: knn
1313
k: [3]
1414
weights: ["uniform", "distance", "closest"]
1515
- module_name: linear
1616
- node_type: decision
17-
metric: decision_accuracy
17+
target_metric: decision_accuracy
1818
search_space:
1919
- module_name: threshold
2020
thresh: [0.5]
21-
- module_name: adaptive
21+
- module_name: adaptive

autointent/_datafiles/inference-config-example.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
module_name: retrieval
33
module_config:
44
k: 10
5-
model_name: infgrad/stella-base-en-v2
5+
model_name: sergeyzh/rubert-tiny-turbo
66
load_path: .
77
- node_type: scoring
88
module_name: knn

autointent/_dataset/_dataset.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,14 @@ def from_hub(cls, repo_id: str) -> "Dataset":
100100
:param repo_id: ID of the Hugging Face repository.
101101
:return: Initialized Dataset object.
102102
"""
103-
splits, intents = load_dataset(repo_id), []
103+
from ._reader import DictReader
104+
105+
splits = load_dataset(repo_id)
106+
mapping = dict(**splits)
104107
if Split.INTENTS in get_dataset_config_names(repo_id):
105-
intents = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list()
106-
return cls(
107-
splits.items(),
108-
intents=[Intent.model_validate(intent) for intent in intents],
109-
)
108+
mapping["intents"] = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list()
109+
110+
return DictReader().read(mapping)
110111

111112
def to_multilabel(self) -> "Dataset":
112113
"""

autointent/_pipeline/_pipeline.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from autointent.custom_types import ListOfGenericLabels, NodeType
1414
from autointent.metrics import PREDICTION_METRICS_MULTILABEL
1515
from autointent.nodes import InferenceNode, NodeOptimizer
16+
from autointent.nodes.schemes import OptimizationConfig
1617
from autointent.utils import load_default_search_space, load_search_space
1718

1819
from ._schemas import InferencePipelineOutput, InferencePipelineUtteranceOutput
@@ -72,10 +73,12 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed
7273
Create pipeline optimizer from dictionary search space.
7374
7475
:param search_space: Dictionary config
76+
:param seed: random seed
7577
"""
7678
if isinstance(search_space, Path | str):
7779
search_space = load_search_space(search_space)
78-
nodes = [NodeOptimizer(**node) for node in search_space]
80+
validated_search_space = OptimizationConfig(search_space).model_dump() # type: ignore[arg-type]
81+
nodes = [NodeOptimizer(**node) for node in validated_search_space]
7982
return cls(nodes=nodes, seed=seed)
8083

8184
@classmethod
@@ -84,6 +87,9 @@ def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline":
8487
Create pipeline optimizer with default search space for given classification task.
8588
8689
:param multilabel: Whether the task is multi-label or single-label.
90+
:param seed: random seed
91+
92+
:return: Pipeline
8793
"""
8894
return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed)
8995

autointent/custom_types.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
from enum import Enum
8-
from typing import Literal, TypeAlias, TypedDict
8+
from typing import Literal, TypeAlias
99

1010

1111
class LogLevel(Enum):
@@ -46,10 +46,6 @@ class LogLevel(Enum):
4646
"""
4747

4848

49-
class BaseMetadataDict(TypedDict):
50-
"""Base metadata dictionary for storing additional information."""
51-
52-
5349
class NodeType(str, Enum):
5450
"""Enumeration of node types in the AutoIntent pipeline."""
5551

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from .basic import SynthesizerChatTemplate, UtteranceGenerator
2+
from .evolution import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution, UtteranceEvolver
3+
from .generator import Generator
4+
5+
__all__ = [
6+
"AbstractEvolution",
7+
"ConcreteEvolution",
8+
"EvolutionChatTemplate",
9+
"Generator",
10+
"ReasoningEvolution",
11+
"SynthesizerChatTemplate",
12+
"UtteranceEvolver",
13+
"UtteranceGenerator",
14+
]
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .chat_template import SynthesizerChatTemplate
2+
from .utterance_generator import UtteranceGenerator
3+
4+
__all__ = ["SynthesizerChatTemplate", "UtteranceGenerator"]

0 commit comments

Comments
 (0)