
Commit 1ed52ec

Merge branch 'dev' into feat/logreg-retrieval
2 parents 983f0f2 + e859a31


43 files changed: +414 −743 lines

CONTRIBUTING.md

Lines changed: 0 additions & 35 deletions

@@ -50,41 +50,6 @@ make lint
 
 ![](assets/dependency-graph.png)
 
-## Logger setup
-To see debug lines, you have several options:
-
-1. Enable all debug output via a command-line option:
-```bash
-autointent hydra.verbose=true
-```
-2. Enable debug output only for specific modules, for example for autointent.pipeline.optimization.cli_endpoint and hydra itself:
-```bash
-autointent hydra.verbose=[hydra,autointent/pipeline/optimization/cli_endpoint] hydra.job_logging.root.level=DEBUG
-```
-
-The logger configuration itself is defined in autointent.configs.optimization_cli.logger_config. You can change any logger parameter from the command line. For example, here is how to set the logger level to ERROR:
-```bash
-autointent hydra.job_logging.root.level=ERROR
-```
-
-You can also change logger parameters via yaml files:
-1. Create a folder with config files: test_config
-2. test_config/config.yaml:
-```yaml
-defaults:
-  - optimization_config
-  - _self_
-  - override hydra/job_logging: custom
-
-# set your config params for optimization here
-embedder_batch_size: 32
-```
-3. Put the logger configuration in test_config/hydra/job_logging/custom.yaml (see [here](https://docs.python.org/3/howto/logging.html) for the parameters)
-4. Run with the config file config.yaml:
-```bash
-autointent --config-path FULL_PATH/test_config --config-name config
-```
-
 ## Building the documentation
 
 Build the html version into the `docs/build` folder:
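
The deleted hydra-based instructions are superseded by the `setup_logging` helper this commit adds under `autointent/_logging` (see below). A minimal sketch of the replacement workflow, grounded only in what the diff itself shows:

```python
# Minimal sketch of the logging setup that replaces the deleted
# hydra-based instructions.
from autointent import setup_logging

# "DEBUG" surfaces the debug lines the deleted section described;
# any of DEBUG/INFO/WARNING/ERROR/CRITICAL is accepted.
setup_logging("DEBUG")
```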

Makefile

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ lint:
 
 .PHONY: sync
 sync:
-	poetry sync
+	poetry sync --with dev,test,typing,docs
 
 .PHONY: docs
 docs:

autointent/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,6 @@
 """This is AutoIntent API reference."""
 
+from ._logging import setup_logging
 from ._ranker import Ranker
 from ._embedder import Embedder
 from ._vector_index import VectorIndex
@@ -8,5 +9,4 @@
 from .context import Context
 from ._pipeline import Pipeline
 
-
-__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "Ranker", "VectorIndex"]
+__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "Ranker", "VectorIndex", "setup_logging"]

autointent/_logging/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+from .setup import setup_logging
+
+__all__ = ["setup_logging"]

autointent/_logging/config.yaml

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+version: 1
+disable_existing_loggers: false
+formatters:
+  simple:
+    format: '%(levelname)s: %(message)s'
+    datefmt: '%Y-%m-%dT%H:%M:%S%z'
+  json:
+    (): autointent._logging.formatter.JSONFormatter
+    fmt_keys:
+      level: levelname
+      message: message
+      timestamp: timestamp
+      logger: name
+      module: module
+      function: funcName
+      line: lineno
+      thread_name: threadName
+handlers:
+  stdout:
+    class: logging.StreamHandler
+    formatter: simple
+    stream: ext://sys.stdout
+loggers:
+  root:
+    level: DEBUG
+    handlers:
+      - stdout
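
This file follows the `logging.config.dictConfig` schema (`version: 1`); the `()` key under the `json` formatter is dictConfig's user-defined-factory syntax, so the mapping is instantiated as `JSONFormatter(fmt_keys={...})`. A hedged sketch of applying such a file by hand, which is essentially what `setup_logging` below does minus the level and file handling (the path is illustrative):

```python
# Hedged sketch: applying a dictConfig-schema YAML like the one above.
import logging
import logging.config

import yaml

with open("config.yaml") as f:  # illustrative path
    config = yaml.safe_load(f)

# dictConfig sees the "()" key under formatters.json and calls
# autointent._logging.formatter.JSONFormatter(fmt_keys={...}).
logging.config.dictConfig(config)
logging.getLogger(__name__).info("hello")  # -> "INFO: hello" on stdout
```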

autointent/_logging/formatter.py

Lines changed: 68 additions & 0 deletions

@@ -0,0 +1,68 @@
+import datetime as dt
+import json
+import logging
+from typing import Any
+
+LOG_RECORD_BUILTIN_ATTRS = {
+    "args",
+    "asctime",
+    "created",
+    "exc_info",
+    "exc_text",
+    "filename",
+    "funcName",
+    "levelname",
+    "levelno",
+    "lineno",
+    "module",
+    "msecs",
+    "message",
+    "msg",
+    "name",
+    "pathname",
+    "process",
+    "processName",
+    "relativeCreated",
+    "stack_info",
+    "thread",
+    "threadName",
+    "taskName",
+}
+
+
+class JSONFormatter(logging.Formatter):
+    """A custom formatter for saving logging records as JSON."""
+
+    def __init__(
+        self,
+        *,
+        fmt_keys: dict[str, str] | None = None,
+    ) -> None:
+        super().__init__()
+        self.fmt_keys = fmt_keys if fmt_keys is not None else {}
+
+    def format(self, record: logging.LogRecord) -> str:
+        message = self._prepare_log_dict(record)
+        return json.dumps(message, default=str)
+
+    def _prepare_log_dict(self, record: logging.LogRecord) -> dict[str, Any]:
+        always_fields = {
+            "message": record.getMessage(),
+            "timestamp": dt.datetime.fromtimestamp(record.created, tz=dt.timezone.utc).isoformat(),
+        }
+        if record.exc_info is not None:
+            always_fields["exc_info"] = self.formatException(record.exc_info)
+
+        if record.stack_info is not None:
+            always_fields["stack_info"] = self.formatStack(record.stack_info)
+
+        message = {
+            key: msg_val if (msg_val := always_fields.pop(val, None)) is not None else getattr(record, val)
+            for key, val in self.fmt_keys.items()
+        }
+        message.update(always_fields)
+
+        extra_fields = {key: val for key, val in record.__dict__.items() if key not in LOG_RECORD_BUILTIN_ATTRS}
+        message.update(extra_fields)
+
+        return message
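
The formatter renders each record as one JSON object: fields named in `fmt_keys` are filled from the record (or from the always-present `message`/`timestamp` pair), and any `extra={...}` attributes not listed in `LOG_RECORD_BUILTIN_ATTRS` pass through unchanged. A hedged usage sketch outside dictConfig; the sample output line is approximate:

```python
# Hedged usage sketch of JSONFormatter attached to a plain handler.
import logging
import sys

from autointent._logging.formatter import JSONFormatter

logger = logging.getLogger("demo")
logger.propagate = False  # keep the record off the root handlers
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(JSONFormatter(fmt_keys={"level": "levelname", "message": "message"}))
logger.addHandler(handler)

logger.warning("disk almost full", extra={"free_mb": 12})
# -> {"level": "WARNING", "message": "disk almost full", "timestamp": "...", "free_mb": 12}
```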

autointent/_logging/setup.py

Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+import importlib.resources as ires
+import logging.config
+import logging.handlers
+from pathlib import Path
+
+import yaml
+
+from autointent.custom_types import LogLevel
+
+
+def setup_logging(level: LogLevel | str, log_filename: Path | str | None = None) -> None:
+    """
+    Set stdout and file handlers for logging autointent internal actions.
+
+    The first parameter sets the level of logs sent to the standard output stream.
+    The second parameter is optional; if it is specified, "DEBUG" messages are logged
+    to the file regardless of what the first parameter specifies.
+
+    :param level: one of "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"
+    :param log_filename: location of the logfile; omit the extension, as the suffix ``.log.jsonl`` will be appended.
+    """
+    config_file = ires.files("autointent._logging").joinpath("config.yaml")
+    with config_file.open() as f_in:
+        config = yaml.safe_load(f_in)
+
+    level = LogLevel(level)
+    config["handlers"]["stdout"]["level"] = level.value
+
+    if log_filename is not None:
+        config["loggers"]["root"]["handlers"].append("file")
+
+        filename = str(log_filename) + ".log.jsonl"
+        config["handlers"]["file"] = {
+            "class": "logging.FileHandler",
+            "level": "DEBUG",
+            "formatter": "json",
+            "filename": filename,
+        }
+        Path(filename).parent.mkdir(parents=True, exist_ok=True)
+
+    logging.config.dictConfig(config)
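
A hedged usage sketch (the path is illustrative): stdout filtering follows `level`, while the optional file handler always captures DEBUG as JSON lines.

```python
from autointent import setup_logging

# Stdout shows INFO and above; runs/today.log.jsonl (suffix appended
# automatically) additionally captures everything at DEBUG as JSON lines.
setup_logging("INFO", "runs/today")
```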

autointent/_pipeline/_cli_endpoint.py

Lines changed: 0 additions & 68 deletions
This file was deleted.

autointent/_pipeline/_pipeline.py

Lines changed: 11 additions & 9 deletions

@@ -25,14 +25,17 @@ class Pipeline:
     def __init__(
         self,
         nodes: list[NodeOptimizer] | list[InferenceNode],
+        seed: int = 42,
     ) -> None:
         """
         Initialize the pipeline optimizer.
 
         :param nodes: list of nodes
+        :param seed: random seed
         """
         self._logger = logging.getLogger(__name__)
         self.nodes = {node.node_type: node for node in nodes}
+        self.seed = seed
 
         if isinstance(nodes[0], NodeOptimizer):
             self.logging_config = LoggingConfig(dump_dir=None)
@@ -62,7 +65,7 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig
         raise TypeError(msg)
 
     @classmethod
-    def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "Pipeline":
+    def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed: int = 42) -> "Pipeline":
         """
         Create pipeline optimizer from dictionary search space.
 
@@ -71,16 +74,16 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "
         if isinstance(search_space, Path | str):
             search_space = load_search_space(search_space)
         nodes = [NodeOptimizer(**node) for node in search_space]
-        return cls(nodes)
+        return cls(nodes=nodes, seed=seed)
 
     @classmethod
-    def default_optimizer(cls, multilabel: bool) -> "Pipeline":
+    def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline":
         """
         Create pipeline optimizer with default search space for given classification task.
 
         :param multilabel: Whether the task is multi-label or single-label.
         """
-        return cls.from_search_space(load_default_search_space(multilabel))
+        return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed)
 
     def _fit(self, context: Context) -> None:
         """
@@ -91,8 +94,8 @@ def _fit(self, context: Context) -> None:
         self.context = context
         self._logger.info("starting pipeline optimization...")
         self.context.callback_handler.start_run(
-            run_name=self.context.logging_config.get_run_name(),
-            dirpath=self.context.logging_config.get_dirpath(),
+            run_name=self.context.logging_config.run_name,
+            dirpath=self.context.logging_config.dirpath,
         )
         for node_type in NodeType:
             node_optimizer = self.nodes.get(node_type, None)
@@ -111,20 +114,19 @@ def _is_inference(self) -> bool:
         """
         return isinstance(self.nodes[NodeType.scoring], InferenceNode)
 
-    def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
+    def fit(self, dataset: Dataset) -> Context:
         """
         Optimize the pipeline from dataset.
 
         :param dataset: Dataset for optimization
-        :param force_multilabel: Whether to force multilabel or not
         :return: Context
         """
        if self._is_inference():
             msg = "Pipeline in inference mode cannot be fitted"
             raise RuntimeError(msg)
 
         context = Context()
-        context.set_dataset(dataset, force_multilabel)
+        context.set_dataset(dataset)
         context.configure_logging(self.logging_config)
         context.configure_vector_index(self.vector_index_config, self.embedder_config)
         context.configure_cross_encoder(self.cross_encoder_config)
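
The commit threads a `seed` parameter through the `Pipeline` constructors and drops `force_multilabel` from `fit`. A hedged sketch of the updated API; how `dataset` is built is out of scope here, so it stays a typed parameter:

```python
# Hedged sketch of the Pipeline API after this commit.
from autointent import Dataset, Pipeline


def run_optimization(dataset: Dataset) -> None:
    # The new `seed` parameter is forwarded from default_optimizer
    # through from_search_space down to the Pipeline constructor.
    pipeline = Pipeline.default_optimizer(multilabel=False, seed=42)
    # fit() no longer accepts force_multilabel after this commit.
    context = pipeline.fit(dataset)
    print(context)
```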

autointent/configs/__init__.py

Lines changed: 1 addition & 3 deletions

@@ -1,12 +1,11 @@
 """Dataclasses for the configuration of the :class:`autointent.Embedder` and other objects."""
 
 from ._inference_node import InferenceNodeConfig
-from ._optimization_cli import (
+from ._optimization import (
     CrossEncoderConfig,
     DataConfig,
     EmbedderConfig,
     LoggingConfig,
-    OptimizationConfig,
     TaskConfig,
     VectorIndexConfig,
 )
@@ -18,7 +17,6 @@
     "InferenceNodeConfig",
     "InferenceNodeConfig",
     "LoggingConfig",
-    "OptimizationConfig",
     "TaskConfig",
     "VectorIndexConfig",
 ]
