feat: add DISABLE_EMISSIONS_TRACKING (#191)

Darinochka · voorhs · github-actions[bot] · web-flow · commit 7b1a0d2e1bad · 2025-05-02T16:59:29.000+03:00
* feat: add DISABLE_EMISSIONS_TRACKING

* try to fix docs error

* Update optimizer_config.schema.json

* another attempt

* Update optimizer_config.schema.json

* i give up for now

* Update optimizer_config.schema.json

---------

Co-authored-by: voorhs <ilya_alekseev_2016@list.ru>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/autointent/configs/_transformers.py b/autointent/configs/_transformers.py
@@ -115,7 +115,10 @@ def get_prompt_type(self, prompt_type: TaskTypeEnum | None) -> str | None:  # no
 
 
 class CrossEncoderConfig(HFModelConfig):
-    model_name: str = Field("cross-encoder/ms-marco-MiniLM-L-6-v2", description="Name of the hugging face model.")
+    model_name: str = Field("cross-encoder/ms-marco-MiniLM-L6-v2", description="Name of the hugging face model.")
     train_head: bool = Field(
         False, description="Whether to train the head of the model. If False, LogReg will be trained."
     )
+    tokenizer_config: TokenizerConfig = Field(
+        default_factory=lambda: TokenizerConfig(max_length=512)
+    )  # this is because sentence-transformers doesn't allow you to customize tokenizer settings properly
diff --git a/autointent/modules/scoring/_dnnc/dnnc.py b/autointent/modules/scoring/_dnnc/dnnc.py
@@ -36,7 +36,7 @@ class DNNCScorer(BaseScorer):
         utterances = ["what is your name?", "how are you?"]
         labels = [0, 1]
         scorer = DNNCScorer(
-            cross_encoder_config="cross-encoder/ms-marco-MiniLM-L-6-v2",
+            cross_encoder_config="cross-encoder/ms-marco-MiniLM-L6-v2",
             embedder_config="sergeyzh/rubert-tiny-turbo",
             k=5,
         )
diff --git a/autointent/nodes/emissions_tracker.py b/autointent/nodes/emissions_tracker.py
@@ -2,6 +2,7 @@
 
 import json
 import logging
+import os
 
 from codecarbon import EmissionsTracker as CodeCarbonTracker  # type: ignore[import-untyped]
 from codecarbon.output import EmissionsData  # type: ignore[import-untyped]
@@ -20,22 +21,31 @@ def __init__(self, project_name: str, measure_power_secs: int = 1) -> None:
             measure_power_secs: How often to measure power consumption in seconds.
         """
         self._logger = logger
-        self.tracker = CodeCarbonTracker(project_name=project_name, measure_power_secs=measure_power_secs)
+        self._enabled = int(os.getenv("TRACK_EMISSIONS", "0"))
+        if self._enabled:
+            self.tracker = CodeCarbonTracker(project_name=project_name, measure_power_secs=measure_power_secs)
+        else:
+            self._logger.info("Emissions tracking is enabled via TRACK_EMISSIONS environment variable")
+            self.tracker = None
 
     def start_task(self, task_name: str) -> None:
         """Start tracking emissions for a specific task.
 
         Args:
             task_name: Name of the task to track emissions for.
         """
-        self.tracker.start_task(task_name)
+        if self._enabled:
+            self.tracker.start_task(task_name)
 
     def stop_task(self) -> dict[str, float]:
         """Stop tracking emissions and return the emissions data.
 
         Returns:
             Dictionary containing emissions metrics.
         """
+        if not self._enabled:
+            return {}
+
         emissions_data = self.tracker.stop_task()
         _ = self.tracker.stop()
         return self._process_metrics(emissions_data)
diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json
@@ -4,7 +4,7 @@
             "additionalProperties": false,
             "properties": {
                 "model_name": {
-                    "default": "cross-encoder/ms-marco-MiniLM-L-6-v2",
+                    "default": "cross-encoder/ms-marco-MiniLM-L6-v2",
                     "description": "Name of the hugging face model.",
                     "title": "Model Name",
                     "type": "string"
@@ -424,11 +424,11 @@
         "cross_encoder_config": {
             "$ref": "#/$defs/CrossEncoderConfig",
             "default": {
-                "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2",
+                "model_name": "cross-encoder/ms-marco-MiniLM-L6-v2",
                 "batch_size": 32,
                 "device": null,
                 "tokenizer_config": {
-                    "max_length": null,
+                    "max_length": 512,
                     "padding": true,
                     "truncation": true
                 },
diff --git a/tests/_transformers/test_nli_transformer.py b/tests/_transformers/test_nli_transformer.py
@@ -13,7 +13,7 @@ def data_handler():
 
 
 def test_nli_transformer_predict_without_trained_head(data_handler):
-    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True})
+    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": True})
     with pytest.raises(ValueError, match="Classifier is not trained yet"):
         model.predict(data_handler.train_utterances(0))
 
@@ -48,7 +48,7 @@ def check_ranking(ranked, labels):
 
 
 def test_nli_transformer_predict_with_train_head(data_handler):
-    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True})
+    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": True})
     texts = data_handler.train_utterances(0)
     labels = data_handler.train_labels(0)
     model.fit(texts, labels)
@@ -60,7 +60,7 @@ def test_nli_transformer_predict_with_train_head(data_handler):
 
 
 def test_nli_transformer_predict_default(data_handler):
-    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": False})
+    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": False})
     texts = data_handler.train_utterances(0)
     labels = data_handler.train_labels(0)
     predicted = model.predict(build_pairs(texts))
@@ -71,7 +71,7 @@ def test_nli_transformer_predict_default(data_handler):
 
 
 def test_nli_transformer_predict_default_with_fit(data_handler):
-    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": False})
+    model = Ranker(cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": False})
     texts = data_handler.train_utterances(0)
     labels = data_handler.train_labels(0)
     model.fit(texts, labels)
diff --git a/tests/assets/configs/description.yaml b/tests/assets/configs/description.yaml
@@ -14,7 +14,7 @@
       embedder_config:
         - model_name: sentence-transformers/all-MiniLM-L6-v2
       cross_encoder_config:
-        - model_name: cross-encoder/ms-marco-MiniLM-L-6-v2
+        - model_name: cross-encoder/ms-marco-MiniLM-L6-v2
       encoder_type: [cross, bi]
 - node_type: decision
   target_metric: decision_accuracy
diff --git a/tests/assets/configs/full_training.yaml b/tests/assets/configs/full_training.yaml
@@ -30,4 +30,4 @@ embedder_config:
   use_cache: true
 cross_encoder_config:
   batch_size: 32
-  model_name: cross-encoder/ms-marco-MiniLM-L-6-v2
+  model_name: cross-encoder/ms-marco-MiniLM-L6-v2
diff --git a/tests/assets/configs/multiclass.yaml b/tests/assets/configs/multiclass.yaml
@@ -15,7 +15,7 @@
     - module_name: linear
     - module_name: dnnc
       cross_encoder_config:
-        - model_name: cross-encoder/ms-marco-MiniLM-L-6-v2
+        - model_name: cross-encoder/ms-marco-MiniLM-L6-v2
           train_head: true
         - avsolatorio/GIST-small-Embedding-v0
       k: [1, 3]
@@ -25,7 +25,7 @@
       m: [ 2, 3 ]
       use_crosencoder_scores: [true, false]
       cross_encoder_config:
-        - cross-encoder/ms-marco-MiniLM-L-6-v2
+        - cross-encoder/ms-marco-MiniLM-L6-v2
     - module_name: sklearn
       clf_name: [RandomForestClassifier]
       n_estimators: [5, 10]
diff --git a/tests/assets/configs/multilabel.yaml b/tests/assets/configs/multilabel.yaml
@@ -21,7 +21,7 @@
       use_crosencoder_scores: [true, false]
       m: [ 2, 3 ]
       cross_encoder_config:
-        - model_name: cross-encoder/ms-marco-MiniLM-L-6-v2
+        - model_name: cross-encoder/ms-marco-MiniLM-L6-v2
     - module_name: sklearn
       clf_name: [RandomForestClassifier]
       n_estimators: [5, 10]
diff --git a/tests/configs/test_combined_config.py b/tests/configs/test_combined_config.py
@@ -17,7 +17,7 @@ def valid_optimizer_config():
                 {
                     "module_name": "dnnc",
                     "cross_encoder_config": [
-                        {"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True},
+                        {"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": True},
                         {"model_name": "avsolatorio/GIST-small-Embedding-v0", "train_head": False},
                     ],
                     "k": [1, 3],
@@ -63,7 +63,7 @@ def test_invalid_optimizer_config_missing_field():
             "node_type": "scoring",
             # Missing "target_metric"
             "search_space": [
-                {"module_name": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], "k": [1, 3]}
+                {"module_name": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L6-v2"], "k": [1, 3]}
             ],
         }
     ]
@@ -80,7 +80,7 @@ def test_invalid_optimizer_config_wrong_type():
         "search_space": [
             {
                 "module_name": "dnnc",
-                "cross_encoder_name": "cross-encoder/ms-marco-MiniLM-L-6-v2",  # Should be a list
+                "cross_encoder_name": "cross-encoder/ms-marco-MiniLM-L6-v2",  # Should be a list
                 "k": "wrong_type",  # Should be a list of integers
                 "train_head": "true",  # Should be a boolean, not a string
             }
diff --git a/tests/configs/test_scoring.py b/tests/configs/test_scoring.py
@@ -13,8 +13,8 @@ def valid_scoring_config():
             {
                 "module_name": "dnnc",
                 "cross_encoder_config": [
-                    "cross-encoder/ms-marco-MiniLM-L-6-v2",
-                    {"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True},
+                    "cross-encoder/ms-marco-MiniLM-L6-v2",
+                    {"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": True},
                 ],
                 "embedder_config": ["sergeyzh/rubert-tiny-turbo"],
                 "k": [5, 10],
@@ -40,7 +40,7 @@ def valid_scoring_config():
             },
             {
                 "module_name": "rerank",
-                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"],
+                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L6-v2"],
                 "embedder_config": ["sergeyzh/rubert-tiny-turbo"],
                 "k": [5],
                 "weights": ["distance"],
@@ -71,7 +71,7 @@ def test_invalid_scoring_config_missing_field():
         "node_type": "scoring",
         # Missing "target_metric"
         "search_space": [
-            {"module_name": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], "k": [5, 10]}
+            {"module_name": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L6-v2"], "k": [5, 10]}
         ],
     }
 
diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py
@@ -62,7 +62,7 @@ def test_description_scorer_cross_encoder(dataset, expected_prediction, multilab
     data_handler = DataHandler(dataset)
 
     scorer = DescriptionScorer(
-        cross_encoder_config="cross-encoder/ms-marco-MiniLM-L-6-v2", encoder_type="cross", temperature=0.3
+        cross_encoder_config="cross-encoder/ms-marco-MiniLM-L6-v2", encoder_type="cross", temperature=0.3
     )
 
     scorer.fit(
@@ -98,7 +98,7 @@ def test_description_scorer_cross_encoder(dataset, expected_prediction, multilab
         scorer.dump(temp_dir)
 
         new_scorer = DescriptionScorer(
-            cross_encoder_config="cross-encoder/ms-marco-MiniLM-L-6-v2", encoder_type="cross", temperature=0.3
+            cross_encoder_config="cross-encoder/ms-marco-MiniLM-L6-v2", encoder_type="cross", temperature=0.3
         )
         new_scorer.load(temp_dir)
 
diff --git a/tests/modules/scoring/test_dnnc.py b/tests/modules/scoring/test_dnnc.py
@@ -10,7 +10,7 @@ def test_base_dnnc(dataset, train_head, pred_score):
     data_handler = DataHandler(dataset)
 
     scorer = DNNCScorer(
-        cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": train_head},
+        cross_encoder_config={"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": train_head},
         embedder_config="sergeyzh/rubert-tiny-turbo",
         k=3,
     )
diff --git a/tests/modules/scoring/test_rerank_scorer.py b/tests/modules/scoring/test_rerank_scorer.py
@@ -12,7 +12,7 @@ def test_base_rerank_scorer(dataset):
         weights="distance",
         embedder_config="sergeyzh/rubert-tiny-turbo",
         m=2,
-        cross_encoder_config="cross-encoder/ms-marco-MiniLM-L-6-v2",
+        cross_encoder_config="cross-encoder/ms-marco-MiniLM-L6-v2",
     )
 
     test_data = [
diff --git a/tests/nodes/test_scoring.py b/tests/nodes/test_scoring.py
@@ -32,8 +32,8 @@ def test_scoring_multiclass(embedding_optimizer_multiclass):
             {
                 "module_name": "dnnc",
                 "cross_encoder_config": [
-                    {"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": False},
-                    {"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True},
+                    {"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": False},
+                    {"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": True},
                 ],
                 "embedder_config": ["sergeyzh/rubert-tiny-turbo"],
                 "k": [3],
@@ -48,14 +48,14 @@ def test_scoring_multiclass(embedding_optimizer_multiclass):
                 "temperature": [0.05, 0.1, 0.5, 1.0],
                 "embedder_config": ["sergeyzh/rubert-tiny-turbo"],
                 "encoder_type": ["bi", "cross"],
-                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"],
+                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L6-v2"],
             },
             {
                 "module_name": "rerank",
                 "weights": ["uniform", "distance", "closest"],
                 "k": [3],
                 "m": [2],
-                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"],
+                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L6-v2"],
                 "embedder_config": ["sergeyzh/rubert-tiny-turbo"],
             },
         ],
@@ -111,7 +111,7 @@ def test_scoring_multilabel(embedding_optimizer_multilabel):
                 "weights": ["uniform", "distance", "closest"],
                 "k": [3],
                 "m": [2],
-                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"],
+                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L6-v2"],
                 "embedder_config": ["sergeyzh/rubert-tiny-turbo"],
             },
         ],
diff --git a/user_guides/advanced/02_automl.py b/user_guides/advanced/02_automl.py
@@ -98,7 +98,7 @@
             {"module_name": "linear"},
             {
                 "module_name": "dnnc",
-                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"],
+                "cross_encoder_config": ["DiTy/cross-encoder-russian-msmarco"],
                 "k": [1, 3, 5, 10],
             },
         ],
diff --git a/user_guides/advanced/03_reporting.py b/user_guides/advanced/03_reporting.py
@@ -26,7 +26,7 @@
             {"module_name": "linear"},
             {
                 "module_name": "dnnc",
-                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"],
+                "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L6-v2"],
                 "k": [1, 3, 5, 10],
             },
         ],

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ class DNNCScorer(BaseScorer):`
`36`	`36`	`utterances = ["what is your name?", "how are you?"]`
`37`	`37`	`labels = [0, 1]`
`38`	`38`	`scorer = DNNCScorer(`
`39`		`- cross_encoder_config="cross-encoder/ms-marco-MiniLM-L-6-v2",`
	`39`	`+ cross_encoder_config="cross-encoder/ms-marco-MiniLM-L6-v2",`
`40`	`40`	`embedder_config="sergeyzh/rubert-tiny-turbo",`
`41`	`41`	`k=5,`
`42`	`42`	`)`
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ def valid_optimizer_config():`
`17`	`17`	`{`
`18`	`18`	`"module_name": "dnnc",`
`19`	`19`	`"cross_encoder_config": [`
`20`		`- {"model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True},`
	`20`	`+ {"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2", "train_head": True},`
`21`	`21`	`{"model_name": "avsolatorio/GIST-small-Embedding-v0", "train_head": False},`
`22`	`22`	`],`
`23`	`23`	`"k": [1, 3],`
`@@ -63,7 +63,7 @@ def test_invalid_optimizer_config_missing_field():`
`63`	`63`	`"node_type": "scoring",`
`64`	`64`	`# Missing "target_metric"`
`65`	`65`	`"search_space": [`
`66`		`- {"module_name": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], "k": [1, 3]}`
	`66`	`+ {"module_name": "dnnc", "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L6-v2"], "k": [1, 3]}`
`67`	`67`	`],`
`68`	`68`	`}`
`69`	`69`	`]`
`@@ -80,7 +80,7 @@ def test_invalid_optimizer_config_wrong_type():`
`80`	`80`	`"search_space": [`
`81`	`81`	`{`
`82`	`82`	`"module_name": "dnnc",`
`83`		`- "cross_encoder_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", # Should be a list`
	`83`	`+ "cross_encoder_name": "cross-encoder/ms-marco-MiniLM-L6-v2", # Should be a list`
`84`	`84`	`"k": "wrong_type", # Should be a list of integers`
`85`	`85`	`"train_head": "true", # Should be a boolean, not a string`
`86`	`86`	`}`