From 32f27eb295e3b7cc1a4e139766f97f3d3e7b0d23 Mon Sep 17 00:00:00 2001
From: Pashteticus
Date: Wed, 6 Aug 2025 06:05:44 +0300
Subject: [PATCH] Add MathDemon_Demidovich dataset to the evaluation pipeline

---
 runner.py          |   7 +-
 src/leaderboard.py | 321 ++++++++++++++++++++++++++++++++++++++++++---
 src/mat_boy.py     |  22 ++--
 3 files changed, 321 insertions(+), 29 deletions(-)

diff --git a/runner.py b/runner.py
index 84cfe3d..9f13fc9 100644
--- a/runner.py
+++ b/runner.py
@@ -27,6 +27,7 @@ def main() -> None:
     Примеры использования:
     python runner.py # Запустить оценку на всех датасетах (по умолчанию)
     python runner.py --dataset russianmath # Запустить только на датасете RussianMath
+    python runner.py --dataset demidovich # Запустить только на датасете MathDemon_Demidovich
     python runner.py --dataset physics # Запустить только на датасете RussianPhysics
     python runner.py --no-cache # Игнорировать кэш и переоценить все модели
     python runner.py --max-workers 8 # Использовать 8 параллельных потоков
@@ -51,7 +52,7 @@ def main() -> None:
     )
     parser.add_argument(
         "--dataset",
-        choices=["all", "russianmath", "physics"],
+        choices=["all", "russianmath", "physics", "demidovich"],
         default="all",
-        help="Выбор датасета для оценки: all (все), russianmath, physics (по умолчанию: all)",
+        help="Выбор датасета для оценки: all (все), russianmath, physics, demidovich (по умолчанию: all)",
     )
@@ -91,6 +92,10 @@ def main() -> None:
         print("\nЗапуск оценки на датасете RussianMath")
         leaderboard.evaluate_all_models(system_prompts)
 
+    if args.dataset == "all" or args.dataset == "demidovich":
+        print("\nЗапуск оценки на датасете MathDemon_Demidovich")
+        leaderboard.evaluate_demidovich_models(system_prompts)
+
     if args.dataset == "all" or args.dataset == "physics":
         print("\nЗапуск оценки на датасете RussianPhysics")
         leaderboard.evaluate_physics_models(system_prompts)
diff --git a/src/leaderboard.py b/src/leaderboard.py
index a0a2c5f..d99d435 100644
--- a/src/leaderboard.py
+++ b/src/leaderboard.py
@@ -6,7 +6,7 @@
 from datetime import datetime
 
 from src.equality_checker import DoomSlayer
 from src.sampler import OaiSampler
-from src.mat_boy import RussianMathEval, MathDemonEval
+from src.mat_boy import RussianMathEval, MathDemonEval, RussianPhysicsEval
 from src.types import SingleEvalResult
 from concurrent.futures import ThreadPoolExecutor, TimeoutError
 from tqdm.auto import tqdm
@@ -46,7 +46,7 @@ def __init__(
         self.cache_dir: Path = self.output_dir / "cache"
         self.cache_dir.mkdir(exist_ok=True)
 
-        with open(config_path, "r") as f:
+        with open(config_path, "r", encoding="utf-8") as f:
             self.config: Dict[str, Any] = yaml.safe_load(f)
         self.model_links: Dict[str, str] = self.config.get("model_links", {})
         self.equality_checker: DoomSlayer = DoomSlayer()
@@ -268,6 +268,7 @@ def _combine_detailed_reports(
         model_name: str,
         timestamp_math: Optional[str] = None,
         timestamp_physics: Optional[str] = None,
+        timestamp_demidovich: Optional[str] = None,
     ) -> Optional[Path]:
         """
-        Объединяет отчеты по математике и физике в один комбинированный отчет.
+        Объединяет отчеты по математике, физике и Демидовичу в один комбинированный отчет.
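Review note (between hunks, not part of the patch): throughout this change the combined
score moves from a two-way to a three-way unweighted mean. A worked example with
illustrative scores, to make the effect concrete:

    math, physics, demidovich = 0.80, 0.60, 0.40
    combined = (math + physics + demidovich) / 3   # 0.600
    # Before this patch: (0.80 + 0.60) / 2 = 0.700, so combined rows computed
    # before and after the patch are not directly comparable.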
@@ -287,14 +288,16 @@ def _combine_detailed_reports(
             print(f"No detailed reports found for model {model_name}")
             return None
 
-        if timestamp_math and timestamp_physics:
+        if timestamp_math and timestamp_physics and timestamp_demidovich:
             math_report = model_dir / f"details_{timestamp_math}.md"
             physics_report = (
                 model_dir / f"details_{timestamp_physics}_RussianPhysics.md"
             )
+            demidovich_report = model_dir / f"details_{timestamp_demidovich}_MathDemon_Demidovich.md"
         else:
             math_reports = list(model_dir.glob("details_*.md"))
             physics_reports = list(model_dir.glob("details_*_RussianPhysics.md"))
+            demidovich_reports = list(model_dir.glob("details_*_MathDemon_Demidovich.md"))
 
-            if not math_reports or not physics_reports:
-                print(f"Missing either math or physics reports for model {model_name}")
+            if not math_reports or not physics_reports or not demidovich_reports:
+                print(f"Missing math, physics or demidovich reports for model {model_name}")
                 return None
@@ -306,12 +309,15 @@ def _combine_detailed_reports(
             physics_report = sorted(
                 physics_reports, key=lambda x: x.stat().st_mtime, reverse=True
             )[0]
+            demidovich_report = sorted(
+                demidovich_reports, key=lambda x: x.stat().st_mtime, reverse=True
+            )[0]
 
             timestamp_math = math_report.stem.split("_")[1]
             timestamp_physics = physics_report.stem.split("_")[1]
 
-        if not math_report.exists() or not physics_report.exists():
-            print(f"Could not find both reports for model {model_name}")
+        if not math_report.exists() or not physics_report.exists() or not demidovich_report.exists():
+            print(f"Could not find all required reports for model {model_name}")
             return None
 
         combined_report = model_dir / f"details_{timestamp_math}_combined.md"
@@ -328,13 +334,19 @@ def _combine_detailed_reports(
         with open(physics_report, "r", encoding="utf-8") as f:
             physics_content = f.read()
 
+        with open(demidovich_report, "r", encoding="utf-8") as f:
+            demidovich_content = f.read()
+
         math_lines = math_content.split("\n")
         physics_lines = physics_content.split("\n")
+        demidovich_lines = demidovich_content.split("\n")
 
         math_start = 0
         physics_start = 0
+        demidovich_start = 0
         math_summary_start = 0
         physics_summary_start = 0
+        demidovich_summary_start = 0
 
         for i, line in enumerate(math_lines):
             if line.startswith("## Summary"):
@@ -350,6 +362,13 @@ def _combine_detailed_reports(
                 physics_start = i
                 break
 
+        for i, line in enumerate(demidovich_lines):
+            if line.startswith("## Summary"):
+                demidovich_summary_start = i
+            if line.startswith("## Example 1"):
+                demidovich_start = i
+                break
+
         combined_content = []
         for i in range(0, math_summary_start):
             if i == 0:
@@ -375,16 +394,28 @@ def _combine_detailed_reports(
             or "answers" in line
             or "Dataset" in line
         ]
+        demidovich_summary_lines = [
+            line
+            for line in demidovich_lines[demidovich_summary_start:demidovich_start]
+            if "Score" in line
+            or "examples" in line
+            or "answers" in line
+            or "Dataset" in line
+        ]
 
         math_score = next((line for line in math_summary_lines if "Score" in line), "")
         physics_score = next(
             (line for line in physics_summary_lines if "Score" in line), ""
         )
+        demidovich_score = next(
+            (line for line in demidovich_summary_lines if "Score" in line), ""
+        )
 
         try:
             math_score_value = float(math_score.split(":")[-1].strip())
             physics_score_value = float(physics_score.split(":")[-1].strip())
-            combined_score = (math_score_value + physics_score_value) / 2
+            demidovich_score_value = float(demidovich_score.split(":")[-1].strip())
+            combined_score = (math_score_value + physics_score_value + demidovich_score_value) / 3
             combined_score_line = f"- **Combined Score**: {combined_score:.3f}"
         except (ValueError, IndexError):
             combined_score_line = "- **Combined Score**: N/A"
@@ -403,6 +434,12 @@ def _combine_detailed_reports(
         combined_content.append("")
         combined_content.append("---")
         combined_content.append("")
+        combined_content.append("### Demidovich")
+        for line in demidovich_summary_lines:
+            combined_content.append(line)
+        combined_content.append("")
+        combined_content.append("---")
+        combined_content.append("")
 
         combined_content.append("## Mathematics Examples")
         combined_content.append("")
@@ -412,6 +449,10 @@ def _combine_detailed_reports(
         combined_content.append("")
         combined_content.extend(physics_lines[physics_start:])
 
+        combined_content.append("## Demidovich Examples")
+        combined_content.append("")
+        combined_content.extend(demidovich_lines[demidovich_start:])
+
         with open(combined_report, "w", encoding="utf-8") as f:
             f.write("\n".join(combined_content))
 
@@ -432,7 +473,7 @@ def _prepare_combined_reports(self) -> None:
             timestamp = result.get("timestamp")
 
             if model_name not in models_with_both_datasets:
-                models_with_both_datasets[model_name] = {"math": None, "physics": None}
+                models_with_both_datasets[model_name] = {"math": None, "physics": None, "demidovich": None}
 
             if dataset == "RussianMath" or dataset is None:
                 if (
@@ -446,13 +487,20 @@ def _prepare_combined_reports(self) -> None:
                     or timestamp > models_with_both_datasets[model_name]["physics"]
                 ):
                     models_with_both_datasets[model_name]["physics"] = timestamp
+            elif dataset == "MathDemon_Demidovich":
+                if (
+                    not models_with_both_datasets[model_name]["demidovich"]
+                    or timestamp > models_with_both_datasets[model_name]["demidovich"]
+                ):
+                    models_with_both_datasets[model_name]["demidovich"] = timestamp
 
         for model_name, timestamps in models_with_both_datasets.items():
-            if timestamps["math"] and timestamps["physics"]:
+            if timestamps["math"] and timestamps["physics"] and timestamps["demidovich"]:
                 self._combine_detailed_reports(
                     model_name=model_name,
                     timestamp_math=timestamps["math"],
                     timestamp_physics=timestamps["physics"],
+                    timestamp_demidovich=timestamps["demidovich"],
                 )
 
     # МЕТОДЫ ОЦЕНКИ МОДЕЛЕЙ
@@ -707,8 +755,6 @@ def evaluate_physics_model(
             yaml.dump(temp_config, f)
 
         try:
-            from src.mat_boy import RussianPhysicsEval
-
             sampler = OaiSampler(str(temp_config_path))
             evaluator = RussianPhysicsEval(
                 equality_checker=self.equality_checker,
@@ -886,6 +932,222 @@
 
         self._save_results()
 
+    def evaluate_demidovich_model(
+        self, model_name: str, system_prompt: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Оценивает одну модель на датасете MathDemon_Demidovich.
+
+        Args:
+            model_name: Название модели для оценки
+            system_prompt: Системный промпт для модели (опционально)
+
+        Returns:
+            Словарь с результатами оценки модели
+        """
+        cache_key = f"{self._get_cache_key(model_name, system_prompt)}_demidovich"
+        cached_result = self._get_cached_result(cache_key)
+
+        if cached_result is not None:
+            if self.config.get("debug"):
+                print(f"\nUsing cached MathDemon_Demidovich result for {model_name}")
+            return cached_result
+
+        if self.config.get("debug"):
+            print(f"\nEvaluating {model_name} on MathDemon_Demidovich")
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        safe_model_name = model_name.replace("/", "_")
+
+        temp_config = self.config.copy()
+        temp_config["model_list"] = [model_name]
+        if system_prompt is not None:
+            temp_config[model_name]["system_prompt"] = system_prompt
+
+        temp_config_path = (
+            self.output_dir / f"temp_config_demidovich_{safe_model_name}.yaml"
+        )
+        with open(temp_config_path, "w", encoding="utf-8") as f:
+            yaml.dump(temp_config, f)
+
+        try:
+            sampler = OaiSampler(str(temp_config_path))
+            evaluator = MathDemonEval(
+                equality_checker=self.equality_checker,
+                num_examples=self.config.get("num_examples", None),
+                debug=self.config.get("debug", False),
+            )
+
+            start_time = time.time()
+            results = evaluator(sampler)
+            evaluation_time = time.time() - start_time
+
+            self._save_detailed_results(
+                model_name, results.results, timestamp, "MathDemon_Demidovich"
+            )
+
+            total_tokens = sum(
+                r.tokens for r in results.results if hasattr(r, "tokens")
+            )
+
+            model_result = {
+                "model_name": model_name,
+                "score": results.score,
+                "total_tokens": total_tokens,
+                "evaluation_time": evaluation_time,
+                "system_prompt": system_prompt,
+                "timestamp": timestamp,
+                "cache_key": cache_key,
+                "dataset": "MathDemon_Demidovich",
+            }
+
+            self._save_to_cache(cache_key, model_result)
+            self.results[f"{model_name}_{timestamp}_demidovich"] = model_result
+            self._save_results()
+
+            return model_result
+
+        finally:
+            temp_config_path.unlink(missing_ok=True)
+
+    def evaluate_demidovich_model_parallel(
+        self, args: Tuple[str, Optional[str]]
+    ) -> Dict[str, Any]:
+        """
+        Оценивает одну модель на датасете MathDemon_Demidovich (для использования в ThreadPoolExecutor).
+
+        Args:
+            args: Кортеж (model_name, system_prompt)
+
+        Returns:
+            Словарь с результатами оценки модели
+        """
+        model_name, system_prompt = args
+        return self.evaluate_demidovich_model(model_name, system_prompt)
+
+    def evaluate_demidovich_models(
+        self, system_prompts: Optional[Dict[str, str]] = None
+    ) -> None:
+        """
+        Оценивает все модели на датасете MathDemon_Demidovich из конфига параллельно с использованием кэша.
+
+        Args:
+            system_prompts: Словарь системных промптов для моделей
+                (model_name -> system_prompt)
+        """
+        if system_prompts is None:
+            system_prompts = {}
+
+        measured_models = set()
+        for result in self.results.values():
+            if result.get("dataset") == "MathDemon_Demidovich":
+                measured_models.add(result.get("model_name"))
+
+        config_models = set(self.config["model_list"])
+        new_models = config_models - measured_models
+
+        # Определяем модели, которые нужно перетестировать
+        models_with_incomplete_results = set()
+        if self.retry_incomplete:
+            # Получаем модели, у которых есть результаты для MathDemon_Demidovich,
+            # но возможно были ошибки или прерывания
+            models_with_demidovich_results = set()
+            for result in self.results.values():
+                if result.get("dataset") == "MathDemon_Demidovich" and result["model_name"] in config_models:
+                    if result.get("score") is not None and result.get("score") > 0:
+                        models_with_demidovich_results.add(result["model_name"])
+
+            models_with_incomplete_results = config_models - models_with_demidovich_results - new_models
+            if models_with_incomplete_results:
+                print(f"\nFound models with incomplete MathDemon_Demidovich results that will be retested: {', '.join(models_with_incomplete_results)}")
+
+        if new_models:
+            print(
+                f"\nFound new models to evaluate on MathDemon_Demidovich: {', '.join(new_models)}"
+            )
+
+        # Объединяем новые модели и модели с неполными результатами
+        models_to_evaluate = new_models | models_with_incomplete_results
+
+        if models_to_evaluate:
+            uncached_args = [
+                (model_name, system_prompts.get(model_name))
+                for model_name in models_to_evaluate
+            ]
+
+            print(f"\nEvaluating {len(uncached_args)} models on MathDemon_Demidovich...")
+
+            def handle_sigint(signum: int, frame: Any) -> None:
+                print(
+                    "\nGracefully shutting down... Please wait for current evaluations to complete."
+ ) + executor.shutdown(wait=True) + sys.exit(0) + + original_sigint = signal.getsignal(signal.SIGINT) + signal.signal(signal.SIGINT, handle_sigint) + + try: + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = [ + executor.submit(self.evaluate_demidovich_model_parallel, args) + for args in uncached_args + ] + + pbar = tqdm( + total=len(futures), + desc="Evaluating models on MathDemon_Demidovich", + leave=True, + ) + + completed = 0 + while completed < len(futures): + for i, future in enumerate(futures): + if future.done() and not hasattr(future, "_processed"): + try: + result = future.result(timeout=1) + if result: + key = f"{result['model_name']}_{result['timestamp']}_demidovich" + self.results[key] = result + self._save_to_cache( + f"{self._get_cache_key(result['model_name'], result.get('system_prompt'))}_demidovich", + result, + ) + setattr(future, "_processed", True) + completed += 1 + pbar.update(1) + except TimeoutError: + print( + "\nWarning: Evaluation timed out for one of the models" + ) + except Exception as e: + print(f"\nError during evaluation: {str(e)}") + setattr(future, "_processed", True) + completed += 1 + pbar.update(1) + + time.sleep(0.1) + + pbar.close() + + finally: + signal.signal(signal.SIGINT, original_sigint) + self._save_results() + else: + print("\nNo new models to evaluate on MathDemon_Demidovich, using cached results") + + missing_models = config_models - set( + result["model_name"] + for result in self.results.values() + if result.get("dataset") == "MathDemon_Demidovich" + ) + if missing_models: + print( + f"\nWarning: Missing MathDemon_Demidovich results for models: {', '.join(missing_models)}" + ) + + self._save_results() + + def _evaluate_subset_parallel(self, subset_name: str) -> Dict[str, Dict[str, Any]]: """ Оценивает все модели на одном подмножестве MathDemon параллельно. 
@@ -1158,6 +1420,7 @@ def calculate_combined_scores(self) -> None:
                 model_results[model_name] = {
                     "RussianMath": None,
                     "RussianPhysics": None,
+                    "MathDemon_Demidovich": None,
                 }
 
             dataset = result.get("dataset", "RussianMath")
@@ -1181,24 +1444,26 @@ def calculate_combined_scores(self) -> None:
-            if results["RussianMath"] and results["RussianPhysics"]:
+            if results["RussianMath"] and results["RussianPhysics"] and results["MathDemon_Demidovich"]:
                 math_score = results["RussianMath"]["score"]
                 physics_score = results["RussianPhysics"]["score"]
-                combined_score = (math_score + physics_score) / 2.0
+                demidovich_score = results["MathDemon_Demidovich"]["score"]
+                combined_score = (math_score + physics_score + demidovich_score) / 3.0
 
                 total_tokens = results["RussianMath"].get("total_tokens", 0) + results[
                     "RussianPhysics"
-                ].get("total_tokens", 0)
+                ].get("total_tokens", 0) + results["MathDemon_Demidovich"].get("total_tokens", 0)
 
                 total_time = results["RussianMath"].get("evaluation_time", 0) + results[
                     "RussianPhysics"
-                ].get("evaluation_time", 0)
+                ].get("evaluation_time", 0) + results["MathDemon_Demidovich"].get("evaluation_time", 0)
 
                 system_prompt = results["RussianMath"].get("system_prompt") or results[
                     "RussianPhysics"
-                ].get("system_prompt")
+                ].get("system_prompt") or results["MathDemon_Demidovich"].get("system_prompt")
 
                 self.results[f"{model_name}_Combined_{timestamp}"] = {
                     "model_name": model_name,
                     "score": combined_score,
                     "math_score": math_score,
                     "physics_score": physics_score,
+                    "demidovich_score": demidovich_score,
                     "total_tokens": total_tokens,
                     "evaluation_time": total_time,
                     "system_prompt": system_prompt,
@@ -1207,7 +1472,7 @@ def calculate_combined_scores(self) -> None:
 
                 print(
-                    f"Model {model_name} combined score: {combined_score:.3f} (Math: {math_score:.3f}, Physics: {physics_score:.3f})"
+                    f"Model {model_name} combined score: {combined_score:.3f} (Math: {math_score:.3f}, Physics: {physics_score:.3f}, Demidovich: {demidovich_score:.3f})"
                 )
 
             elif results["RussianMath"]:
@@ -1219,6 +1484,10 @@ def calculate_combined_scores(self) -> None:
                 print(
                     f"Warning: Model {model_name} has only RussianPhysics results, skipping combined score calculation"
                 )
+            elif results["MathDemon_Demidovich"]:
+                print(
+                    f"Warning: Model {model_name} has only MathDemon_Demidovich results, skipping combined score calculation"
+                )
 
         self._save_results()
@@ -1234,8 +1503,8 @@ def generate_markdown(self) -> str:
         md = "# Math Evaluation Leaderboard\n\n"
         md += f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
 
-        md += "| Model | Combined Score | RussianMath Score | RussianPhysics Score | Tokens Used | System Prompt | Evaluation Time | Details |\n"
-        md += "|-------|---------------|-------------------|----------------------|-------------|---------------|----------------|--------|\n"
+        md += "| Model | Combined Score | RussianMath Score | RussianPhysics Score | Demidovich Score | Tokens Used | System Prompt | Evaluation Time | Details |\n"
+        md += "|-------|---------------|-------------------|----------------------|------------------|-------------|---------------|----------------|--------|\n"
 
         self._prepare_combined_reports()
@@ -1249,6 +1518,7 @@ def generate_markdown(self) -> str:
                     "combined": None,
                     "russianmath": None,
                     "physics": None,
+                    "demidovich": None,
                 }
 
             dataset = result.get("dataset")
@@ -1267,6 +1537,12 @@ def generate_markdown(self) -> str:
                     or result["score"] > model_data[model_name]["physics"]["score"]
                 ):
                     model_data[model_name]["physics"] = result
+            elif dataset == "MathDemon_Demidovich":
+                if (
+                    not model_data[model_name]["demidovich"]
+                    or result["score"] > model_data[model_name]["demidovich"]["score"]
+                ):
+                    model_data[model_name]["demidovich"] = result
 
         def get_sort_score(model_name: str) -> float:
             data = model_data[model_name]
@@ -1287,15 +1563,18 @@ def generate_markdown(self) -> str:
             combined_score = data["combined"]["score"] if data["combined"] else "-"
             rm_score = data["russianmath"]["score"] if data["russianmath"] else "-"
             physics_score = data["physics"]["score"] if data["physics"] else "-"
+            demidovich_score = data["demidovich"]["score"] if data["demidovich"] else "-"
 
             total_tokens = 0
             if data["russianmath"]:
                 total_tokens += data["russianmath"].get("total_tokens", 0)
             if data["physics"]:
                 total_tokens += data["physics"].get("total_tokens", 0)
+            if data["demidovich"]:
+                total_tokens += data["demidovich"].get("total_tokens", 0)
 
             system_prompt = None
-            for result_type in ["russianmath", "physics", "combined"]:
+            for result_type in ["russianmath", "physics", "demidovich", "combined"]:
                 if data[result_type] and data[result_type].get("system_prompt"):
                     system_prompt = data[result_type]["system_prompt"]
                     break
@@ -1310,13 +1589,16 @@ def generate_markdown(self) -> str:
             eval_time += data["russianmath"].get("evaluation_time", 0)
             if data["physics"]:
                 eval_time += data["physics"].get("evaluation_time", 0)
+            if data["demidovich"]:
+                eval_time += data["demidovich"].get("evaluation_time", 0)
 
             details = ""
             safe_model_name = model_name.replace("/", "_")
 
-            if data["russianmath"] and data["physics"]:
+            if data["russianmath"] and data["physics"] and data["demidovich"]:
                 math_timestamp = data["russianmath"]["timestamp"]
                 physics_timestamp = data["physics"]["timestamp"]
+                demidovich_timestamp = data["demidovich"]["timestamp"]
 
                 combined_report = (
                     self.details_dir
@@ -1347,6 +1629,7 @@ def generate_markdown(self) -> str:
             md += f"| {combined_score if isinstance(combined_score, str) else f'{combined_score:.3f}'} "
             md += f"| {rm_score if isinstance(rm_score, str) else f'{rm_score:.3f}'} "
             md += f"| {physics_score if isinstance(physics_score, str) else f'{physics_score:.3f}'} "
+            md += f"| {demidovich_score if isinstance(demidovich_score, str) else f'{demidovich_score:.3f}'} "
             md += f"| {total_tokens} "
             md += f"| {system_prompt} "
             md += f"| {eval_time:.1f}s "
diff --git a/src/mat_boy.py b/src/mat_boy.py
index 58546eb..33bb314 100644
--- a/src/mat_boy.py
+++ b/src/mat_boy.py
@@ -254,9 +254,10 @@ class MathDemonEval(Eval):
 
     def __init__(
         self,
-        subset_name: str,
+        equality_checker: SamplerBase,
         num_examples: Optional[int] = None,
-        debug: bool = False,
+        n_repeats: int = 1,
+        debug: bool = False,
     ) -> None:
         """
         Инициализирует оценку на подсетах MathDemon_Demidovich.
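Review note (between hunks, not part of the patch): with subset_name removed,
MathDemonEval now always covers all nine Demidovich subsets, and the equality
checker is injected at construction time rather than via set_equality_checker.
A minimal construction sketch matching the new signature (DoomSlayer is the
checker leaderboard.py already wires in; the num_examples value and the sampler
are illustrative):

    from src.equality_checker import DoomSlayer
    from src.mat_boy import MathDemonEval

    evaluator = MathDemonEval(
        equality_checker=DoomSlayer(),
        num_examples=10,   # illustrative cap; None evaluates every example
        debug=True,
    )
    results = evaluator(sampler)  # sampler: an OaiSampler, as in evaluate_demidovich_model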
@@ -266,21 +267,31 @@ def __init__(
-            num_examples: Количество примеров для оценки (по умолчанию 1)
+            equality_checker: Модель-проверяющий для сравнения ответов
+            num_examples: Количество примеров для оценки (по умолчанию используются все)
+            n_repeats: Количество повторов прогона каждого примера (по умолчанию 1)
             debug: Режим отладки для подробного вывода
         """
-        dataset = load_dataset("Vikhrmodels/MathDemon_Demidovich", subset_name)
-        examples = [
-            {"task": row["translated_conditions"], "Answer": row["translated_answers"]}
-            for row in dataset["train"]
-        ]
+        tasks = [
+            "Approximation_by_Polynomials", "Continuous_Functions",
+            "Convex_Functions", "Differentiation", "Improper_Integrals",
+            "Infinite_Series", "Integration", "Sequences_and_Limits",
+            "Series_of_Functions",
+        ]
+        subsets = [load_dataset("Vikhrmodels/MathDemon_Demidovich", task) for task in tasks]
+        examples = []
+        for dataset in subsets:
+            examples += [
+                {"task": row["problem"], "Answer": row["answer"]}
+                for row in dataset["train"]
+            ]
 
         if num_examples and num_examples > 0:
             examples = examples[:num_examples]
 
         self.examples: List[Dict[str, str]] = examples
         self.debug: bool = debug
-        self.equality_checker: Optional[SamplerBase] = None
+        self.equality_checker: SamplerBase = equality_checker
 
         if self.debug:
-            print(f"Loaded {len(self.examples)} examples for subset {subset_name}")
+            print(f"Loaded {len(self.examples)} examples")
 
     def set_equality_checker(self, equality_checker: SamplerBase) -> None:
         """
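Review note (after the patch, not part of it): the loader rewrite above assumes
every Vikhrmodels/MathDemon_Demidovich subset exposes problem and answer columns
(the old code read translated_conditions/translated_answers). A quick standalone
check of that assumption, runnable on its own:

    from datasets import load_dataset

    # Subset name taken from the tasks list in the hunk above.
    ds = load_dataset("Vikhrmodels/MathDemon_Demidovich", "Sequences_and_Limits")
    row = ds["train"][0]
    print(row["problem"], row["answer"])  # the two columns this patch reads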