Added phonetics, speech_disorder, and speech_enhancement tasks - still in need of full model scoring (#22)

pcsid · nhhoang96 · web-flow · commit a77e996fee5d · 2026-01-05T13:12:08.000-08:00
* Added phonetics, speech_disorder, and speech_enhancement tasks (full model scoring is still pending). Fixed a small config inconsistency by renaming the judge_properties key to judge_settings.

* Update to the correct HF path for the noise_detection task

* updated scores

---------

Co-authored-by: hoang <huuhoang.nguyen@servicenow.com>
diff --git a/evaluate.py b/evaluate.py
@@ -44,7 +44,7 @@ def main(cfg_path='config.yaml'):
         raise
 
     # 4. Load models and initialize central request controller
-    central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_properties", {}))
+    central_request_controller, model_configs = register_models_with_controller(run_config.get("models", []), run_config.get("judge_settings", {}))
 
     # 5. Expand task-metric pairs
     task_payload = expand_task_metric_pairs(run_config, task_configs, task_ancestry)
diff --git a/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml b/tasks/phonetics/phonemes/voxangeles_phoneme_counting.yaml
@@ -0,0 +1,20 @@
+task_name: voxangeles_phoneme_counting
+dataset_path: DynamicSuperb/PhoneSegmentCounting_VoxAngeles
+modality: audio
+language: en
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio
+target_column: label
+instruction_column: instruction
+user_prompt: "You are an expert at counting phones in the context of phonemes, and always attempt to answer. You will be given an audio sample, listen carefully."
+long_audio_processing_logic: truncate
+
+generation_kwargs:
+  temperature: 0.0001
+  max_completion_tokens: 64
+
+metrics:
+  - metric: llm_judge_binary
+  - metric: detailed_judge_prompt
diff --git a/tasks/speech_disorder/voice_disorder/sep_28k.yaml b/tasks/speech_disorder/voice_disorder/sep_28k.yaml
@@ -0,0 +1,20 @@
+task_name: stuttering_detection
+dataset_path: DynamicSuperb/StutteringDetection_SEP28k
+modality: audio
+language: en
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio
+target_column: label
+instruction_column: instruction
+user_prompt: "You are an expert at stuttering detection. Look for clear, obvious stuttering. Stuttering is obviously repeating a word or part of a word. The audio clip will ALWAYS be given after the following instructions, always attempt to answer."
+long_audio_processing_logic: truncate
+
+generation_kwargs:
+  temperature: 0.0001
+  max_completion_tokens: 64
+
+metrics:
+  - metric: llm_judge_binary
+  - metric: detailed_judge_prompt
diff --git a/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml b/tasks/speech_enhancement/noise_detection/guassian_noise_detection.yaml
@@ -0,0 +1,19 @@
+task_name: noise_detection
+dataset_path: DynamicSuperb/NoiseDetection_LJSpeech_MUSAN-Gaussian
+modality: audio
+language: en
+split: test
+preprocessor: GeneralPreprocessor
+postprocessor: GeneralPostprocessor
+audio_column: audio
+target_column: label
+instruction_column: instruction
+long_audio_processing_logic: truncate
+
+generation_kwargs:
+  temperature: 0.0001
+  max_completion_tokens: 64
+
+metrics:
+  - metric: llm_judge_binary
+  - metric: detailed_judge_prompt
diff --git a/utils/util.py b/utils/util.py
@@ -85,12 +85,12 @@ def validate_config(config: dict, task_configs: dict[Path, list[dict]]) -> Dict:
                 raise ValueError("'filters' must be a dictionary")
             _validate_filter_values(config['filters'])
         
-        # Validate judge_properties as a dictionary
-        logger.info("---------Validating judge properties---------")
-        if 'judge_properties' in config:
-            if not isinstance(config['judge_properties'], dict):
-                raise ValueError("'judge_properties' must be a dictionary")
-            _validate_judge_properties(config['judge_properties'])
+        # Validate judge_settings as a dictionary
+        logger.info("---------Validating judge settings---------")
+        if 'judge_settings' in config:
+            if not isinstance(config['judge_settings'], dict):
+                raise ValueError("'judge_settings' must be a dictionary")
+            _validate_judge_settings(config['judge_settings'])
 
         # Delegate validation for complex sections
         logger.info("---------Validating models---------")
@@ -175,11 +175,11 @@ def _validate_filter_values(filters: Dict) -> None:
         raise ValueError("'language' must be a string")
 
 
-def _validate_judge_properties(judge_props: Dict) -> None:
-    """Validate the values in the judge_properties dictionary.
-    
+def _validate_judge_settings(judge_props: Dict) -> None:
+    """Validate the values in the judge_settings dictionary.
+
     Args:
-        judge_props: Dictionary of judge properties to validate
+        judge_props: Dictionary of judge settings to validate
     
     Raises:
         ValueError: If any judge property is invalid