Adding SD for superb (speech-classification). (#225)

Narsil · web-flow · commit ac51c4f9229b · 2021-07-27T19:13:12.000+02:00
* Adding SD for superb (speech-classification). * Style. * Style. * Test dependencies. * Addressing @omar's comments. * Forgot pytest. * `speech-classification` -> `speech-segmentation`. * Fixing test cache in common + adding simple tests.
diff --git a/.github/workflows/python-api-tests.yaml b/.github/workflows/python-api-tests.yaml
@@ -25,7 +25,6 @@ jobs:
         working-directory: api-inference-community
         run: |
           pip install --upgrade pip
-          pip install pytest pillow httpx huggingface_hub
-          pip install -e .
+          pip install -e .[test]
       - run: make test
         working-directory: api-inference-community
diff --git a/api-inference-community/api_inference_community/normalizers.py b/api-inference-community/api_inference_community/normalizers.py
@@ -0,0 +1,41 @@
+"""
+Helper classes to modify pipeline outputs from tensors to expected pipeline output
+"""
+
+from typing import TYPE_CHECKING, Dict, List, Union
+
+
+Classes = Dict[str, Union[str, float]]
+
+if TYPE_CHECKING:
+    try:
+        import torch
+    except Exception:
+        pass
+
+
+def speaker_diarization_normalize(
+    tensor: "torch.Tensor", sampling_rate: int, classnames: List[str]
+) -> List[Classes]:
+    N = tensor.shape[1]
+    if len(classnames) != N:
+        raise ValueError(
+            f"There is a mismatch between classnames ({len(classnames)}) and number of speakers ({N})"
+        )
+    classes = []
+    for i in range(N):
+        values, counts = tensor[:, i].unique_consecutive(return_counts=True)
+        offset = 0
+        for v, c in zip(values, counts):
+            if v == 1:
+                classes.append(
+                    {
+                        "class": classnames[i],
+                        "start": offset / sampling_rate,
+                        "end": (offset + c.item()) / sampling_rate,
+                    }
+                )
+            offset += c.item()
+
+    classes = sorted(classes, key=lambda x: x["start"])
+    return classes
diff --git a/api-inference-community/api_inference_community/validation.py b/api-inference-community/api_inference_community/validation.py
@@ -211,12 +211,14 @@ def normalize_payload(
     if task in {
         "automatic-speech-recognition",
         "audio-to-audio",
+        "speech-segmentation",
     }:
         if sampling_rate is None:
             raise EnvironmentError(
                 "We cannot normalize audio file if we don't know the sampling rate"
             )
-        return normalize_payload_audio(bpayload, sampling_rate)
+        outputs = normalize_payload_audio(bpayload, sampling_rate)
+        return outputs
     elif task in {
         "image-classification",
         "image-to-text",
diff --git a/api-inference-community/docker_images/common/app/pipelines/__init__.py b/api-inference-community/docker_images/common/app/pipelines/__init__.py
@@ -8,6 +8,7 @@
 from app.pipelines.image_classification import ImageClassificationPipeline
 from app.pipelines.question_answering import QuestionAnsweringPipeline
 from app.pipelines.sentence_similarity import SentenceSimilarityPipeline
+from app.pipelines.speech_segmentation import SpeechSegmentationPipeline
 from app.pipelines.structured_data_classification import (
     StructuredDataClassificationPipeline,
 )
diff --git a/api-inference-community/docker_images/common/app/pipelines/automatic_speech_recognition.py b/api-inference-community/docker_images/common/app/pipelines/automatic_speech_recognition.py
@@ -21,9 +21,7 @@ def __call__(self, inputs: np.array) -> Dict[str, str]:
         """
         Args:
             inputs (:obj:`np.array`):
-                The raw waveform of audio received. By default at 16KHz.
-                Check `app.validation` if a different sample rate is required
-                or if it depends on the model
+                The raw waveform of audio received. By default at self.sampling_rate, otherwise 16KHz.
         Return:
             A :obj:`dict`:. The object return should be liked {"text": "XXX"} containing
             the detected langage from the input audio
diff --git a/api-inference-community/docker_images/common/app/pipelines/speech_segmentation.py b/api-inference-community/docker_images/common/app/pipelines/speech_segmentation.py
@@ -0,0 +1,34 @@
+from typing import Dict
+
+import numpy as np
+from app.pipelines import Pipeline
+
+
+class SpeechSegmentationPipeline(Pipeline):
+    def __init__(self, model_id: str):
+        # IMPLEMENT_THIS
+        # Preload all the elements you are going to need at inference.
+        # For instance your model, processors, tokenizer that might be needed.
+        # This function is only called once, so do all the heavy processing I/O here
+        # IMPLEMENT_THIS : Please define a `self.sampling_rate` for this pipeline
+        # to automatically read the input correctly
+        self.sampling_rate = 16000
+        raise NotImplementedError(
+            "Please implement SpeechSegmentationPipeline __init__ function"
+        )
+
+    def __call__(self, inputs: np.array) -> Dict[str, str]:
+        """
+        Args:
+            inputs (:obj:`np.array`):
+                The raw waveform of audio received. By default at self.sampling_rate, otherwise 16KHz.
+        Return:
+            A :obj:`list`:. Each item in the list is like {"class": "XXX", "start": float, "end": float}
+            "class" is the associated class of the audio segment, "start" and "end" are markers expressed in seconds
+            within the audio file.
+        """
+        # IMPLEMENT_THIS
+        # api_inference_community.normalizers.speaker_diarization_normalize could help.
+        raise NotImplementedError(
+            "Please implement SpeechSegmentationPipeline __call__ function"
+        )
diff --git a/api-inference-community/docker_images/common/tests/test_api_audio_to_audio.py b/api-inference-community/docker_images/common/tests/test_api_audio_to_audio.py
@@ -24,6 +24,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_automatic_speech_recognition.py b/api-inference-community/docker_images/common/tests/test_api_automatic_speech_recognition.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_feature_extraction.py b/api-inference-community/docker_images/common/tests/test_api_feature_extraction.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_image_classification.py b/api-inference-community/docker_images/common/tests/test_api_image_classification.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_question_answering.py b/api-inference-community/docker_images/common/tests/test_api_question_answering.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_sentence_similarity.py b/api-inference-community/docker_images/common/tests/test_api_sentence_similarity.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_speech_segmentation.py b/api-inference-community/docker_images/common/tests/test_api_speech_segmentation.py
@@ -0,0 +1,112 @@
+import json
+import os
+from unittest import TestCase, skipIf
+
+from app.main import ALLOWED_TASKS
+from starlette.testclient import TestClient
+from tests.test_api import TESTABLE_MODELS
+
+
+@skipIf(
+    "speech-segmentation" not in ALLOWED_TASKS,
+    "speech-segmentation not implemented",
+)
+class SpeechSegmentationTestCase(TestCase):
+    def setUp(self):
+        model_id = TESTABLE_MODELS["speech-segmentation"]
+        self.old_model_id = os.getenv("MODEL_ID")
+        self.old_task = os.getenv("TASK")
+        os.environ["MODEL_ID"] = model_id
+        os.environ["TASK"] = "speech-segmentation"
+        from app.main import app
+
+        self.app = app
+
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
+    def tearDown(self):
+        if self.old_model_id is not None:
+            os.environ["MODEL_ID"] = self.old_model_id
+        else:
+            del os.environ["MODEL_ID"]
+        if self.old_task is not None:
+            os.environ["TASK"] = self.old_task
+        else:
+            del os.environ["TASK"]
+
+    def read(self, filename: str) -> bytes:
+        dirname = os.path.dirname(os.path.abspath(__file__))
+        filename = os.path.join(dirname, "samples", filename)
+        with open(filename, "rb") as f:
+            bpayload = f.read()
+        return bpayload
+
+    def test_original_audiofile(self):
+        bpayload = self.read("sample1.flac")
+
+        with TestClient(self.app) as client:
+            response = client.post("/", data=bpayload)
+
+        self.assertEqual(
+            response.status_code,
+            200,
+        )
+        content = json.loads(response.content)
+        self.assertIsInstance(content, list)
+        for c in content:
+            self.assertEqual(set(c.keys()), {"class", "start", "end"})
+            self.assertIsInstance(c["class"], str)
+            self.assertIsInstance(c["start"], float)
+            self.assertIsInstance(c["end"], float)
+
+    def test_malformed_audio(self):
+        bpayload = self.read("malformed.flac")
+
+        with TestClient(self.app) as client:
+            response = client.post("/", data=bpayload)
+
+        self.assertEqual(
+            response.status_code,
+            400,
+        )
+        self.assertEqual(response.content, b'{"error":"Malformed soundfile"}')
+
+    def test_dual_channel_audiofile(self):
+        bpayload = self.read("sample1_dual.ogg")
+
+        with TestClient(self.app) as client:
+            response = client.post("/", data=bpayload)
+
+        self.assertEqual(
+            response.status_code,
+            200,
+        )
+        content = json.loads(response.content)
+        self.assertIsInstance(content, list)
+        for c in content:
+            self.assertEqual(set(c.keys()), {"class", "start", "end"})
+            self.assertIsInstance(c["class"], str)
+            self.assertIsInstance(c["start"], float)
+            self.assertIsInstance(c["end"], float)
+
+    def test_webm_audiofile(self):
+        bpayload = self.read("sample1.webm")
+
+        with TestClient(self.app) as client:
+            response = client.post("/", data=bpayload)
+
+        self.assertEqual(
+            response.status_code,
+            200,
+        )
+        content = json.loads(response.content)
+        self.assertIsInstance(content, list)
+        for c in content:
+            self.assertEqual(set(c.keys()), {"class", "start", "end"})
+            self.assertIsInstance(c["class"], str)
+            self.assertIsInstance(c["start"], float)
+            self.assertIsInstance(c["end"], float)
diff --git a/api-inference-community/docker_images/common/tests/test_api_structured_data_classification.py b/api-inference-community/docker_images/common/tests/test_api_structured_data_classification.py
@@ -23,6 +23,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_text_to_speech.py b/api-inference-community/docker_images/common/tests/test_api_text_to_speech.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/common/tests/test_api_token_classification.py b/api-inference-community/docker_images/common/tests/test_api_token_classification.py
@@ -22,6 +22,12 @@ def setUp(self):
 
         self.app = app
 
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
     def tearDown(self):
         if self.old_model_id is not None:
             os.environ["MODEL_ID"] = self.old_model_id
diff --git a/api-inference-community/docker_images/superb/app/main.py b/api-inference-community/docker_images/superb/app/main.py
@@ -4,7 +4,11 @@
 from typing import Dict, Type
 
 from api_inference_community.routes import pipeline_route, status_ok
-from app.pipelines import AutomaticSpeechRecognitionPipeline, Pipeline
+from app.pipelines import (
+    AutomaticSpeechRecognitionPipeline,
+    Pipeline,
+    SpeechSegmentationPipeline,
+)
 from starlette.applications import Starlette
 from starlette.middleware import Middleware
 from starlette.middleware.gzip import GZipMiddleware
@@ -34,6 +38,7 @@
 # directories. Implement directly within the directories.
 ALLOWED_TASKS: Dict[str, Type[Pipeline]] = {
     "automatic-speech-recognition": AutomaticSpeechRecognitionPipeline,
+    "speech-segmentation": SpeechSegmentationPipeline,
 }
 
 
diff --git a/api-inference-community/docker_images/superb/app/pipelines/__init__.py b/api-inference-community/docker_images/superb/app/pipelines/__init__.py
@@ -3,3 +3,4 @@
 from app.pipelines.automatic_speech_recognition import (
     AutomaticSpeechRecognitionPipeline,
 )
+from app.pipelines.speech_segmentation import SpeechSegmentationPipeline
diff --git a/api-inference-community/docker_images/superb/app/pipelines/speech_segmentation.py b/api-inference-community/docker_images/superb/app/pipelines/speech_segmentation.py
diff --git a/api-inference-community/docker_images/superb/requirements.txt b/api-inference-community/docker_images/superb/requirements.txt
diff --git a/api-inference-community/docker_images/superb/tests/test_api.py b/api-inference-community/docker_images/superb/tests/test_api.py
diff --git a/api-inference-community/docker_images/superb/tests/test_api_automatic_speech_recognition.py b/api-inference-community/docker_images/superb/tests/test_api_automatic_speech_recognition.py
diff --git a/api-inference-community/docker_images/superb/tests/test_api_speech_segmentation.py b/api-inference-community/docker_images/superb/tests/test_api_speech_segmentation.py
diff --git a/api-inference-community/setup.py b/api-inference-community/setup.py
diff --git a/api-inference-community/tests/test_dockers.py b/api-inference-community/tests/test_dockers.py
diff --git a/api-inference-community/tests/test_normalizers.py b/api-inference-community/tests/test_normalizers.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -8,6 +8,7 @@` |
| `8` | `8` | `from app.pipelines.image_classification import ImageClassificationPipeline` |
| `9` | `9` | `from app.pipelines.question_answering import QuestionAnsweringPipeline` |
| `10` | `10` | `from app.pipelines.sentence_similarity import SentenceSimilarityPipeline` |
| | `11` | `+from app.pipelines.speech_segmentation import SpeechSegmentationPipeline` |
| `11` | `12` | `from app.pipelines.structured_data_classification import (` |
| `12` | `13` | `StructuredDataClassificationPipeline,` |
| `13` | `14` | `)` |