
Commit 0f0585e

Add fairseq S2T/S2S pipelines (#570)

* add s2s and s2t
* updates
* update fairseq commit
* update fairseq commit
* resolve PR comment
* isort
* isort latest version
* fix sample rate
* fix
* update
* fix
* update test
* update test
* update
* update

1 parent 2a3f2c3 commit 0f0585e

File tree: 7 files changed (+163, -2 lines)


api-inference-community/docker_images/fairseq/app/main.py

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,7 @@
 from typing import Dict, Type

 from api_inference_community.routes import pipeline_route, status_ok
-from app.pipelines import Pipeline, TextToSpeechPipeline
+from app.pipelines import Pipeline, SpeechToSpeechPipeline, TextToSpeechPipeline
 from starlette.applications import Starlette
 from starlette.middleware import Middleware
 from starlette.middleware.gzip import GZipMiddleware
@@ -34,6 +34,7 @@
 # directories. Implement directly within the directories.
 ALLOWED_TASKS: Dict[str, Type[Pipeline]] = {
     "text-to-speech": TextToSpeechPipeline,
+    "audio-to-audio": SpeechToSpeechPipeline,
 }
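This registration is what routes requests: the app reads the TASK and MODEL_ID environment variables (as the tests in this commit do) and looks the task up in ALLOWED_TASKS. A minimal sketch of that lookup, assuming the same environment setup the tests use; the model id is the one exercised in test_dockers.py below:

    import os

    # Select the task and model before importing the app, mirroring the tests.
    os.environ["TASK"] = "audio-to-audio"
    os.environ["MODEL_ID"] = "facebook/xm_transformer_600m-es_en-multi_domain"

    from app.main import ALLOWED_TASKS

    # "audio-to-audio" now resolves to the new pipeline class.
    print(ALLOWED_TASKS[os.environ["TASK"]].__name__)  # SpeechToSpeechPipeline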

api-inference-community/docker_images/fairseq/app/pipelines/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 from app.pipelines.base import Pipeline, PipelineException  # isort:skip

+from app.pipelines.audio_to_audio import SpeechToSpeechPipeline
 from app.pipelines.text_to_speech import TextToSpeechPipeline
api-inference-community/docker_images/fairseq/app/pipelines/audio_to_audio.py

Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+import os
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from app.pipelines import Pipeline
+from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+from fairseq.models.speech_to_text.hub_interface import S2THubInterface
+from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+
+
+class SpeechToSpeechPipeline(Pipeline):
+    def __init__(self, model_id: str):
+        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
+            model_id,
+            arg_overrides={"config_yaml": "config.yaml"},
+            cache_dir=os.getenv("HUGGINGFACE_HUB_CACHE"),
+        )
+        self.model = models[0].cpu()
+        self.model.eval()
+        cfg["task"].cpu = True
+        self.task = task
+        self.generator = task.build_generator([self.model], cfg)
+
+        self.sampling_rate = getattr(self.task, "sr", None) or 16_000
+
+        tgt_lang = self.task.data_cfg.hub.get("tgt_lang", None)
+        pfx = f"{tgt_lang}_" if self.task.data_cfg.prepend_tgt_lang_tag else ""
+        tts_model_id = self.task.data_cfg.hub.get(f"{pfx}tts_model_id", None)
+        self.tts_model, self.tts_task, self.tts_generator = None, None, None
+        if tts_model_id is not None:
+            _repo, _id = tts_model_id.split(":")
+            (
+                tts_models,
+                tts_cfg,
+                self.tts_task,
+            ) = load_model_ensemble_and_task_from_hf_hub(
+                f"facebook/{_id}",
+                arg_overrides={"vocoder": "griffin_lim", "fp16": False},
+                cache_dir=os.getenv("HUGGINGFACE_HUB_CACHE"),
+            )
+            self.tts_model = tts_models[0].cpu()
+            self.tts_model.eval()
+            tts_cfg["task"].cpu = True
+            TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, self.tts_task.data_cfg)
+            self.tts_generator = self.tts_task.build_generator(
+                [self.tts_model], tts_cfg
+            )
+
+    def __call__(self, inputs: np.array) -> Tuple[np.array, int, List[str]]:
+        """
+        Args:
+            inputs (:obj:`np.array`):
+                The raw waveform of audio received. By default sampled at `self.sampling_rate`.
+                The shape of this array is `T`, where `T` is the time axis.
+        Return:
+            A :obj:`tuple` containing:
+              - :obj:`np.array`:
+                 The return shape of the array must be `C'` x `T'`.
+              - an :obj:`int`: the sampling rate in Hz.
+              - a :obj:`List[str]`: the annotation for each output channel.
+                This can be the name of the instruments for audio source separation
+                or some annotation for speech enhancement. The length must be `C'`.
+        """
+        _inputs = torch.from_numpy(inputs).unsqueeze(0)
+        sample = S2THubInterface.get_model_input(self.task, _inputs)
+        text = S2THubInterface.get_prediction(
+            self.task, self.model, self.generator, sample
+        )
+
+        if self.tts_model is None:
+            return np.zeros((0,)), self.sampling_rate, [text]
+        else:
+            tts_sample = TTSHubInterface.get_model_input(self.tts_task, text)
+            wav, sr = TTSHubInterface.get_prediction(
+                self.tts_task, self.tts_model, self.tts_generator, tts_sample
+            )
+            return wav.numpy(), sr, [text]
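
For orientation, a minimal local-usage sketch of the new pipeline. The input file name and the use of librosa/soundfile are illustrative assumptions, not part of this commit; the model id is the one exercised in test_dockers.py below:

    import librosa
    import soundfile as sf

    from app.pipelines.audio_to_audio import SpeechToSpeechPipeline

    pipe = SpeechToSpeechPipeline("facebook/xm_transformer_600m-es_en-multi_domain")

    # Load a mono waveform at the rate the pipeline expects (16 kHz fallback).
    waveform, _ = librosa.load("utterance.wav", sr=pipe.sampling_rate, mono=True)

    # Returns (audio, sampling_rate, labels); labels[0] is the S2T text, and the
    # audio is empty when the checkpoint config references no TTS model.
    wav, sr, labels = pipe(waveform)
    print(labels[0])
    if wav.size:
        sf.write("translated.wav", wav.T, sr)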

api-inference-community/docker_images/fairseq/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -4,4 +4,5 @@ g2pc==0.9.9.3
 phonemizer==2.2.1
 librosa==0.8.1
 hanziconv==0.3.2
-git+git://github.com/pytorch/fairseq.git@43defa1bcb9cc3d5c532d12cba5e01f37dad0350
+sentencepiece==0.1.91
+git+git://github.com/pytorch/fairseq.git@1d5da6d5b954ba01fc3df12d25d63df27437e20e

api-inference-community/docker_images/fairseq/tests/test_api.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@

 ALL_TASKS = {
     "text-to-speech",
+    "audio-to-audio",
 }

api-inference-community/docker_images/fairseq/tests/test_audio_to_audio.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+import base64
+import json
+import os
+from unittest import TestCase, skipIf
+
+from api_inference_community.validation import ffmpeg_read
+from app.main import ALLOWED_TASKS
+from starlette.testclient import TestClient
+from tests.test_api import TESTABLE_MODELS
+
+
+@skipIf(
+    "audio-to-audio" not in ALLOWED_TASKS,
+    "audio-to-audio not implemented",
+)
+class AudioToAudioTestCase(TestCase):
+    def setUp(self):
+        model_id = TESTABLE_MODELS["audio-to-audio"]
+        self.old_model_id = os.getenv("MODEL_ID")
+        self.old_task = os.getenv("TASK")
+        os.environ["MODEL_ID"] = model_id
+        os.environ["TASK"] = "audio-to-audio"
+        from app.main import app
+
+        self.app = app
+
+    @classmethod
+    def setUpClass(cls):
+        from app.main import get_pipeline
+
+        get_pipeline.cache_clear()
+
+    def tearDown(self):
+        if self.old_model_id is not None:
+            os.environ["MODEL_ID"] = self.old_model_id
+        else:
+            del os.environ["MODEL_ID"]
+        if self.old_task is not None:
+            os.environ["TASK"] = self.old_task
+        else:
+            del os.environ["TASK"]
+
+    def test_simple(self):
+        bpayload = self.read("sample1.flac")
+
+        with TestClient(self.app) as client:
+            response = client.post("/", data=bpayload)
+        self.assertEqual(
+            response.status_code,
+            200,
+        )
+        self.assertEqual(response.headers["content-type"], "application/json")
+        audio = json.loads(response.content)
+
+        self.assertTrue(isinstance(audio, list))
+        self.assertEqual(set(audio[0].keys()), {"blob", "content-type", "label"})
+
+        data = base64.b64decode(audio[0]["blob"])
+        wavform = ffmpeg_read(data, 16000)
+        self.assertGreater(wavform.shape[0], 1000)
+        self.assertTrue(isinstance(audio[0]["content-type"], str))
+        self.assertTrue(isinstance(audio[0]["label"], str))
+
+    def test_malformed_audio(self):
+        bpayload = self.read("malformed.flac")
+
+        with TestClient(self.app) as client:
+            response = client.post("/", data=bpayload)
+
+        self.assertEqual(
+            response.status_code,
+            400,
+        )
+        self.assertEqual(response.content, b'{"error":"Malformed soundfile"}')
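
The test pins down the response contract: a JSON list with one entry per output channel, each carrying a base64-encoded "blob", a "content-type", and a "label". A client-side sketch of consuming that payload; the server URL and output file names are illustrative assumptions:

    import base64
    import json

    import requests

    # Post raw audio bytes to a running fairseq audio-to-audio server.
    with open("sample1.flac", "rb") as f:
        response = requests.post("http://localhost:8000/", data=f.read())

    channels = json.loads(response.content)
    for i, channel in enumerate(channels):
        print(channel["label"])  # e.g. the intermediate translation text
        ext = channel["content-type"].split("/")[-1]  # e.g. "wav"
        with open(f"channel_{i}.{ext}", "wb") as out:
            out.write(base64.b64decode(channel["blob"]))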

api-inference-community/tests/test_dockers.py

Lines changed: 5 additions & 0 deletions
@@ -85,6 +85,11 @@ def test_fairseq(self):
             "text-to-speech",
             "facebook/fastspeech2-en-ljspeech",
         )
+        self.framework_docker_test(
+            "fairseq",
+            "audio-to-audio",
+            "facebook/xm_transformer_600m-es_en-multi_domain",
+        )
         self.framework_invalid_test("fairseq")

     def test_fasttext(self):