
Commit 14d47bf

updated dependencies; unified HF vocoder; minor refactoring

1 parent e8280e2 · commit 14d47bf
22 files changed: +91 −598 lines

.dockerignore

Lines changed: 3 additions & 1 deletion

```diff
@@ -7,4 +7,6 @@ config/*.env
 /logs/*
 *.iml
 deepvoice3_pytorch/
-environment.yml
+environment.yml
+*.ipynb
+.ipynb_checkpoints
```

.gitignore

Lines changed: 5 additions & 1 deletion

```diff
@@ -3,4 +3,8 @@ __pycache__/
 config/.env
 *.iml
 deepvoice3_pytorch/
-config/config.*.yaml
+config/config.*.yaml
+.ipynb_checkpoints
+.DS_Store
+.vscode/
+*.ipynb
```

.gitmodules

Lines changed: 3 additions & 6 deletions

```diff
@@ -1,6 +1,3 @@
-[submodule "TransformerTTS"]
-	path = TransformerTTS
-	url = https://github.com/TartuNLP/TransformerTTS.git
-[submodule "tts_preprocess_et"]
-	path = tts_preprocess_et
-	url = https://github.com/TartuNLP/tts_preprocess_et.git
+[submodule "TransformerTTS"]
+	path = TransformerTTS
+	url = https://github.com/TartuNLP/TransformerTTS.git
```

Dockerfile

Lines changed: 3 additions & 2 deletions

```diff
@@ -1,4 +1,5 @@
-FROM python:3.9
+# Latest version of TensorFlow is compatible with Python <= 3.12
+FROM python:3.10
 
 # Install system dependencies
 RUN apt-get update && \
@@ -26,4 +27,4 @@ RUN pip install --user -r requirements.txt && \
 
 COPY --chown=app:app . .
 
-ENTRYPOINT ["python", "main.py"]
+ENTRYPOINT ["python", "main.py", "--max-input-length", "500"]
```

README.md

Lines changed: 8 additions & 27 deletions

````diff
@@ -17,17 +17,9 @@ structure:
 
 ```
 models
-├── hifigan
-│   ├── ljspeech
-│   │   ├── config.json
-│   │   └── model.pt
-│   ├── vctk
-│   │   ├── config.json
-│   │   └── model.pt
-└── tts
-    └── multispeaker
-        ├── config.yaml
-        └── model_weights.hdf5
+└── multispeaker
+    ├── config.yaml
+    └── model_weights.hdf5
 ```
 
 ## Setup
@@ -52,8 +44,7 @@ The following environment variables should be configured when running the container
 - `MKL_NUM_THREADS` (optional) - number of threads used for intra-op parallelism by PyTorch (used for the vocoder model)
   . `16` by default. If set to a blank value, it defaults to the number of CPU cores which may cause computational
   overhead when deployed on larger nodes. Alternatively, the `docker run` flag `--cpuset-cpus` can be used to control
-  this. For more details, refer to the [performance and hardware requirements](#performance-and-hardware-requirements)
-  section below.
+  this.
 
 By default, the container entrypoint is `main.py` without additional arguments, but arguments should be defined with the
 `COMMAND` option. The only required flag is `--model-name` to select which model is loaded by the worker. The full list
@@ -77,7 +68,6 @@ optional arguments:
 The setup can be tested with the following sample `docker-compose.yml` configuration:
 
 ```yaml
-version: '3'
 services:
   rabbitmq:
     image: 'rabbitmq'
@@ -95,6 +85,7 @@ services:
       - '8000:8000'
     depends_on:
       - rabbitmq
+    restart: always
   tts_worker:
     image: ghcr.io/tartunlp/text-to-speech-worker:latest
     environment:
@@ -107,6 +98,7 @@ services:
       - ./models:/app/models
     depends_on:
       - rabbitmq
+    restart: always
 ```
 
 ### Manual setup
@@ -116,23 +108,12 @@ The following steps have been tested on Ubuntu and is both CPU and GPU compatible
 - Clone this repository with submodules
 - Install prerequisites:
   - GNU Compiler Collection (`sudo apt install build-essential`)
-- For a **CPU** installation we recommend using the included `requirements.txt` file in a clean environment (tested with
-  Python 3.9)
+  - For a **GPU** installation, make sure you have CUDA installed (see https://developer.nvidia.com/cuda-downloads)
+- Use the included `requirements.txt` file in a clean environment (check the compatible python version from the `Dockerfile`)
   ```commandline
   pip install -r requirements.txt
   ```
 
-- For a **GPU** installation, use the `environment.yml` file instead.
-  - Make sure you have the following prerequisites installed:
-    - CUDA (see https://developer.nvidia.com/cuda-downloads)
-    - Conda (see https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html)
-
-- Then create and activate a Conda environment with all dependencies:
-  ```commandline
-  conda env create -f environment.yml -n tts
-  conda activate tts
-  ```
-
 - Download the models from the [releases section](https://github.com/TartuNLP/text-to-speech-worker/releases) and
   place inside the `models/` directory.
````

TransformerTTS

config/config.yaml

Lines changed: 14 additions & 39 deletions

```diff
@@ -1,46 +1,21 @@
-vocoders:
-  vctk: models/hifigan/vctk # the directory which should contain a .json and .pt file
-  ljspeech: models/hifigan/ljspeech
 tts_models:
   multispeaker:
-    model_path: models/tts/multispeaker # the directory that contains a yaml and hdf5 files for the model
+    model_path: models/multispeaker # the directory that contains a yaml and hdf5 files for the model
     frontend: 'est'
-    speakers: # a mapping of speaker names (as they will be used in routing keys, speaker-ids in the model and the vocoder to be used)
-      albert:
-        speaker_id: 1
-        vocoder: vctk
-      indrek:
-        speaker_id: 2
-        vocoder: vctk
-      kalev:
-        speaker_id: 3
-        vocoder: vctk
-      kylli:
-        speaker_id: 4
-        vocoder: ljspeech
-      liivika:
-        speaker_id: 5
-        vocoder: ljspeech
-      mari:
-        speaker_id: 6
-        vocoder: ljspeech
-      meelis:
-        speaker_id: 7
-        vocoder: vctk
-      peeter:
-        speaker_id: 8
-        vocoder: vctk
-      tambet:
-        speaker_id: 9
-        vocoder: vctk
-      vesta:
-        speaker_id: 10
-        vocoder: vctk
+    speakers: # a mapping of speaker names as they will be used in routing keys and speaker-ids in the model
+      albert: 1
+      indrek: 2
+      kalev: 3
+      kylli: 4
+      liivika: 5
+      mari: 6
+      meelis: 7
+      peeter: 8
+      tambet: 9
+      vesta: 10
 # single-speaker example:
 #  mari:
-#    model_path: models/tts/mari
+#    model_path: models/mari
 #    frontend: 'est'
 #    speakers:
-#      lee:
-#        speaker_id: 0
-#        vocoder: ljspeech
+#      mari: 0
```
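The new schema collapses each speaker entry to a flat `name: id` pair, since the per-speaker vocoder choice is gone (all speakers now share the unified HF vocoder). A minimal sketch of how a worker might resolve a routing-key speaker name under the new layout — the `resolve_speaker_id` helper is hypothetical, not part of the repo:

```python
# Hypothetical helper; the mapping mirrors config/config.yaml after this
# commit, where each speaker name maps straight to its model speaker ID.
SPEAKERS = {
    "albert": 1, "indrek": 2, "kalev": 3, "kylli": 4, "liivika": 5,
    "mari": 6, "meelis": 7, "peeter": 8, "tambet": 9, "vesta": 10,
}

def resolve_speaker_id(name: str) -> int:
    """Return the model speaker ID for a routing-key speaker name."""
    if name not in SPEAKERS:
        raise ValueError(f"unknown speaker: {name!r}")
    return SPEAKERS[name]

assert resolve_speaker_id("mari") == 6
```

With the old schema the same lookup had to return a `(speaker_id, vocoder)` pair; the flat mapping removes that indirection.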

environment.yml

Lines changed: 0 additions & 25 deletions
This file was deleted.

main.py

Lines changed: 8 additions & 6 deletions

```diff
@@ -1,18 +1,20 @@
 import logging.config
-from argparse import ArgumentParser, FileType
+from argparse import ArgumentParser
 
-from tts_worker import read_model_config, Synthesizer, MQConsumer
+from tts_worker.config import read_model_config
+from tts_worker.synthesizer import Synthesizer
+from tts_worker.mq_consumer import MQConsumer
 
 
 def parse_args():
     parser = ArgumentParser(
         description="A text-to-speech worker that processes incoming TTS requests via RabbitMQ."
     )
-    parser.add_argument('--model-config', type=FileType('r'), default='config/config.yaml',
+    parser.add_argument('--model-config', type=str, default='config/config.yaml',
                         help="The model config YAML file to load.")
     parser.add_argument('--model-name', type=str,
                         help="The model to load. Refers to the model name in the config file.")
-    parser.add_argument('--log-config', type=FileType('r'), default='config/logging.prod.ini',
+    parser.add_argument('--log-config', type=str, default='config/logging.prod.ini',
                         help="Path to log config file.")
     parser.add_argument('--max-input-length', type=int, default=0,
                         help="Optional max input length configuration - "
@@ -26,8 +28,8 @@ def parse_args():
 
 def main():
     args = parse_args()
-    logging.config.fileConfig(args.log_config.name)
-    model_config = read_model_config(args.model_config.name, args.model_name)
+    logging.config.fileConfig(args.log_config)
+    model_config = read_model_config(args.model_config, args.model_name)
 
     tts = Synthesizer(model_config, args.max_input_length)
     consumer = MQConsumer(tts)
```
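The `FileType('r') -> str` change is worth noting: `FileType` opens the file at argument-parsing time and holds the handle open (hence the `.name` round-trip to recover the path), whereas plain `str` just passes the path to callees like `logging.config.fileConfig`, which open the file themselves. A minimal sketch of the new pattern, not the repo's actual `main.py`:

```python
from argparse import ArgumentParser

# Sketch: with type=str, argparse only stores the path string; nothing is
# opened at parse time, unlike FileType('r') which opened the file eagerly.
parser = ArgumentParser(description="TTS worker argument sketch")
parser.add_argument('--model-config', type=str, default='config/config.yaml')
parser.add_argument('--log-config', type=str, default='config/logging.prod.ini')

args = parser.parse_args([])  # empty argv: defaults come back as plain strings
assert isinstance(args.model_config, str)  # no open file object anywhere
```

A side benefit is that the defaults no longer need to exist on disk unless they are actually used, since nothing is opened during parsing.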

requirements.txt

Lines changed: 17 additions & 14 deletions

```diff
@@ -1,14 +1,17 @@
-librosa==0.9.2
-tensorflow-cpu==2.11.0
-nltk==3.8.1
-estnltk==1.6.9.1b0
-pika==1.3.1
-torch==1.13.1
-torchvision
-torchaudio
-pyyaml==6.0
-pydantic==1.10.4
-python-dotenv==0.21.0
-ruamel.yaml==0.17.21
-phonemizer==3.2.1
-unidecode==1.3.6
+# TransformerTTS requirements
+librosa==0.11.0
+tensorflow==2.13.0
+ruamel.yaml
+# Worker requirements:
+nltk==3.9.2
+pika==1.3.2
+pydantic
+pydantic-settings
+python-dotenv
+# Preprocessing requirements:
+git+https://github.com/TartuNLP/tts_preprocess_et.git@v1.1.0
+# Vocoder requirements:
+speechbrain==1.0.2
+torch==2.1.2
+torchaudio==2.1.2
+huggingface-hub==0.29.2
```
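Note that `torch` and `torchaudio` are pinned to the same release (2.1.2): torchaudio wheels are built against one specific torch version, so the two pins must move together. A small illustrative check of that invariant — the parser below is a sketch, not part of the repo:

```python
# Illustrative parser for pinned entries; the sample text mirrors the
# vocoder group of requirements.txt after this commit.
REQUIREMENTS = """\
# Vocoder requirements:
speechbrain==1.0.2
torch==2.1.2
torchaudio==2.1.2
huggingface-hub==0.29.2
"""

def parse_pins(text: str) -> dict:
    """Map package name -> pinned version, skipping comments and unpinned lines."""
    pins = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "==" not in line:
            continue  # comments, blanks, and git/unpinned entries carry no pin
        name, version = line.split("==", 1)
        pins[name] = version
    return pins

pins = parse_pins(REQUIREMENTS)
# torchaudio is built against a specific torch release, so the pins must match:
assert pins["torch"] == pins["torchaudio"]
```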
