jsk-ros-pkg · mqcmd196 · Apr 24, 2025
diff --git a/3rdparty/voicevox3/CMakeLists.txt b/3rdparty/voicevox3/CMakeLists.txt
@@ -0,0 +1,64 @@
+# Minimum raised from 2.8.3: recent CMake 3.x releases warn about policy
+# versions older than 3.5 and CMake 4.0 refuses to configure with them.
+cmake_minimum_required(VERSION 3.5)
+project(voicevox3)
+
+find_package(catkin REQUIRED COMPONENTS catkin_virtualenv)
+
+catkin_python_setup()
+
+catkin_package()
+
+# Python dependencies listed in requirements.txt are installed into a
+# package-local python3 virtualenv that does not see system site-packages.
+catkin_generate_virtualenv(
+  INPUT_REQUIREMENTS requirements.txt
+  PYTHON_INTERPRETER python3
+  USE_SYSTEM_PACKAGES FALSE
+  CHECK_VENV FALSE  # Default TRUE
+  )
+
+include(ExternalProject)
+
+# VOICEVOX engine: cloned directly into <package>/voicevox_engine so that the
+# install(DIRECTORY ...) rule of this file can pick it up.
+ExternalProject_Add(voicevox_engine
+  SOURCE_DIR        ${PROJECT_SOURCE_DIR}/voicevox_engine
+  GIT_REPOSITORY    https://github.com/VOICEVOX/voicevox_engine
+  GIT_TAG           0.14.7   # latest version before Python 3.11, Oct 5, 2023
+  CONFIGURE_COMMAND ""
+  # "build": delete a speaker_info.orig left behind by a previous run
+  BUILD_COMMAND     ${CMAKE_COMMAND} -E remove_directory ${PROJECT_SOURCE_DIR}/voicevox_engine/speaker_info.orig
+  # "install": move the speaker_info shipped with the engine out of the way
+  INSTALL_COMMAND   ${CMAKE_COMMAND} -E rename ${PROJECT_SOURCE_DIR}/voicevox_engine/speaker_info ${PROJECT_SOURCE_DIR}/voicevox_engine/speaker_info.orig
+  )
+
+# VOICEVOX resources: the install step runs the repository's
+# clean_character_info.py with voicevox_engine/speaker_info as output directory,
+# which is why this project must run after voicevox_engine.
+# NOTE(review): GIT_TAG tracks the moving "main" branch, so the result is not
+# reproducible - consider pinning a commit.
+ExternalProject_Add(voicevox_resource
+  DEPENDS           voicevox_engine
+  GIT_REPOSITORY    https://github.com/VOICEVOX/voicevox_resource
+  GIT_TAG           main
+  GIT_SHALLOW       TRUE
+  GIT_PROGRESS      TRUE
+  BUILD_IN_SOURCE   TRUE
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   python3 ./scripts/clean_character_info.py --output_dir ${PROJECT_SOURCE_DIR}/voicevox_engine/speaker_info
+  )
+
+# VOICEVOX core: prebuilt linux-x64 CPU release archive, downloaded,
+# MD5-checked and unpacked into <package>/voicevox_core; nothing is compiled.
+# Newer candidate, currently disabled (Dec 29, 2024):
+#   CORE_VERSION 0.15.7, CORE_HASH "444dc362d98e065b8581e5a9e403b8fc"
+set(CORE_VERSION 0.14.6) ## Jan 11, 2024
+set(CORE_HASH "26719dab23a8e0b4559516d1f2a78833")
+ExternalProject_Add(voicevox_core
+  SOURCE_DIR        ${PROJECT_SOURCE_DIR}/voicevox_core
+  URL               https://github.com/VOICEVOX/voicevox_core/releases/download/${CORE_VERSION}/voicevox_core-linux-x64-cpu-${CORE_VERSION}.zip
+  URL_HASH          MD5=${CORE_HASH}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+)
+
+# Python entry points (their shebang is rewritten by catkin_install_python).
+catkin_install_python(
+  PROGRAMS node_scripts/request_synthesis.py node_scripts/list_speakers.py
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/node_scripts/)
+
+# Shell wrappers. bin/run is the executable the launch file starts
+# (pkg="voicevox3" type="run"), so it has to be installed next to text2wave;
+# otherwise the launch file cannot find it in an install space.
+install(
+  PROGRAMS bin/run bin/text2wave
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/bin)
+
+# requirements.txt is exported through <pip_requirements> in package.xml;
+# catkin_virtualenv's documentation asks for it to be installed as well.
+install(FILES requirements.txt
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION})
+
+install(DIRECTORY launch
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+  USE_SOURCE_PERMISSIONS)
+
+# Engine sources and core binaries fetched by the ExternalProject steps.
+install(DIRECTORY
+  voicevox_engine voicevox_core
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+  USE_SOURCE_PERMISSIONS)
diff --git a/3rdparty/voicevox3/bin/run b/3rdparty/voicevox3/bin/run
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+#
+# Start the VOICEVOX engine (voicevox_engine/run.py) through rosrun.
+#
+# Only the arguments in front of the first "--" are forwarded to run.py;
+# "--" itself and everything after it is dropped. The launch file ends its
+# args with "--", presumably so that the extra arguments roslaunch appends
+# never reach run.py.
+
+# Keep arguments up to (not including) the first "--".
+filtered_args=()
+for arg in "$@"; do
+  if [[ "$arg" == -- ]]; then
+    break
+  fi
+  filtered_args+=("$arg")
+done
+
+# Fail early instead of running "/voicevox_engine/run.py" when the package
+# cannot be located.
+pkg_dir="$(rospack find voicevox3)" || exit 1
+
+# Quote both the path and the array expansion so that paths or argument
+# values containing spaces or glob characters are passed through intact.
+exec rosrun voicevox3 python3 "${pkg_dir}/voicevox_engine/run.py" "${filtered_args[@]}"
diff --git a/3rdparty/voicevox3/bin/text2wave b/3rdparty/voicevox3/bin/text2wave
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+#
+# Forward every command line argument unchanged to request_synthesis.py.
+# "$@" must be quoted: unquoted, arguments containing spaces (e.g. file
+# paths) are split into several words before they reach the script.
+
+exec rosrun voicevox3 request_synthesis.py "$@"
diff --git a/3rdparty/voicevox3/launch/voicevox_texttospeech.launch b/3rdparty/voicevox3/launch/voicevox_texttospeech.launch
@@ -0,0 +1,31 @@
+<launch>
+  <!-- Starts the VOICEVOX engine and, optionally, a sound_play node whose
+       PATH is prefixed with this package's bin directory. -->
+
+  <!-- NOTE(review): "device" is declared but not referenced below. -->
+  <arg name="device" default="" />
+  <arg name="launch_sound_play" default="true" />
+  <arg name="sound_play_respawn" default="true" doc="Respawn sound_play node or not (default: true)" />
+  <arg name="default_speaker" default="2" doc="Default speaker for VOICEVOX" />
+  <arg name="cpu_num_threads" default="1" doc="Number of cpu threads" />
+
+  <!-- Engine process. The trailing "-\-" in args is the cut-off marker
+       evaluated by bin/run. Its respawn flag also follows sound_play_respawn. -->
+  <node pkg="voicevox3" type="run" name="voicevox_server"
+        output="screen"
+        respawn="$(arg sound_play_respawn)"
+        args="--voicelib_dir=$(find voicevox3)/voicevox_core --cpu_num_threads=$(arg cpu_num_threads) --load_all_models --" />
+
+  <!-- sound_play instance listening on robotsound_jp. -->
+  <node pkg="sound_play" type="soundplay_node.py" name="sound_play_jp"
+        if="$(arg launch_sound_play)"
+        output="screen"
+        respawn="$(arg sound_play_respawn)">
+    <remap from="robotsound" to="robotsound_jp"/>
+    <remap from="sound_play" to="robotsound_jp"/>
+    <param name="default_voice" value="$(arg default_speaker)" />
+    <env name="VOICEVOX_DEFAULT_SPEAKER_ID" value="$(arg default_speaker)" />
+    <env name="PATH" value="$(find voicevox3)/bin:$(env PATH)" />
+    <env name="PYTHONIOENCODING" value="utf-8" />
+  </node>
+</launch>
diff --git a/3rdparty/voicevox3/node_scripts/list_speakers.py b/3rdparty/voicevox3/node_scripts/list_speakers.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+"""Print versions, supported devices and speakers reported by a VOICEVOX engine."""
+
+from voicevox import Client
+import asyncio
+
+
+async def main():
+    """Query the engine through a default-constructed Client and print the results.
+
+    Printed, in this order: every core version, the engine version, every
+    entry of the ``/supported_devices`` response, and for each speaker its
+    uuid, name and morphing permission followed by one line per style.
+    """
+    async with Client() as client:
+        # check core
+        for version in await client.fetch_core_versions():
+            print("Core version: {}".format(version))
+        # check engine
+        engine_version = await client.fetch_engine_version()
+        print("Engine version: {}".format(engine_version))
+        # check device (no dedicated client method is used here, so the raw
+        # HTTP endpoint is queried)
+        for device in await client.http.request("GET", "/supported_devices"):
+            print("Device: {}".format(device))
+        # check speaker
+        for speaker in await client.fetch_speakers():
+            print(speaker.uuid, speaker.name, speaker.supported_features.permitted_synthesis_morphing)
+            for style in speaker.styles:
+                print(style.id, speaker.name, style.name)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+
diff --git a/3rdparty/voicevox3/node_scripts/request_synthesis.py b/3rdparty/voicevox3/node_scripts/request_synthesis.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+from voicevox import Client
+import asyncio
+
+import argparse
+import os
+import shutil
+import sys
+
+#import requests
+
+# from voicevox.filecheck_utils import checksum_md5
+# from voicevox.filecheck_utils import get_cache_dir
+
+
+# Known VOICEVOX styles: speaker id (as a string) -> "<character>-<style>".
+speaker_id_to_name = {
+    '0': '四国めたん-あまあま',
+    '1': 'ずんだもん-あまあま',
+    '2': '四国めたん-ノーマル',
+    '3': 'ずんだもん-ノーマル',
+    '4': '四国めたん-セクシー',
+    '5': 'ずんだもん-セクシー',
+    '6': '四国めたん-ツンツン',
+    '7': 'ずんだもん-ツンツン',
+    '8': '春日部つむぎ-ノーマル',
+    '9': '波音リツ-ノーマル',
+    '10': '雨晴はう-ノーマル',
+    '11': '玄野武宏-ノーマル',
+    '12': '白上虎太郎-ノーマル',
+    '13': '青山龍星-ノーマル',
+    '14': '冥鳴ひまり-ノーマル',
+    '15': '九州そら-あまあま',
+    '16': '九州そら-ノーマル',
+    '17': '九州そら-セクシー',
+    '18': '九州そら-ツンツン',
+    '19': '九州そら-ささやき',
+}
+
+# Reverse lookup, built in the same order as the table above:
+# "<character>-<style>" -> speaker id.
+name_to_speaker_id = dict(
+    (style_name, style_id)
+    for style_id, style_name in speaker_id_to_name.items())
+
+
+# Default speaker: taken from VOICEVOX_DEFAULT_SPEAKER_ID ('2' when unset).
+# A value that is not all digits is looked up as a full style name; an
+# unknown name raises KeyError at import time.
+DEFAULT_SPEAKER_ID = os.environ.get('VOICEVOX_DEFAULT_SPEAKER_ID', '2')
+if not DEFAULT_SPEAKER_ID.isdigit():
+    DEFAULT_SPEAKER_ID = name_to_speaker_id[DEFAULT_SPEAKER_ID]
+
+# Host and port settings read from the environment. Note that the port is
+# the int 50021 when unset but a str when it comes from the environment.
+VOICEVOX_TEXTTOSPEECH_URL = os.environ.get(
+    'VOICEVOX_TEXTTOSPEECH_URL', 'localhost')
+VOICEVOX_TEXTTOSPEECH_PORT = os.environ.get(
+    'VOICEVOX_TEXTTOSPEECH_PORT', 50021)
+
+# True when the variable is unset or exactly 'true' (the form a launch
+# file <env> tag provides); any other value gives False.
+cache_enabled = 'true' == os.environ.get(
+    'ROS_VOICEVOX_TEXTTOSPEECH_CACHE_ENABLED', 'true')
+
+
+def determine_voice_name(voice_name):
+    """Resolve a requested voice to a speaker id string.
+
+    Args:
+        voice_name (str): '' for the default voice, a numeric speaker id, or
+            the beginning of a name listed in ``name_to_speaker_id``.
+
+    Returns:
+        str: ``voice_name`` itself when it is a known numeric id, the id of
+        the first table entry whose name starts with ``voice_name``, and
+        ``DEFAULT_SPEAKER_ID`` otherwise (empty, unknown id, unknown name).
+    """
+    speaker_id = DEFAULT_SPEAKER_ID
+    if len(voice_name) == 0:
+        pass  # nothing requested: keep the default
+    elif voice_name.isdigit():
+        if voice_name in speaker_id_to_name:
+            speaker_id = voice_name
+        else:
+            # Report the rejected value, not the name of the default voice.
+            print(
+                '[Text2Wave] Invalid speaker_id ({}). Use default voice.'
+                .format(voice_name))
+    else:
+        # Prefix match; the first hit in table order wins.
+        candidates = [
+            name for name in name_to_speaker_id
+            if name.startswith(voice_name)]
+        if candidates:
+            speaker_id = name_to_speaker_id[candidates[0]]
+        else:
+            print('[Text2Wave] Invalid voice_name ({}). Use default voice.'
+                  .format(voice_name))
+    # .get(): a numeric default id that is missing from the table must not
+    # turn this log line into a KeyError.
+    print('[Text2Wave] Speak using voice_name ({})..'.format(
+        speaker_id_to_name.get(speaker_id, speaker_id)))
+    return speaker_id
+
+
+def convert_to_str(x):
+    """Return ``x`` as ``str``.
+
+    ``str`` input is returned unchanged and ``bytes`` input is decoded as
+    UTF-8; any other type raises ``ValueError``.
+    """
+    if isinstance(x, bytes):
+        return x.decode('utf-8')
+    if isinstance(x, str):
+        return x
+    raise ValueError(
+        'Invalid input x type: {}'
+        .format(type(x)))
+
+async def request_synthesis(
+        sentence, output_path, speaker_id='1'):
+    """Synthesize ``sentence`` and write the resulting audio to ``output_path``.
+
+    Args:
+        sentence (str): text to speak.
+        output_path (str): file the synthesized bytes are written to.
+        speaker_id (str): speaker passed to both the audio query and the
+            synthesis request.
+    """
+    async with Client() as client:
+        audio_query = await client.create_audio_query(sentence, speaker=speaker_id)
+        print(audio_query)
+        # Synthesize first and open the file afterwards: opening it up front
+        # would leave an empty/truncated file behind when the request fails.
+        wave_data = await audio_query.synthesis(speaker=speaker_id)
+    with open(output_path, "wb") as f:
+        f.write(wave_data)
+
+if __name__ == '__main__':
+    # Command line: [-eval "(<voice>)"] -o <output file> <text file>
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('-eval', '--evaluate')
+    # Without -o there is nowhere to write to; let argparse report that
+    # instead of failing later in open(None).
+    parser.add_argument('-o', '--output', required=True)
+    parser.add_argument('text')
+    args = parser.parse_args()
+
+    # Only the first line of the text file is spoken.
+    with open(args.text, 'rb') as f:
+        speech_text = convert_to_str(f.readline())
+
+    print('args')
+    print(args)
+    # -eval is optional (None when omitted); its value arrives wrapped in
+    # parentheses, which are stripped here.
+    requested_voice = (args.evaluate or '').lstrip('(').rstrip(')')
+    if requested_voice.isdigit():
+        # Numeric ids are handed to the engine as they are, including ids
+        # that are not part of the local speaker table.
+        speaker_id = requested_voice
+    else:
+        # Empty -> default speaker; otherwise resolve the name to an id.
+        speaker_id = determine_voice_name(requested_voice)
+    print('id')
+    print(speaker_id)
+
+    asyncio.run(request_synthesis(speech_text,
+                                  args.output,
+                                  speaker_id))
diff --git a/3rdparty/voicevox3/package.xml b/3rdparty/voicevox3/package.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<?xml-model
+  href="http://download.ros.org/schema/package_format3.xsd"
+  schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>voicevox3</name>
+  <version>2.1.24</version>
+  <description>VOICEVOX: AI speech synthesis</description>
+  <maintainer email="yanokura@jsk.imi.i.u-tokyo.ac.jp">Iori Yanokura</maintainer>
+
+  <license>MIT</license>
+
+  <url type="website">http://ros.org/wiki/voicevox</url>
+
+  <author>Iori Yanokura</author>
+
+  <buildtool_depend>catkin</buildtool_depend>
+  <build_depend>catkin_virtualenv</build_depend>
+  <!-- CMakeLists.txt clones repositories with git and runs a python3 script
+       while building. -->
+  <build_depend>git</build_depend>
+  <build_depend>python3</build_depend>
+
+  <!-- bin/run and bin/text2wave call rosrun and rospack; the launch file
+       starts sound_play's soundplay_node.py. -->
+  <exec_depend>python3</exec_depend>
+  <exec_depend>rosbash</exec_depend>
+  <exec_depend>rospack</exec_depend>
+  <exec_depend>sound_play</exec_depend>
+
+  <export>
+    <pip_requirements>requirements.txt</pip_requirements>
+  </export>
+
+</package>
diff --git a/3rdparty/voicevox3/requirements.txt b/3rdparty/voicevox3/requirements.txt
@@ -0,0 +1,36 @@
+aiofiles==0.7.0 ; python_version >= "3.8" and python_version < "3.9"
+anyio==3.6.2 ; python_version >= "3.8" and python_version < "3.9"
+appdirs==1.4.4 ; python_version >= "3.8" and python_version < "3.9"
+asgiref==3.6.0 ; python_version >= "3.8" and python_version < "3.9"
+certifi==2022.12.7 ; python_version >= "3.8" and python_version < "3.9"
+cffi==1.15.1 ; python_version >= "3.8" and python_version < "3.9"
+charset-normalizer==2.1.1 ; python_version >= "3.8" and python_version < "3.9"
+click==8.0.4 ; python_version >= "3.8" and python_version < "3.9"
+colorama==0.4.4 ; python_version >= "3.8" and python_version < "3.9" and platform_system == "Windows"
+cython==0.29.24 ; python_version >= "3.8" and python_version < "3.9"
+fastapi==0.70.0 ; python_version >= "3.8" and python_version < "3.9"
+h11==0.14.0 ; python_version >= "3.8" and python_version < "3.9"
+idna==3.4 ; python_version >= "3.8" and python_version < "3.9"
+jinja2==3.1.2 ; python_version >= "3.8" and python_version < "3.9"
+markupsafe==2.1.1 ; python_version >= "3.8" and python_version < "3.9"
+numpy==1.20.0 ; python_version >= "3.8" and python_version < "3.9"
+pycparser==2.21 ; python_version >= "3.8" and python_version < "3.9"
+pydantic==1.10.2 ; python_version >= "3.8" and python_version < "3.9"
+# pyopenjtalk is pinned to a commit from Mar 30, 2025 (https://github.com/VOICEVOX/pyopenjtalk/pull/22, use cmake<4.0.0)
+pyopenjtalk @ git+https://github.com/VOICEVOX/pyopenjtalk@5b70b94f3460ece07ea183227db088ce8d5212a6
+## pyopenjtalk @ git+https://github.com/VOICEVOX/pyopenjtalk.git@b35fc89fe42948a28e33aed886ea145a51113f88 ; python_version >= "3.8" and python_version < "3.9"
+python-multipart==0.0.5 ; python_version >= "3.8" and python_version < "3.9"
+pyworld==0.3.0 ; python_version >= "3.8" and python_version < "3.9"
+pyyaml==6.0 ; python_version >= "3.8" and python_version < "3.9"
+requests==2.28.1 ; python_version >= "3.8" and python_version < "3.9"
+scipy==1.7.1 ; python_version >= "3.8" and python_version < "3.9"
+six==1.16.0 ; python_version >= "3.8" and python_version < "3.9"
+sniffio==1.3.0 ; python_version >= "3.8" and python_version < "3.9"
+soundfile==0.10.3.post1 ; python_version >= "3.8" and python_version < "3.9"
+starlette==0.16.0 ; python_version >= "3.8" and python_version < "3.9"
+tqdm==4.64.1 ; python_version >= "3.8" and python_version < "3.9"
+typing-extensions==4.4.0 ; python_version >= "3.8" and python_version < "3.9"
+urllib3==1.26.13 ; python_version >= "3.8" and python_version < "3.9"
+uvicorn==0.15.0 ; python_version >= "3.8" and python_version < "3.9"
+##
+voicevox-client==v0.1.5
diff --git a/3rdparty/voicevox3/setup.py b/3rdparty/voicevox3/setup.py
@@ -0,0 +1,12 @@
+# setup() is taken from setuptools, like find_packages below: distutils is
+# deprecated and has been removed from the standard library in Python 3.12.
+from catkin_pkg.python_setup import generate_distutils_setup
+from setuptools import find_packages
+from setuptools import setup
+
+
+# Package metadata comes from package.xml; Python packages are looked up
+# under the "python" directory.
+d = generate_distutils_setup(
+    packages=find_packages('python'),
+    package_dir={'': 'python'},
+)
+
+setup(**d)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/usr/bin/bash

		exec rosrun voicevox3 request_synthesis.py $@