diff --git a/.github/workflows/python2.yml b/.github/workflows/python2.yml index a3dec10d8..3ce20c87f 100644 --- a/.github/workflows/python2.yml +++ b/.github/workflows/python2.yml @@ -15,4 +15,4 @@ jobs: - name: Check Python2 run: | apt update -q && apt install -y -q python2 - python2 -m compileall -x 'voicevox/' . + python2 -m compileall -x '(voicevox|emotion_analyzer)/' . diff --git a/emotion_analyzer/.gitignore b/emotion_analyzer/.gitignore new file mode 100644 index 000000000..4414fc1e2 --- /dev/null +++ b/emotion_analyzer/.gitignore @@ -0,0 +1 @@ +requirements.txt diff --git a/emotion_analyzer/CMakeLists.txt b/emotion_analyzer/CMakeLists.txt new file mode 100644 index 000000000..cf1eea0a2 --- /dev/null +++ b/emotion_analyzer/CMakeLists.txt @@ -0,0 +1,48 @@ +cmake_minimum_required(VERSION 2.8.3) +project(emotion_analyzer) + +find_package(catkin REQUIRED COMPONENTS + catkin_virtualenv + std_msgs + message_generation + ) + +catkin_python_setup() + +add_service_files( + FILES + AnalyzeText.srv + AnalyzeAudio.srv + ) + +generate_messages( + DEPENDENCIES + std_msgs + ) + +catkin_package( + CATKIN_DEPENDS rospy std_msgs message_runtime +) + +find_package(Python3 3.8 QUIET COMPONENTS Interpreter) +if(NOT Python3_FOUND) + message(WARNING "emotion_analyzer (Hume AI) requires python3.8 or newer") + return() +endif() +message(STATUS "Found Python: ${Python3_EXECUTABLE}") +message(STATUS "Python Version: ${Python3_VERSION}") + +catkin_generate_virtualenv( + INPUT_REQUIREMENTS requirements.in + PYTHON_INTERPRETER python3 + CHECK_VENV FALSE +) + +file(GLOB PYTHON_SCRIPTS scripts/*.py) +catkin_install_python(PROGRAMS ${PYTHON_SCRIPTS} + DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} +) + +install(DIRECTORY launch + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} +) diff --git a/emotion_analyzer/README.md b/emotion_analyzer/README.md new file mode 100644 index 000000000..3aac77431 --- /dev/null +++ b/emotion_analyzer/README.md @@ -0,0 +1,64 @@ +# Emotion Analyzer Service using 
Hume API (ROS1) + +This ROS1 package provides a service to analyze emotions from a given text using the [Hume AI](https://www.hume.ai/) API. + +## Requirements + +- ROS1 Noetic +- Python 3.8+ +- An API key from Hume AI + +## Installation + +Clone this repository and move to this directory + +``` bash +rosdep install -iry --from-paths . +catkin build --this +``` + +then source your workspace + +## Usage (Quick) + +Using your microphone + +``` bash +roslaunch emotion_analyzer sample_emotion_analyzer.launch api_key:= +``` + + +## Usage + +### 1. Launch Emotion_Analyzer +```bash +roslaunch emotion_analyzer emotion_analyzer.launch api_key:= +``` + +### 2. Call the service +For text, +```bash +rosservice call /analyze_text "text: ''" +``` +For prepared audio (up to 5 seconds), +```bash +rosservice call /analyze_audio "audio_file: " +``` +As a sample, you can use `'/home/leus/ros/catkin_ws/src/jsk_3rdparty/emotion_analyzer/data/purugacha_short.wav'` as . + +For audio from microphone, +```bash +roslaunch audio_capture capture.launch format:=wave +rosservice call /analyze_audio "audio_file: ''" +``` +You can check the device information by `arecord -l`. +Sometimes you need to replace "hw" with "plughw": +for example, `roslaunch audio_capture capture.launch format:=wave device:=plughw:1,0`. +When the device is busy, you can try `fuser -v /dev/snd/*` to get PID and kill it by `kill -9 `. 
+ + + + + + + diff --git a/emotion_analyzer/data/purugacha.wav b/emotion_analyzer/data/purugacha.wav new file mode 100644 index 000000000..dd9892373 Binary files /dev/null and b/emotion_analyzer/data/purugacha.wav differ diff --git a/emotion_analyzer/data/purugacha_short.wav b/emotion_analyzer/data/purugacha_short.wav new file mode 100644 index 000000000..b5d5577d1 Binary files /dev/null and b/emotion_analyzer/data/purugacha_short.wav differ diff --git a/emotion_analyzer/launch/emotion_analyzer.launch b/emotion_analyzer/launch/emotion_analyzer.launch new file mode 100644 index 000000000..d4fe2acb7 --- /dev/null +++ b/emotion_analyzer/launch/emotion_analyzer.launch @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/emotion_analyzer/launch/sample_emotion_analyzer.launch b/emotion_analyzer/launch/sample_emotion_analyzer.launch new file mode 100644 index 000000000..bcc4d35aa --- /dev/null +++ b/emotion_analyzer/launch/sample_emotion_analyzer.launch @@ -0,0 +1,14 @@ + + + + + + + + + + + + + diff --git a/emotion_analyzer/package.xml b/emotion_analyzer/package.xml new file mode 100644 index 000000000..847b7745d --- /dev/null +++ b/emotion_analyzer/package.xml @@ -0,0 +1,30 @@ + + + emotion_analyzer + 0.0.0 + The emotion_analyzer package + + Kei Okada + Yoshiki Obinata + + BSD + + Ayaha Nagata + + catkin + + catkin_virtualenv + message_generation + + audio_capture + message_runtime + python3-pydub + python3-soundfile + rospy + + std_msgs + + + requirements.txt + + diff --git a/emotion_analyzer/requirements.in b/emotion_analyzer/requirements.in new file mode 100644 index 000000000..0fb2121d9 --- /dev/null +++ b/emotion_analyzer/requirements.in @@ -0,0 +1 @@ +hume[stream] diff --git a/emotion_analyzer/scripts/analyze_audio_service.py b/emotion_analyzer/scripts/analyze_audio_service.py new file mode 100644 index 000000000..3daeec4c9 --- /dev/null +++ b/emotion_analyzer/scripts/analyze_audio_service.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +import rospy +import 
asyncio +from hume import HumeStreamClient +from hume.models.config import ProsodyConfig, BurstConfig +from emotion_analyzer.srv import AnalyzeAudio, AnalyzeAudioResponse +from emotion_analyzer.utils.audio_buffer import AudioBuffer +import soundfile as sf +from pydub import AudioSegment +from io import BytesIO +from base64 import b64encode +import os +import pprint +import json +from std_msgs.msg import String +from audio_common_msgs.msg import AudioInfo + +class AudioServiceNode: + def __init__(self): + self.api_key = rospy.get_param("hume_api_key", None) + if not self.api_key: + rospy.logerr("API key has not been set") + exit(1) + + self.client = HumeStreamClient(self.api_key) + self.config = [BurstConfig(), ProsodyConfig()] + self.audio_buffer = AudioBuffer(topic_name="~audio", + window_size=2.0, + auto_start=True) + self.expected_coding_format = "wave" + self.audio_info_sub = rospy.Subscriber("/audio/audio_info", AudioInfo, self.audio_info_callback) + rospy.Service("analyze_audio", AnalyzeAudio, self.handle_request) + rospy.loginfo("Audio-to-Emotion Analysis Service ready.") + + def audio_info_callback(self, msg): + #check the audio format + if msg.coding_format != self.expected_coding_format: + rospy.logwarn(f"Coding_format mismatch: expected {self.expected_coding_format}, got {msg.coding_format}") + + def handle_request(self, req): + rospy.loginfo("Received request for analysis") + result = asyncio.run(self.analyze_audio(req.audio_file)) + rospy.loginfo("Finished analysis") + if isinstance(result, dict): + result_json = json.dumps(result) + else: + result_json = str(result) + return AnalyzeAudioResponse(result=result_json) + + async def analyze_audio(self, audio_file): + if audio_file: + segment = AudioSegment.from_file(audio_file) + wav_bytes = segment.raw_data + sample_width = segment.sample_width + channels = segment.channels + frame_rate = segment.frame_rate + else: + samples = self.audio_buffer.read() + if samples is None or len(samples) == 0: + 
rospy.logwarn("Audio data cannot be found. Check the audio topic name.") + return {"error": "No audio data received."} + wav_bytes = samples.tobytes() + sample_width = self.audio_buffer.bitdepth // 8 + channels = self.audio_buffer.n_channel + frame_rate = self.audio_buffer.input_sample_rate + segment = AudioSegment( + data=wav_bytes, + sample_width=sample_width, + channels=channels, + frame_rate=frame_rate, + ) + # check the length of the audio + duration_ms = len(segment) + if duration_ms > 5000: + raise Exception(f"Audio is too long: audio length = {duration_ms}ms") + + buf = BytesIO() + segment.export(buf, format="wav") + wav_bytes = buf.getvalue() + b64_audio_str = b64encode(wav_bytes).decode("utf-8") + + async with self.client.connect(self.config) as socket: + result = await socket.send_bytes(b64_audio_str.encode("utf-8")) + pprint.pprint(result) + + result_prosody = None + result_burst = None + + if result and isinstance(result, dict): + if 'prosody' in result and 'predictions' in result['prosody']: + result_prosody = result['prosody']['predictions'][0]['emotions'] + if 'burst' in result and 'predictions' in result['burst']: + result_burst = result['burst']['predictions'][0]['emotions'] + # if not predictions: + # rospy.logwarn("No predictions found in the result.") + return {"prosody": result_prosody, "burst": result_burst} + else: + rospy.logerr("Error in receiving valid result.") + return {"prosody": None, "burst": None} + #emotions = result["prosody"]["predictions"][0]["emotions"] + #return str(emotions) + +if __name__ == "__main__": + rospy.init_node("analyze_audio_service_node") + AudioServiceNode() + rospy.spin() diff --git a/emotion_analyzer/scripts/analyze_text_service.py b/emotion_analyzer/scripts/analyze_text_service.py new file mode 100644 index 000000000..bd5ba44f5 --- /dev/null +++ b/emotion_analyzer/scripts/analyze_text_service.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import rospy +import asyncio +from hume import HumeStreamClient +from 
hume.models.config import LanguageConfig +from emotion_analyzer.srv import AnalyzeText, AnalyzeTextResponse +import pprint +import json + +class TextServiceNode: + def __init__(self): + self.api_key = rospy.get_param("hume_api_key", None) + if self.api_key is None: + rospy.logerr("API key has not been set") + exit(1) + + self.client = HumeStreamClient(self.api_key) + self.config = LanguageConfig(granularity="sentence") + #granularity="word": analyze each word / granularity="turn": analyze whole text + rospy.Service("analyze_text", AnalyzeText, self.handle_request) + rospy.loginfo("Text-to-Emotion Analysis Service ready.") + + def handle_request(self, req): + rospy.loginfo("Received text for analysis") + result = asyncio.run(self.analyze_text(req.text)) + rospy.loginfo("Finished analysis") + result_json = json.dumps(result) + return AnalyzeTextResponse(result_json) + + async def analyze_text(self, text): + async with self.client.connect([self.config]) as socket: + result = await socket.send_text(text) + pprint.pprint(result) + emotions = result["language"]["predictions"][0]["emotions"] + return {"emotions": emotions} # return dict + +if __name__ == "__main__": + rospy.init_node("analyze_text_service_node") + TextServiceNode() + rospy.spin() diff --git a/emotion_analyzer/setup.py b/emotion_analyzer/setup.py new file mode 100644 index 000000000..101cae1a0 --- /dev/null +++ b/emotion_analyzer/setup.py @@ -0,0 +1,11 @@ +# ~/ros/catkin_ws/src/jsk_3rdparty/emotion_analyzer/setup.py + +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +d = generate_distutils_setup( + packages=['emotion_analyzer'], + package_dir={'': 'src'} +) + +setup(**d) diff --git a/emotion_analyzer/src/emotion_analyzer/__init__.py b/emotion_analyzer/src/emotion_analyzer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/emotion_analyzer/src/emotion_analyzer/utils/audio_buffer.py 
b/emotion_analyzer/src/emotion_analyzer/utils/audio_buffer.py new file mode 100644 index 000000000..d7ebef9db --- /dev/null +++ b/emotion_analyzer/src/emotion_analyzer/utils/audio_buffer.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +from __future__ import division + +import array +import asyncio +from base64 import b64encode +from io import BytesIO +from threading import Lock +import traceback +import time + +from audio_common_msgs.msg import AudioData +from hume import HumeStreamClient +from hume.models.config import BurstConfig +from hume.models.config import ProsodyConfig +from hume import StreamSocket +import numpy as np +from pydub import AudioSegment +import rospy +import soundfile as sf + +class AudioBuffer(object): + + def __init__(self, topic_name='~audio', + input_sample_rate=16000, + window_size=10.0, + bitdepth=16, + n_channel=1, target_channel=0, + get_latest_data=False, + discard_data=False, + auto_start=False): + self.is_subscribing = True + self.get_latest_data = get_latest_data + self.discard_data = discard_data + self._window_size = window_size + self.audio_buffer_len = int(self._window_size * input_sample_rate) + self.lock = Lock() + self.bitdepth = bitdepth + self.n_channel = n_channel + self.target_channel = min(self.n_channel - 1, max(0, target_channel)) + self.input_sample_rate = input_sample_rate + self.type_code = {} + for code in ['b', 'h', 'i', 'l']: + self.type_code[array.array(code).itemsize] = code + + self.dtype = self.type_code[self.bitdepth / 8] + self.audio_buffer = np.array([], dtype=self.dtype) + + self.max_value = 2 ** (self.bitdepth - 1) - 1 + + self.topic_name = topic_name + + if auto_start: + self.subscribe() + + def __len__(self): + return len(self.audio_buffer) + + @property + def window_size(self): + return self._window_size + + @window_size.setter + def window_size(self, size): + with self.lock: + self._window_size = size + self.audio_buffer_len = int(self._window_size + * self.input_sample_rate) + self.audio_buffer = 
np.array([], dtype=self.dtype) + + @staticmethod + def from_rosparam(**kwargs): + n_channel = rospy.get_param('~n_channel', 1) + target_channel = rospy.get_param('~target_channel', 0) + mic_sampling_rate = rospy.get_param('~mic_sampling_rate', 16000) + bitdepth = rospy.get_param('~bitdepth', 16) + return AudioBuffer(input_sample_rate=mic_sampling_rate, + bitdepth=bitdepth, + n_channel=n_channel, + target_channel=target_channel, + **kwargs) + + def subscribe(self): + self.audio_buffer = np.array([], dtype=self.dtype) + self.sub_audio = rospy.Subscriber( + self.topic_name, AudioData, self.audio_cb) + + def unsubscribe(self): + self.sub_audio.unregister() + + def _read(self, size, normalize=False): + with self.lock: + if self.get_latest_data: + audio_buffer = self.audio_buffer[-size:] + else: + audio_buffer = self.audio_buffer[:size] + if self.discard_data: + self.audio_buffer = self.audio_buffer[size:] + if normalize is True: + audio_buffer = audio_buffer / self.max_value + return audio_buffer + + def sufficient_data(self, size): + return len(self.audio_buffer) < size + + def read(self, size=None, wait=False, normalize=False): + if size is None: + size = int(self.audio_buffer_len) + while wait is True \ + and not rospy.is_shutdown() and len(self.audio_buffer) < size: + rospy.sleep(0.001) + return self._read(size, normalize=normalize) + + def close(self): + try: + self.sub_audio.unregister() + except Exception: + pass + self.audio_buffer = np.array([], dtype=self.dtype) + + def audio_cb(self, msg): + audio_buffer = np.frombuffer(msg.data, dtype=self.dtype) + audio_buffer = audio_buffer[self.target_channel::self.n_channel] + with self.lock: + self.audio_buffer = np.append( + self.audio_buffer, audio_buffer) + self.audio_buffer = self.audio_buffer[ + -self.audio_buffer_len:] diff --git a/emotion_analyzer/srv/AnalyzeAudio.srv b/emotion_analyzer/srv/AnalyzeAudio.srv new file mode 100644 index 000000000..fdb5f72a4 --- /dev/null +++ b/emotion_analyzer/srv/AnalyzeAudio.srv 
@@ -0,0 +1,3 @@ +string audio_file +--- +string result diff --git a/emotion_analyzer/srv/AnalyzeText.srv b/emotion_analyzer/srv/AnalyzeText.srv new file mode 100644 index 000000000..ed170238e --- /dev/null +++ b/emotion_analyzer/srv/AnalyzeText.srv @@ -0,0 +1,3 @@ +string text +--- +string result