diff --git a/duui-video-to-audio/.dockeringore b/duui-video-to-audio/.dockeringore new file mode 100644 index 00000000..9d9808c2 --- /dev/null +++ b/duui-video-to-audio/.dockeringore @@ -0,0 +1,4 @@ +.idea/ +target/ +venv/ +*.mp4 diff --git a/duui-video-to-audio/.gitignore b/duui-video-to-audio/.gitignore new file mode 100644 index 00000000..f5f085f0 --- /dev/null +++ b/duui-video-to-audio/.gitignore @@ -0,0 +1,3 @@ +.idea/ +target/ +venv*/ diff --git a/duui-video-to-audio/README.md b/duui-video-to-audio/README.md new file mode 100644 index 00000000..df2f23c3 --- /dev/null +++ b/duui-video-to-audio/README.md @@ -0,0 +1,13 @@ +#### Video2Audio component for DUUI + +Uses the ffmpeg-python package to extract the audio track from a video. + +#### Input/Output: + +input: base64-encoded video + +output: base64-encoded MP3 audio + +#### Parameter: + +none diff --git a/duui-video-to-audio/pom.xml b/duui-video-to-audio/pom.xml new file mode 100644 index 00000000..f5c4d81e --- /dev/null +++ b/duui-video-to-audio/pom.xml @@ -0,0 +1,155 @@ + + + 4.0.0 + + org.texttechnologylab.duui + duui_text_to_image + 0.1.0 + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. 
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + aabusale + Ali Abusaleh + a.abusaleh@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/ali-abusaleh/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + Research assistant + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + + 17 + 17 + 2.4.0 + + + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 1.4 + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.5 + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + \ No newline at end of file diff --git a/duui-video-to-audio/python/TypeSystem.xml b/duui-video-to-audio/python/TypeSystem.xml new file mode 100644 index 00000000..a86c216a --- /dev/null +++ b/duui-video-to-audio/python/TypeSystem.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/duui-video-to-audio/python/duui-video-to-audio.py b/duui-video-to-audio/python/duui-video-to-audio.py new file mode 100644 index 00000000..117ce245 --- /dev/null +++ b/duui-video-to-audio/python/duui-video-to-audio.py @@ -0,0 +1,122 @@ +from typing import List, Optional, Dict, Union +from time import time +from fastapi import FastAPI, Response +from fastapi.encoders import jsonable_encoder +from cassis import load_typesystem +from functools import lru_cache +from io import BytesIO +import base64 +import uvicorn + +import 
def convert_to_mp3(video_path):
    """Extract the audio track of a video file and return it as base64 MP3.

    The input file is consumed: it is deleted once a conversion has been
    attempted, whether or not the conversion succeeded.

    Args:
        video_path: Path of the video file on the local filesystem.

    Returns:
        The MP3 audio bytes, base64-encoded, as a ``str``.

    Raises:
        FileNotFoundError: If ``video_path`` is not an existing file.
        Exception: Whatever ffmpeg-python raises when the ffmpeg run fails
            (``ffmpeg.Error`` according to its documentation).
    """
    # Local import keeps the block self-contained; the module does not
    # import tempfile at file level.
    import tempfile

    # Fail early with a clear error instead of an opaque ffmpeg failure.
    if not os.path.isfile(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    try:
        # A private scratch directory gives every call its own output file:
        # concurrent calls cannot clobber each other, ffmpeg never meets a
        # stale file (which would trigger its interactive overwrite prompt),
        # and the directory is removed even when the conversion fails.
        with tempfile.TemporaryDirectory() as work_dir:
            audio_path = os.path.join(work_dir, "audio.mp3")

            # Use ffmpeg to extract audio from the video
            ffmpeg.input(video_path).output(audio_path).run()

            # Read the audio file and convert it to base64
            with open(audio_path, "rb") as audio_file:
                return base64.b64encode(audio_file.read()).decode('utf-8')
    finally:
        # The input is a temporary upload owned by this function; remove it
        # on success and on failure so nothing is left behind.
        if os.path.exists(video_path):
            os.remove(video_path)
# Process request from DUUI
@app.post("/v1/process")
def post_process(request: DUUIRequest):
    """Convert the base64-encoded video of a DUUI request into base64 MP3.

    Args:
        request: Request whose ``video`` field holds the base64-encoded video.

    Returns:
        A ``DUUIResponse`` with the base64-encoded audio and its mimetype.

    Raises:
        HTTPException: 400 when ``video`` is not valid base64, 500 when the
            video cannot be stored or converted.
    """
    # Local imports: the module does not import these names at file level.
    import logging
    import tempfile

    from fastapi import HTTPException

    logger = logging.getLogger(__name__)

    try:
        video_bytes = base64.b64decode(request.video)
    except ValueError as err:  # binascii.Error is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Field 'video' is not valid base64"
        ) from err

    # A unique file per request, so concurrent requests do not overwrite
    # each other's upload.
    fd, video_path = tempfile.mkstemp(suffix=".mp4")
    try:
        with os.fdopen(fd, "wb") as video_file:
            video_file.write(video_bytes)

        # Convert the video to MP3 and get the base64-encoded audio
        audio_base64 = convert_to_mp3(video_path)
    except Exception as err:
        # Report the failure to the client instead of answering 200 with an
        # empty body; the traceback goes to the log, not to the response.
        logger.exception("Video to audio conversion failed")
        raise HTTPException(
            status_code=500, detail="Video to audio conversion failed"
        ) from err
    finally:
        # convert_to_mp3 normally deletes its input; make sure nothing is
        # left behind when it did not get that far.
        if os.path.exists(video_path):
            os.remove(video_path)

    return DUUIResponse(
        audio=audio_base64,
        mimetype="audio/mp3"
    )
-- Writes the request for the annotator: a JSON object whose "video" field is
-- the sofa string of the given CAS (expected to be the base64-encoded video).
-- Inputs:
--  - inputCas: The CAS (view) to read the video from
--  - outputStream: Stream that is sent to the annotator
--  - params: Unused by this component
function serialize(inputCas, outputStream, params)
    -- Get data from CAS
    local videoBase64 = inputCas:getSofaDataString()

    -- Fail here with a clear message rather than sending an empty request
    if videoBase64 == nil then
        error("Video2Audio: source view has no sofa string to send as video")
    end

    -- Encode data as JSON object and write to stream.
    -- The payload is deliberately not printed: it is the whole video.
    -- No "language" field is sent: the script never defines that variable
    -- and the annotator request has no such field.
    outputStream:write(json.encode({
        video = videoBase64
    }))
end

-- This "deserialize" function is called on receiving the results from the annotator that have to be transformed into a CAS object
-- Inputs:
--  - inputCas: The actual CAS object to deserialize into
--  - inputStream: Stream that is received from the annotator, can be e.g. a string, JSON payload, ...
function deserialize(inputCas, inputStream)
    -- Get string from stream, assume UTF-8 encoding
    local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)

    -- Parse JSON data from string into object
    local results = json.decode(inputString)

    -- Store the returned audio as the sofa string of the target CAS.
    -- Guard against a non-object answer (e.g. JSON null) before indexing it,
    -- and prefer the mimetype reported by the annotator over the default.
    if type(results) == "table" and results["audio"] ~= nil then
        inputCas:setSofaDataString(results["audio"], results["mimetype"] or "audio/mp3")
    end
end
org.apache.uima.cas.CASException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.InvalidXMLException; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.xml.sax.SAXException; + +import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Base64; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import org.texttechnologylab.DockerUnifiedUIMAInterface.io.writer.TTLabXmiWriter; +import org.dkpro.core.io.xmi.XmiWriter; + +import javax.imageio.ImageIO; + +public class Video2Audio { + + ClassLoader classLoader = Video2Audio.class.getClassLoader(); + URL fVideo = classLoader.getResource("interview2.mp4"); + + private static void saveBase64ToAudio(String base64String, String outputPath) { + try { + // Decode the Base64 string into a byte array + byte[] decodedBytes = Base64.getDecoder().decode(base64String); + + // Save the image to the specified output file + File outputFile = new File(outputPath); + Files.write(outputFile.toPath(), decodedBytes); + + System.out.println("Video saved as: " + outputPath); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @Test + @DisplayName("ExtractionTest") + public void ExtractionTest() throws Exception { + + + DUUIComposer composer = new 
DUUIComposer().withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver duuiuimaDriver = new DUUIUIMADriver(); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver, duuiuimaDriver); + + + File fFile = new File(fVideo.getPath()); + byte[] bFile = FileUtils.readFileToByteArray(fFile); + String encodedString = Base64.getEncoder().encodeToString(bFile); + String pMimeType = Files.probeContentType(Path.of(fVideo.getPath())); + + JCas pCas = JCasFactory.createText("Programm"); + + JCas videoView = pCas.createView("video"); + JCas transcript = pCas.createView("transcript"); + videoView.setSofaDataString(encodedString, pMimeType); + videoView.setDocumentLanguage("de"); + + JCas audioView = pCas.createView("audio"); + + composer.add( + new DUUIRemoteDriver.Component("http://localhost:9714") + .withTargetView("audio") + .withSourceView("video") + .withScale(1).build()); + + + + composer.add(new DUUIRemoteDriver.Component("http://whisperx.lehre.texttechnologylab.org") + .withScale(1) + .withSourceView("audio") + .withTargetView("transcript") + .build() + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, "/tmp/xmi/", + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_COMPRESSION, "GZIP")).build()); + + composer.run(pCas); + + saveBase64ToAudio(audioView.getSofa().getSofaString(), "/tmp/audio/test_audi.mp3"); + + + } + +}