Speech to text new note type: integrate whisper-like client #7320
Replies: 3 comments
-
|
For now we are not planning on adding new note types, due to the considerable maintenance burden, however it's an interesting idea to be implemented as some kind of plugin. |
Beta Was this translation helpful? Give feedback.
-
|
@rmkni I created this little RenderNote combo, that uses an openai compatible API to create notes with the transcribed content.
Notes:
HTML Note:

<style>
.voice-recorder { max-width: 600px; margin: 20px auto; padding: 20px; }
.voice-recorder button { padding: 10px 20px; margin: 5px; }
.voice-recorder button:disabled { opacity: 0.5; cursor: not-allowed; }
.voice-recorder .message { padding: 10px; margin: 10px 0; border: 1px solid; }
.voice-recorder .success { background: #d4edda; color: #155724; border-color: #c3e6cb; }
.voice-recorder .error { background: #f8d7da; color: #721c24; border-color: #f5c6cb; }
.voice-recorder audio { width: 100%; margin: 10px 0; }
</style>
<div class="voice-recorder">
<button id="startBtn">🔴 Start Recording</button>
<button id="stopBtn" disabled>⏸️ Stop</button>
<button id="stopAndUploadBtn" disabled>✅ Stop & Upload</button>
<div id="timer">00:00</div>
<div id="audioPreview"></div>
<div id="message"></div>
</div>
<!-- voice-recorder.js is included via trilium note-child feature -->

JS Frontend:

// Installation:
// Create labels: `#OPENAI_API_BASE="https://url" #OPENAI_API_KEY=sk-abc`
const AUDIO_FORMAT = 'audio/webm';
let mediaRecorder, audioChunks = [], startTime, timerInterval, audioBlob;
let $, timer, audioPreview, message;
async function initVoiceRecorder() {
$ = (sel) => api.$container.find(sel);
[timer, audioPreview, message] = ['#timer', '#audioPreview', '#message'].map(s => $(s)[0]);
$('#startBtn').on('click', startRecording);
$('#stopBtn').on('click', stopRecording);
$('#stopAndUploadBtn').on('click', stopAndUpload);
}
const updateTimer = () => {
const elapsed = Date.now() - startTime;
const totalSeconds = Math.floor(elapsed / 1000);
const minutes = Math.floor(totalSeconds / 60);
const seconds = totalSeconds % 60;
let dots = '';
for (let i = 0; i < totalSeconds; i++) {
if (i > 0 && i % 30 === 0) {
dots += '<br />';
}
dots += '.';
}
timer.innerHTML = `${String(minutes).padStart(2, '0')}:${String(seconds).padStart(2, '0')} ${dots}`;
};
const showMessage = (text, type) => {
message.className = `message ${type}`;
message.innerHTML = text;
};
const hideMessage = () => message.className = message.innerHTML = '';
async function startRecording() {
try {
hideMessage();
audioPreview.innerHTML = '';
audioChunks = [];
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream, MediaRecorder.isTypeSupported(AUDIO_FORMAT) ? { mimeType: AUDIO_FORMAT } : {});
if (!MediaRecorder.isTypeSupported(AUDIO_FORMAT)) {
showMessage(`Format ${AUDIO_FORMAT} not supported. Using default.`, 'error');
}
mediaRecorder.ondataavailable = e => audioChunks.push(e.data);
mediaRecorder.onstop = () => {
audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType });
audioPreview.innerHTML = `
<h3>Recording Raw</h3>
<audio controls src="${URL.createObjectURL(audioBlob)}"></audio>
<p>Size: ${(audioBlob.size / 1024).toFixed(2)} KB | Format: WEBM</p>
<button id="uploadBtn">Upload to Trilium</button>
`;
$('#uploadBtn').on('click', transcribeAndSave);
};
mediaRecorder.start();
startTime = Date.now();
timerInterval = setInterval(updateTimer, 100);
setButtonStates(true, false, false, 'Recording...');
} catch (error) {
showMessage(`Error: ${error.message}`, 'error');
}
}
const setButtonStates = (start, stop, upload, statusText) => {
$('#startBtn')[0].disabled = start;
$('#stopBtn')[0].disabled = stop;
$('#stopAndUploadBtn')[0].disabled = upload;
};
function stopRecording() {
if (mediaRecorder?.state === 'recording') {
mediaRecorder.stop();
mediaRecorder.stream.getTracks().forEach(t => t.stop());
clearInterval(timerInterval);
setButtonStates(false, true, true, 'Recording stopped');
}
}
async function stopAndUpload() {
if (mediaRecorder?.state === 'recording') {
mediaRecorder.stop();
mediaRecorder.stream.getTracks().forEach(t => t.stop());
clearInterval(timerInterval);
setButtonStates(true, true, true, 'Processing...');
await new Promise(resolve => mediaRecorder.addEventListener('stop', resolve, { once: true }));
await new Promise(resolve => setTimeout(resolve, 100));
if (audioBlob) await transcribeAndSave();
}
}
async function transcribeAudio(audioBlob, openaiApiKey, openaiApiBase) {
const formData = new FormData();
formData.append('file', audioBlob, 'recording.webm');
formData.append('model', 'openai/whisper-1');
const res = await fetch(`${openaiApiBase}/v1/audio/transcriptions`, {
method: 'POST',
headers: { 'Authorization': `Bearer ${openaiApiKey}` },
body: formData
});
if (!res.ok) throw new Error(`Transcription failed: ${res.status} - ${await res.text()}`);
return (await res.json()).text;
}
async function createVoiceNote(transcription, parentNoteId = '') {
const recordedDate = new Date();
const timestamp = recordedDate.toISOString().replace(/[:.]/g, '-').slice(0, -5);
const noteTitle = `Voice Note ${recordedDate.toLocaleString()}`;
const noteContent = `<h2>Transcription</h2><p>${transcription}</p><hr><p><em>Recorded on ${recordedDate.toLocaleString()}</em></p>`;
const voiceNoteId = await api.runOnBackend((parentNoteId, noteTitle, noteContent) => {
return api.createTextNote(parentNoteId, noteTitle, noteContent).note.noteId;
}, [parentNoteId, noteTitle, noteContent]);
await api.waitUntilSynced();
return voiceNoteId;
}
async function transcribeAndSave() {
const openaiApiKey = api.currentNote.getLabelValue('OPENAI_API_KEY');
const openaiApiBase = api.currentNote.getLabelValue('OPENAI_API_BASE') || 'https://api.openai.com';
if (!openaiApiKey) {
showMessage('Please add label #OPENAI_API_KEY to this note with your API key.', 'error');
return;
}
const uploadBtn = $('#uploadBtn');
uploadBtn.prop('disabled', true).text('Transcribing...');
try {
const transcription = await transcribeAudio(audioBlob, openaiApiKey, openaiApiBase);
uploadBtn.text('Creating note...');
const dayNote = await api.getTodayNote();
const voiceNoteId = await createVoiceNote(transcription, dayNote.noteId);
await api.activateNewNote(voiceNoteId);
api.showMessage('Created voice note');
showMessage(
`Successfully transcribed and uploaded!<br><strong>Transcription:</strong> "${transcription.substring(0, 100)}${transcription.length > 100 ? '...' : ''}"`,
'success'
);
setTimeout(() => {
audioPreview.innerHTML = '';
timer.textContent = '00:00';
hideMessage();
}, 5000);
} catch (error) {
showMessage(`Upload failed: ${error.message}`, 'error');
uploadBtn.prop('disabled', false).text('Upload to Trilium');
}
}
initVoiceRecorder(); |
Beta Was this translation helpful? Give feedback.
-
|
A more convenient implementation: |
Beta Was this translation helpful? Give feedback.


Uh oh!
There was an error while loading. Please reload this page.
-
Describe feature
Revolutionize note-taking by introducing a novel note type that triggers a voice command, capturing audio input and leveraging APIs for transcription and AI-powered reformulation, streamlining the process and boosting productivity.
Proposition to reuse an existing project like Whispering https://github.com/epicenter-so/epicenter/tree/main/apps/whispering. This tool runs only in the browser locally and directly calls APIs. Could be easy to integrate?
Additional Information
No response
Beta Was this translation helpful? Give feedback.
All reactions