|
(ns simulflow-examples.local-w-interruption-support
  "This example demonstrates interruption handling. When the LLM is responding
  to the user, if the user says something, the current playback is interrupted
  and processors that have interruption support drop their activity until the
  user stops speaking."
  (:require
   [clojure.core.async :as a]
   [clojure.core.async.flow :as flow]
   [simulflow-examples.local :as local]
   [simulflow.async :refer [vthread-loop]]
   [simulflow.processors.activity-monitor :as activity-monitor]
   [simulflow.processors.deepgram :as deepgram]
   [simulflow.processors.elevenlabs :as xi]
   [simulflow.processors.llm-context-aggregator :as context]
   [simulflow.processors.openai :as openai]
   [simulflow.processors.system-frame-router :as system-router]
   [simulflow.secrets :refer [secret]]
   [simulflow.transport :as transport]
   [simulflow.transport.in :as transport-in]
   [simulflow.transport.out :as transport-out]
   [simulflow.vad.silero :as silero]
   [taoensso.telemere :as t]))
| 22 | + |
(def llm-context
  ;; Initial conversation context handed to the context aggregator:
  ;; one system prompt plus the tool definitions the LLM may call.
  {:messages
   [{:role "system"
     :content "You are a voice agent operating via phone. Be
     concise in your answers. The input you receive comes from a
     speech-to-text (transcription) system that isn't always
     efficient and may send unclear text. Ask for
     clarification when you're unsure what the person said."}]
   ;; OpenAI-style tool spec. NOTE(review): presumably `:handler` is invoked
   ;; locally with the parsed arguments map when the model emits a matching
   ;; tool call — confirm against the context-aggregator implementation.
   :tools
   [{:type :function
     :function
     {:name "get_weather"
      :handler (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))
      :description "Get the current weather of a location"
      :parameters {:type :object
                   :required [:town]
                   :properties {:town {:type :string
                                       :description "Town for which to retrieve the current weather"}}
                   :additionalProperties false}
      :strict true}}]})
| 43 | + |
| 44 | +(def chunk-duration 20) |
| 45 | + |
(def flow-processors
  {;; Capture audio from microphone and send raw-audio-input frames further in
   ;; the pipeline. Uses the Silero VAD analyser and declares interrupt support.
   :transport-in {:proc transport-in/microphone-transport-in
                  :args {:vad/analyser :vad.analyser/silero
                         :pipeline/supports-interrupt? true}}
   ;; raw-audio-input -> transcription frames (Deepgram streaming STT)
   :transcriptor {:proc deepgram/deepgram-processor
                  :args {:transcription/api-key (secret [:deepgram :api-key])
                         :transcription/interim-results? true
                         :transcription/punctuate? false
                         ;; We use silero for computing VAD, so Deepgram's own
                         ;; VAD events are disabled.
                         :transcription/vad-events? false
                         :transcription/smart-format? true
                         :transcription/model :nova-2
                         :transcription/utterance-end-ms 1000
                         :transcription/language :en}}

   ;; user transcription & llm message frames -> llm-context frames
   ;; responsible for keeping the full conversation history
   :context-aggregator {:proc context/context-aggregator
                        :args {:llm/context llm-context}}

   ;; Takes llm-context frames and produces new llm-text-chunk &
   ;; llm-tool-call-chunk frames
   :llm {:proc openai/openai-llm-process
         :args {:openai/api-key (secret [:openai :new-api-sk])
                :llm/model :gpt-4.1-mini}}

   ;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
   :assistant-context-assembler {:proc context/assistant-context-assembler
                                 :args {}}

   ;; llm-text-chunk -> sentence speak frames (faster for text to speech)
   :llm-sentence-assembler {:proc context/llm-sentence-assembler}

   ;; speak-frames -> audio-output-raw frames (ElevenLabs TTS)
   :tts {:proc xi/elevenlabs-tts-process
         :args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
                :elevenlabs/model-id "eleven_flash_v2_5"
                :elevenlabs/voice-id (secret [:elevenlabs :voice-id])
                :voice/stability 0.5
                :voice/similarity-boost 0.8
                :voice/use-speaker-boost? true
                :pipeline/language :en}}

   ;; audio-output-raw -> smaller audio-output-raw frames (used for sending
   ;; audio in realtime)
   :audio-splitter {:proc transport/audio-splitter
                    :args {:audio.out/duration-ms chunk-duration}}

   ;; speakers out: plays chunks, paced by :audio.out/sending-interval
   :transport-out {:proc transport-out/realtime-speakers-out-processor
                   :args {:audio.out/sending-interval chunk-duration
                          :audio.out/duration-ms chunk-duration}}

   ;; Watches conversation activity; configured with a 5s timeout.
   :activity-monitor {:proc activity-monitor/process
                      :args {::activity-monitor/timeout-ms 5000}}
   ;; NOTE(review): presumably routes system frames (e.g. interrupt) between
   ;; processors via the connections generated in `flow-conns` — confirm in
   ;; simulflow.processors.system-frame-router.
   :system-router {:proc system-router/system-frame-router}})
| 102 | + |
(def flow-conns
  "Channel connections between processors. System-frame routing connections
  are generated from the processor map; the explicit pairs below wire the
  media pipeline: mic -> STT -> context -> LLM -> sentences -> TTS ->
  splitter -> speakers."
  (into (system-router/generate-system-router-connections flow-processors)
        [[[:transport-in :out] [:transcriptor :in]]

         [[:transcriptor :out] [:context-aggregator :in]]
         [[:context-aggregator :out] [:llm :in]]

         ;; Aggregate full context
         [[:llm :out] [:assistant-context-assembler :in]]
         [[:assistant-context-assembler :out] [:context-aggregator :in]]

         ;; Assemble sentence by sentence for fast speech
         [[:llm :out] [:llm-sentence-assembler :in]]
         ;; FIX: this connection was missing. Without it the sentence speak
         ;; frames produced by the assembler never reach TTS, so the agent
         ;; generates no audible speech.
         [[:llm-sentence-assembler :out] [:tts :in]]

         [[:tts :out] [:audio-splitter :in]]
         [[:audio-splitter :out] [:transport-out :in]]

         ;; Activity detection
         [[:activity-monitor :out] [:context-aggregator :in]]
         [[:activity-monitor :out] [:tts :in]]]))
| 122 | + |
(def flow-config
  "Flow description combining the processor map with the connection vector.
  NOTE(review): not referenced by the REPL comment below, which builds the
  flow via `local/make-local-flow` instead — confirm the intended usage."
  {:procs flow-processors
   :conns flow-conns})
| 126 | + |
(comment
  ;; --- REPL workflow for running the example locally ---

  ;; Build the flow with local Silero voice-activity detection.
  (def local-ai (local/make-local-flow {:vad-analyser (silero/create-silero-vad)
                                        :extra-procs {}}))

  ;; Gate for the log-draining loop below; flipped off on stop.
  (defonce flow-started? (atom false))

  ;; Start local ai flow - starts paused
  (let [{:keys [report-chan error-chan]} (flow/start local-ai)]
    (reset! flow-started? true)
    ;; Resume local ai -> you can now speak with the AI
    (flow/resume local-ai)
    ;; Drain report/error channels on a virtual thread so the REPL stays free;
    ;; each map received is logged, tagged :error when it came from error-chan.
    (vthread-loop []
      (when @flow-started?
        (when-let [[msg c] (a/alts!! [report-chan error-chan])]
          (when (map? msg)
            (t/log! (cond-> {:level :debug :id (if (= c error-chan) :error :report)}
                      (= c error-chan) (assoc :error msg)) msg))
          (recur)))))

  ;; Stop the conversation (also terminates the logging loop above)
  (do
    (flow/stop local-ai)
    (reset! flow-started? false))

  ,)
;; 0 commit comments