@@ -37,94 +37,146 @@ This project's status is *experimental*. Expect breaking changes.
3737
3838<a id="org71c1ebd"></a>
3939
40- ## Quick Start: Twilio WebSocket Example
41-
40+ ## Quick Start: Local example
4241``` clojure
43- (defn make-twilio-flow
44- [in out]
45- (let [encoding :ulaw
46- sample-rate 8000
47- sample-size-bits 8
48- channels 1 ;; mono
49- chunk-duration-ms 20
50- llm-context {:messages [{:role " system"
51- :content " You are a voice agent operating via phone. Be concise. The input you receive comes from a speech-to-text (transcription) system that isn't always efficient and may send unclear text. Ask for clarification when you're unsure what the person said." }]
52- :tools [{:type :function
53- :function
54- {:name " get_weather"
55- :description " Get the current weather of a location"
56- :parameters {:type :object
57- :required [:town ]
58- :properties {:town {:type :string
59- :description " Town for which to retrieve the current weather" }}
60- :additionalProperties false }
61- :strict true }}]}]
62- {:procs
63- {:transport-in {:proc transport/twilio-transport-in
64- :args {:transport/in-ch in}}
65- :deepgram-transcriptor {:proc asr/deepgram-processor
66- :args {:transcription/api-key (secret [:deepgram :api-key ])
67- :transcription/interim-results? true
68- :transcription/vad-events? true
69- :transcription/smart-format? true
70- :transcription/model :nova-2
71- :transcription/utterance-end-ms 1000
72- :transcription/language :en
73- :transcription/encoding :mulaw
74- :transcription/sample-rate sample-rate}}
75- :user-context-aggregator {:proc context/user-aggregator-process
76- :args {:llm/context llm-context}}
77- :assistant-context-aggregator {:proc context/assistant-context-aggregator
78- :args {:llm/context llm-context
79- :debug? true
80- :llm/registered-tools {" get_weather" {:async false
81- :tool (fn [{:keys [town]}] (str " The weather in " town " is 17 degrees celsius" ))}}}}
82- :llm {:proc llm/openai-llm-process
83- :args {:openai/api-key (secret [:openai :new-api-sk ])
84- :llm/model " gpt-4o-mini" }}
85-
86- :llm-sentence-assembler {:proc (flow/step-process #'context/sentence-assembler)}
87- :tts {:proc tts/elevenlabs-tts-process
88- :args {:elevenlabs/api-key (secret [:elevenlabs :api-key ])
89- :elevenlabs/model-id " eleven_flash_v2_5"
90- :elevenlabs/voice-id " 7sJPxFeMXAVWZloGIqg2"
91- :voice/stability 0.5
92- :voice/similarity-boost 0.8
93- :voice/use-speaker-boost? true
94- :flow/language :en
95- :audio.out/encoding encoding
96- :audio.out/sample-rate sample-rate}}
97- :transport-out {:proc transport/realtime-transport-out-processor
98- :args {:transport/out-chan out}}}
99-
100- :conns [[[:transport-in :sys-out ] [:deepgram-transcriptor :sys-in ]]
101- [[:transport-in :out ] [:deepgram-transcriptor :in ]]
102- [[:deepgram-transcriptor :out ] [:user-context-aggregator :in ]]
103- [[:user-context-aggregator :out ] [:llm :in ]]
104- [[:llm :out ] [:assistant-context-aggregator :in ]]
105-
106- ;; cycle so that context aggregators are in sync
107- [[:assistant-context-aggregator :out ] [:user-context-aggregator :in ]]
108- [[:user-context-aggregator :out ] [:assistant-context-aggregator :in ]]
109-
110- [[:llm :out ] [:llm-sentence-assembler :in ]]
111- [[:llm-sentence-assembler :out ] [:tts :in ]]
112-
113- [[:tts :out ] [:transport-out :in ]]
114- [[:transport-in :sys-out ] [:transport-out :sys-in ]]
115- [[:audio-splitter :out ] [:realtime-out :in ]]]}))
116-
117- (defn start-flow []
118- (let [in (a/chan 1024 )
119- out (a/chan 1024 )
120- flow (flow/create-flow (make-twilio-flow in out))]
121- (flow/start flow)
122- {:in in :out out :flow flow}))
123-
124- (defn stop-flow [{:keys [flow in out]}]
125- (flow/stop flow)
126- (a/close! in)
127- (a/close! out))
42+ (ns voice-fn-examples.local
43+ (:require
44+ [clojure.core.async :as a]
45+ [clojure.core.async.flow :as flow]
46+ [taoensso.telemere :as t]
47+ [voice-fn.processors.deepgram :as asr]
48+ [voice-fn.processors.elevenlabs :as tts]
49+ [voice-fn.processors.llm-context-aggregator :as context]
50+ [voice-fn.processors.openai :as llm]
51+ [voice-fn.secrets :refer [secret]]
52+ [voice-fn.transport :as transport]
53+ [voice-fn.utils.core :as u]))
54+
55+ (defn make-local-flow
56+ " This example showcases a voice AI agent for the local computer. Audio is
57+ usually encoded as PCM at 16kHz frequency (sample rate) and it is mono (1
58+ channel).
59+
60+ :transport-in & :transport-out don't specify the audio configuration because
61+ these are the defaults. See each process for details
62+ "
63+ ([] (make-local-flow {}))
64+ ([{:keys [llm-context extra-procs extra-conns encoding debug?
65+ sample-rate language sample-size-bits channels chunk-duration-ms]
66+ :or {llm-context {:messages [{:role " system"
67+ :content " You are a helpful assistant " }]}
68+ encoding :pcm-signed
69+ sample-rate 16000
70+ sample-size-bits 16
71+ channels 1
72+ chunk-duration-ms 20
73+ language :en
74+ debug? false
75+ extra-procs {}
76+ extra-conns []}}]
77+
78+ (flow/create-flow
79+ {:procs
80+ (u/deep-merge
81+ {;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
82+ :transport-in {:proc transport/microphone-transport-in
83+ :args {:audio-in/sample-rate sample-rate
84+ :audio-in/channels channels
85+ :audio-in/sample-size-bits sample-size-bits}}
86+ ;; raw-audio-input -> transcription frames
87+ :transcriptor {:proc asr/deepgram-processor
88+ :args {:transcription/api-key (secret [:deepgram :api-key ])
89+ :transcription/interim-results? true
90+ :transcription/punctuate? false
91+ :transcription/vad-events? true
92+ :transcription/smart-format? true
93+ :transcription/model :nova-2
94+ :transcription/utterance-end-ms 1000
95+ :transcription/language language
96+ :transcription/encoding encoding
97+ :transcription/sample-rate sample-rate}}
98+
99+ ;; user transcription & llm message frames -> llm-context frames
100+ ;; responsible for keeping the full conversation history
101+ :context-aggregator {:proc context/context-aggregator
102+ :args {:llm/context llm-context
103+ :aggregator/debug? debug?}}
104+
105+ ;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
106+ :llm {:proc llm/openai-llm-process
107+ :args {:openai/api-key (secret [:openai :new-api-sk ])
108+ :llm/model " gpt-4o-mini" }}
109+
110+ ;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
111+ :assistant-context-assembler {:proc context/assistant-context-assembler
112+ :args {:debug? debug?}}
113+
114+ ;; llm-text-chunk -> sentence speak frames (faster for text to speech)
115+ :llm-sentence-assembler {:proc context/llm-sentence-assembler}
116+
117+ ;; speak-frames -> audio-output-raw frames
118+ :tts {:proc tts/elevenlabs-tts-process
119+ :args {:elevenlabs/api-key (secret [:elevenlabs :api-key ])
120+ :elevenlabs/model-id " eleven_flash_v2_5"
121+ :elevenlabs/voice-id " 7sJPxFeMXAVWZloGIqg2"
122+ :voice/stability 0.5
123+ :voice/similarity-boost 0.8
124+ :voice/use-speaker-boost? true
125+ :flow/language language
126+ :audio.out/encoding encoding
127+ :audio.out/sample-rate sample-rate}}
128+
129+ ;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
130+ :audio-splitter {:proc transport/audio-splitter
131+ :args {:audio.out/sample-rate sample-rate
132+ :audio.out/sample-size-bits sample-size-bits
133+ :audio.out/channels channels
134+ :audio.out/duration-ms chunk-duration-ms}}
135+
136+ ;; speakers out
137+ :transport-out {:proc transport/realtime-speakers-out-processor
138+ :args {:audio.out/sample-rate sample-rate
139+ :audio.out/sample-size-bits sample-size-bits
140+ :audio.out/channels channels
141+ :audio.out/duration-ms chunk-duration-ms}}}
142+ extra-procs)
143+ :conns (concat
144+ [[[:transport-in :out ] [:transcriptor :in ]]
145+
146+ [[:transcriptor :out ] [:context-aggregator :in ]]
147+ [[:context-aggregator :out ] [:llm :in ]]
148+
149+ ;; Aggregate full context
150+ [[:llm :out ] [:assistant-context-assembler :in ]]
151+ [[:assistant-context-assembler :out ] [:context-aggregator :in ]]
152+
153+ ;; Assemble sentence by sentence for fast speech
154+ [[:llm :out ] [:llm-sentence-assembler :in ]]
155+ [[:llm-sentence-assembler :out ] [:tts :in ]]
156+
157+ [[:tts :out ] [:audio-splitter :in ]]
158+ [[:audio-splitter :out ] [:transport-out :in ]]]
159+ extra-conns)})))
160+
161+ (def local-ai (make-local-flow ))
162+
163+ (comment
164+
165+ ;; Start local ai flow - starts paused
166+ (let [{:keys [report-chan error-chan]} (flow/start local-ai)]
167+ (a/go-loop []
168+ (when-let [[msg c] (a/alts! [report-chan error-chan])]
169+ (when (map? msg)
170+ (t/log! {:level :debug :id (if (= c error-chan) :error :report )} msg))
171+ (recur ))))
172+
173+ ;; Resume local ai -> you can now speak with the AI
174+ (flow/resume local-ai)
175+
176+ ;; Stop the conversation
177+ (flow/stop local-ai)
178+
179+ ,)
128180```
129181
130182Which roughly translates to:
0 commit comments