Skip to content

Commit 05c0ea6

Browse files
committed
Change example from README
1 parent aea6457 commit 05c0ea6

File tree

2 files changed

+143
-87
lines changed

2 files changed

+143
-87
lines changed

README.md

Lines changed: 139 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -37,94 +37,146 @@ This project's status is *experimental*. Expect breaking changes.
3737

3838
<a id="org71c1ebd"></a>
3939

40-
## Quick Start: Twilio WebSocket Example
41-
40+
## Quick Start: Local example
4241
```clojure
43-
(defn make-twilio-flow
44-
[in out]
45-
(let [encoding :ulaw
46-
sample-rate 8000
47-
sample-size-bits 8
48-
channels 1 ;; mono
49-
chunk-duration-ms 20
50-
llm-context {:messages [{:role "system"
51-
:content "You are a voice agent operating via phone. Be concise. The input you receive comes from a speech-to-text (transcription) system that isn't always efficient and may send unclear text. Ask for clarification when you're unsure what the person said."}]
52-
:tools [{:type :function
53-
:function
54-
{:name "get_weather"
55-
:description "Get the current weather of a location"
56-
:parameters {:type :object
57-
:required [:town]
58-
:properties {:town {:type :string
59-
:description "Town for which to retrieve the current weather"}}
60-
:additionalProperties false}
61-
:strict true}}]}]
62-
{:procs
63-
{:transport-in {:proc transport/twilio-transport-in
64-
:args {:transport/in-ch in}}
65-
:deepgram-transcriptor {:proc asr/deepgram-processor
66-
:args {:transcription/api-key (secret [:deepgram :api-key])
67-
:transcription/interim-results? true
68-
:transcription/vad-events? true
69-
:transcription/smart-format? true
70-
:transcription/model :nova-2
71-
:transcription/utterance-end-ms 1000
72-
:transcription/language :en
73-
:transcription/encoding :mulaw
74-
:transcription/sample-rate sample-rate}}
75-
:user-context-aggregator {:proc context/user-aggregator-process
76-
:args {:llm/context llm-context}}
77-
:assistant-context-aggregator {:proc context/assistant-context-aggregator
78-
:args {:llm/context llm-context
79-
:debug? true
80-
:llm/registered-tools {"get_weather" {:async false
81-
:tool (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))}}}}
82-
:llm {:proc llm/openai-llm-process
83-
:args {:openai/api-key (secret [:openai :new-api-sk])
84-
:llm/model "gpt-4o-mini"}}
85-
86-
:llm-sentence-assembler {:proc (flow/step-process #'context/sentence-assembler)}
87-
:tts {:proc tts/elevenlabs-tts-process
88-
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
89-
:elevenlabs/model-id "eleven_flash_v2_5"
90-
:elevenlabs/voice-id "7sJPxFeMXAVWZloGIqg2"
91-
:voice/stability 0.5
92-
:voice/similarity-boost 0.8
93-
:voice/use-speaker-boost? true
94-
:flow/language :en
95-
:audio.out/encoding encoding
96-
:audio.out/sample-rate sample-rate}}
97-
:transport-out {:proc transport/realtime-transport-out-processor
98-
:args {:transport/out-chan out}}}
99-
100-
:conns [[[:transport-in :sys-out] [:deepgram-transcriptor :sys-in]]
101-
[[:transport-in :out] [:deepgram-transcriptor :in]]
102-
[[:deepgram-transcriptor :out] [:user-context-aggregator :in]]
103-
[[:user-context-aggregator :out] [:llm :in]]
104-
[[:llm :out] [:assistant-context-aggregator :in]]
105-
106-
;; cycle so that context aggregators are in sync
107-
[[:assistant-context-aggregator :out] [:user-context-aggregator :in]]
108-
[[:user-context-aggregator :out] [:assistant-context-aggregator :in]]
109-
110-
[[:llm :out] [:llm-sentence-assembler :in]]
111-
[[:llm-sentence-assembler :out] [:tts :in]]
112-
113-
[[:tts :out] [:transport-out :in]]
114-
[[:transport-in :sys-out] [:transport-out :sys-in]]
115-
[[:audio-splitter :out] [:realtime-out :in]]]}))
116-
117-
(defn start-flow []
118-
(let [in (a/chan 1024)
119-
out (a/chan 1024)
120-
flow (flow/create-flow (make-twilio-flow in out))]
121-
(flow/start flow)
122-
{:in in :out out :flow flow}))
123-
124-
(defn stop-flow [{:keys [flow in out]}]
125-
(flow/stop flow)
126-
(a/close! in)
127-
(a/close! out))
42+
(ns voice-fn-examples.local
43+
(:require
44+
[clojure.core.async :as a]
45+
[clojure.core.async.flow :as flow]
46+
[taoensso.telemere :as t]
47+
[voice-fn.processors.deepgram :as asr]
48+
[voice-fn.processors.elevenlabs :as tts]
49+
[voice-fn.processors.llm-context-aggregator :as context]
50+
[voice-fn.processors.openai :as llm]
51+
[voice-fn.secrets :refer [secret]]
52+
[voice-fn.transport :as transport]
53+
[voice-fn.utils.core :as u]))
54+
55+
(defn make-local-flow
56+
"This example showcases a voice AI agent for the local computer. Audio is
57+
usually encoded as PCM at 16kHz frequency (sample rate) and it is mono (1
58+
channel).
59+
60+
:transport-in & :transport-out are given the audio configuration explicitly,
61+
although these values are the defaults. See each process for details.
62+
"
63+
([] (make-local-flow {}))
64+
([{:keys [llm-context extra-procs extra-conns encoding debug?
65+
sample-rate language sample-size-bits channels chunk-duration-ms]
66+
:or {llm-context {:messages [{:role "system"
67+
:content "You are a helpful assistant "}]}
68+
encoding :pcm-signed
69+
sample-rate 16000
70+
sample-size-bits 16
71+
channels 1
72+
chunk-duration-ms 20
73+
language :en
74+
debug? false
75+
extra-procs {}
76+
extra-conns []}}]
77+
78+
(flow/create-flow
79+
{:procs
80+
(u/deep-merge
81+
{;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
82+
:transport-in {:proc transport/microphone-transport-in
83+
:args {:audio-in/sample-rate sample-rate
84+
:audio-in/channels channels
85+
:audio-in/sample-size-bits sample-size-bits}}
86+
;; raw-audio-input -> transcription frames
87+
:transcriptor {:proc asr/deepgram-processor
88+
:args {:transcription/api-key (secret [:deepgram :api-key])
89+
:transcription/interim-results? true
90+
:transcription/punctuate? false
91+
:transcription/vad-events? true
92+
:transcription/smart-format? true
93+
:transcription/model :nova-2
94+
:transcription/utterance-end-ms 1000
95+
:transcription/language language
96+
:transcription/encoding encoding
97+
:transcription/sample-rate sample-rate}}
98+
99+
;; user transcription & llm message frames -> llm-context frames
100+
;; responsible for keeping the full conversation history
101+
:context-aggregator {:proc context/context-aggregator
102+
:args {:llm/context llm-context
103+
:aggregator/debug? debug?}}
104+
105+
;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
106+
:llm {:proc llm/openai-llm-process
107+
:args {:openai/api-key (secret [:openai :new-api-sk])
108+
:llm/model "gpt-4o-mini"}}
109+
110+
;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
111+
:assistant-context-assembler {:proc context/assistant-context-assembler
112+
:args {:debug? debug?}}
113+
114+
;; llm-text-chunk -> sentence speak frames (faster for text to speech)
115+
:llm-sentence-assembler {:proc context/llm-sentence-assembler}
116+
117+
;; speak-frames -> audio-output-raw frames
118+
:tts {:proc tts/elevenlabs-tts-process
119+
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
120+
:elevenlabs/model-id "eleven_flash_v2_5"
121+
:elevenlabs/voice-id "7sJPxFeMXAVWZloGIqg2"
122+
:voice/stability 0.5
123+
:voice/similarity-boost 0.8
124+
:voice/use-speaker-boost? true
125+
:flow/language language
126+
:audio.out/encoding encoding
127+
:audio.out/sample-rate sample-rate}}
128+
129+
;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
130+
:audio-splitter {:proc transport/audio-splitter
131+
:args {:audio.out/sample-rate sample-rate
132+
:audio.out/sample-size-bits sample-size-bits
133+
:audio.out/channels channels
134+
:audio.out/duration-ms chunk-duration-ms}}
135+
136+
;; speakers out
137+
:transport-out {:proc transport/realtime-speakers-out-processor
138+
:args {:audio.out/sample-rate sample-rate
139+
:audio.out/sample-size-bits sample-size-bits
140+
:audio.out/channels channels
141+
:audio.out/duration-ms chunk-duration-ms}}}
142+
extra-procs)
143+
:conns (concat
144+
[[[:transport-in :out] [:transcriptor :in]]
145+
146+
[[:transcriptor :out] [:context-aggregator :in]]
147+
[[:context-aggregator :out] [:llm :in]]
148+
149+
;; Aggregate full context
150+
[[:llm :out] [:assistant-context-assembler :in]]
151+
[[:assistant-context-assembler :out] [:context-aggregator :in]]
152+
153+
;; Assemble sentence by sentence for fast speech
154+
[[:llm :out] [:llm-sentence-assembler :in]]
155+
[[:llm-sentence-assembler :out] [:tts :in]]
156+
157+
[[:tts :out] [:audio-splitter :in]]
158+
[[:audio-splitter :out] [:transport-out :in]]]
159+
extra-conns)})))
160+
161+
(def local-ai (make-local-flow))
162+
163+
(comment
164+
165+
;; Start local ai flow - starts paused
166+
(let [{:keys [report-chan error-chan]} (flow/start local-ai)]
167+
(a/go-loop []
168+
(when-let [[msg c] (a/alts! [report-chan error-chan])]
169+
(when (map? msg)
170+
(t/log! {:level :debug :id (if (= c error-chan) :error :report)} msg))
171+
(recur))))
172+
173+
;; Resume local ai -> you can now speak with the AI
174+
(flow/resume local-ai)
175+
176+
;; Stop the conversation
177+
(flow/stop local-ai)
178+
179+
,)
128180
```
129181

130182
Which roughly translates to:

examples/src/voice_fn_examples/local.clj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,14 +121,18 @@
121121

122122
(comment
123123

124+
;; Start local ai flow - starts paused
124125
(let [{:keys [report-chan error-chan]} (flow/start local-ai)]
125126
(a/go-loop []
126127
(when-let [[msg c] (a/alts! [report-chan error-chan])]
127128
(when (map? msg)
128129
(t/log! {:level :debug :id (if (= c error-chan) :error :report)} msg))
129130
(recur))))
130131

132+
;; Resume local ai -> you can now speak with the AI
131133
(flow/resume local-ai)
134+
135+
;; Stop the conversation
132136
(flow/stop local-ai)
133137

134138
,)

0 commit comments

Comments
 (0)