-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathlocal.clj
More file actions
167 lines (142 loc) · 7.34 KB
/
local.clj
File metadata and controls
167 lines (142 loc) · 7.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
(ns simulflow-examples.local
{:clj-reload/no-unload true}
(:require
[clojure.core.async :as a]
[clojure.core.async.flow :as flow]
[simulflow.async :refer [vthread-loop]]
[simulflow.processors.activity-monitor :as activity-monitor]
[simulflow.processors.deepgram :as deepgram]
[simulflow.processors.elevenlabs :as xi]
[simulflow.processors.llm-context-aggregator :as context]
[simulflow.processors.openai :as openai]
[simulflow.secrets :refer [secret]]
[simulflow.transport :as transport]
[simulflow.transport.in :as transport-in]
[simulflow.transport.out :as transport-out]
[simulflow.utils.core :as u]
[simulflow.vad.silero :as silero]
[taoensso.telemere :as t]))
;; Lower telemere's minimum log level to :debug so processor/frame events
;; are visible while developing locally. NOTE(review): this runs at load
;; time and affects logging globally, not just this namespace.
(t/set-min-level! :debug)
(defn make-local-flow
"Builds (but does not start) a core.async.flow voice-AI pipeline for the
local computer: microphone -> Deepgram transcription -> context aggregation
-> OpenAI LLM -> sentence assembly -> ElevenLabs TTS -> speakers.

Audio is usually encoded as PCM at 16kHz frequency (sample rate) and it is
mono (1 channel). :transport-in & :transport-out don't specify the audio
configuration because these are the defaults. See each process for details.

Options map (all optional, see :or defaults below):
  :llm-context       - initial LLM context map (:messages + :tools); the
                       default includes a sample get_weather tool
  :language          - language keyword passed to transcription and TTS
                       (default :en)
  :chunk-duration-ms - audio chunk size in ms used by the splitter and the
                       speakers-out sending interval (default 20)
  :debug?            - debug flag forwarded to the aggregator/assembler
  :extra-procs       - map deep-merged over the default :procs, so entries
                       here can add or override processors
  :extra-conns       - connection vectors appended to the default :conns

Returns the flow created by `flow/create-flow` (caller must `flow/start`
and `flow/resume` it)."
([] (make-local-flow {}))
([{:keys [llm-context extra-procs extra-conns debug?
language chunk-duration-ms]
:or {llm-context {:messages
[{:role "system"
:content "You are a voice agent operating via phone. Be
concise in your answers. The input you receive comes from a
speech-to-text (transcription) system that isn't always
efficient and may send unclear text. Ask for
clarification when you're unsure what the person said."}]
:tools
;; Sample tool: the :handler is invoked with the parsed
;; arguments map when the LLM emits a matching tool call.
[{:type :function
:function
{:name "get_weather"
:handler (fn [{:keys [town]}] (str "The weather in " town " is 17 degrees celsius"))
:description "Get the current weather of a location"
:parameters {:type :object
:required [:town]
:properties {:town {:type :string
:description "Town for which to retrieve the current weather"}}
:additionalProperties false}
:strict true}}]}
language :en
debug? false
chunk-duration-ms 20
extra-procs {}
extra-conns []}}]
(flow/create-flow
{:procs
;; deep-merge lets :extra-procs override nested args of any default proc
(u/deep-merge
{;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
:transport-in {:proc transport-in/microphone-transport-in
:args {:vad/analyser :vad.analyser/silero}}
;; raw-audio-input -> transcription frames
:transcriptor {:proc deepgram/deepgram-processor
:args {:transcription/api-key (secret [:deepgram :api-key])
:transcription/interim-results? true
:transcription/punctuate? false
:transcription/vad-events? false
:transcription/smart-format? true
:transcription/model :nova-2
:transcription/utterance-end-ms 1000
:transcription/language language}}
;; user transcription & llm message frames -> llm-context frames
;; responsible for keeping the full conversation history
:context-aggregator {:proc context/context-aggregator
:args {:llm/context llm-context
:aggregator/debug? debug?}}
;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
:llm {:proc openai/openai-llm-process
:args {:openai/api-key (secret [:openai :new-api-sk])
:llm/model :gpt-4.1-mini}}
;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
:assistant-context-assembler {:proc context/assistant-context-assembler
:args {:debug? debug?}}
;; llm-text-chunk -> sentence speak frames (faster for text to speech)
:llm-sentence-assembler {:proc context/llm-sentence-assembler}
;; speak-frames -> audio-output-raw frames
:tts {:proc xi/elevenlabs-tts-process
:args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
:elevenlabs/model-id "eleven_flash_v2_5"
:elevenlabs/voice-id (secret [:elevenlabs :voice-id])
:voice/stability 0.5
:voice/similarity-boost 0.8
:voice/use-speaker-boost? true
:pipeline/language language}}
;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
:audio-splitter {:proc transport/audio-splitter
:args {:audio.out/duration-ms chunk-duration-ms}}
;; speakers out
:transport-out {:proc transport-out/realtime-speakers-out-processor
:args {:audio.out/sending-interval chunk-duration-ms
:audio.out/duration-ms chunk-duration-ms}}
;; watches activity on the channels wired below; fires after 5s of silence
:activity-monitor {:proc activity-monitor/process
:args {::activity-monitor/timeout-ms 5000}}}
extra-procs)
;; Each conn is [[from-proc out-port] [to-proc in-port]]
:conns (concat
[[[:transport-in :out] [:transcriptor :in]]
[[:transcriptor :out] [:context-aggregator :in]]
[[:transport-in :sys-out] [:context-aggregator :sys-in]]
[[:context-aggregator :out] [:llm :in]]
;; Aggregate full context
[[:llm :out] [:assistant-context-assembler :in]]
[[:assistant-context-assembler :out] [:context-aggregator :in]]
;; Assemble sentence by sentence for fast speech
[[:llm :out] [:llm-sentence-assembler :in]]
[[:llm-sentence-assembler :sys-out] [:tts :sys-in]]
[[:tts :out] [:audio-splitter :in]]
[[:audio-splitter :out] [:transport-out :in]]
;; Activity detection
[[:transport-out :sys-out] [:activity-monitor :sys-in]]
[[:transport-in :sys-out] [:activity-monitor :sys-in]]
[[:transcriptor :sys-out] [:activity-monitor :sys-in]]
[[:activity-monitor :out] [:context-aggregator :in]]
[[:activity-monitor :out] [:tts :in]]]
extra-conns)})))
(comment
  ;; Rich-comment REPL session: build, start, observe, and stop the flow.
  (def local-ai (make-local-flow))

  ;; Guards the logging loop below so it exits when we stop the flow.
  (defonce flow-started? (atom false))

  ;; Start local ai flow - starts paused
  (let [{:keys [report-chan error-chan]} (flow/start local-ai)]
    (reset! flow-started? true)
    ;; Resume local ai -> you can now speak with the AI
    (flow/resume local-ai)
    ;; Drain report/error channels on a virtual thread and log each message.
    (vthread-loop []
      (when @flow-started?
        (let [[msg c] (a/alts!! [report-chan error-chan])]
          ;; a/alts!! yields [nil ch] when a channel closes (flow stopped).
          ;; The previous when-let on the (always truthy) vector hot-spun
          ;; forever in that case; exit the loop on nil instead.
          (when (some? msg)
            (when (map? msg)
              (t/log! (cond-> {:level :debug
                               :id (if (= c error-chan) :error :report)}
                        (= c error-chan) (assoc :error msg))
                      msg))
            (recur))))))

  ;; Stop the conversation
  (do
    (flow/stop local-ai)
    (reset! flow-started? false))
  ,)