@@ -37,94 +37,146 @@ This project's status is *experimental*. Expect breaking changes.
3737
3838<a id="org71c1ebd"></a>
3939
40- ## Quick Start: Twilio WebSocket Example
41-
40+ ## Quick Start: Local example
4241``` clojure
43- (defn make-twilio-flow
44- [in out]
45- (let [encoding :ulaw
46- sample-rate 8000
47- sample-size-bits 8
48- channels 1 ;; mono
49- chunk-duration-ms 20
50- llm-context {:messages [{:role " system"
51- :content " You are a voice agent operating via phone. Be concise. The input you receive comes from a speech-to-text (transcription) system that isn't always efficient and may send unclear text. Ask for clarification when you're unsure what the person said." }]
52- :tools [{:type :function
53- :function
54- {:name " get_weather"
55- :description " Get the current weather of a location"
56- :parameters {:type :object
57- :required [:town ]
58- :properties {:town {:type :string
59- :description " Town for which to retrieve the current weather" }}
60- :additionalProperties false }
61- :strict true }}]}]
62- {:procs
63- {:transport-in {:proc transport/twilio-transport-in
64- :args {:transport/in-ch in}}
65- :deepgram-transcriptor {:proc asr/deepgram-processor
66- :args {:transcription/api-key (secret [:deepgram :api-key ])
67- :transcription/interim-results? true
68- :transcription/vad-events? true
69- :transcription/smart-format? true
70- :transcription/model :nova-2
71- :transcription/utterance-end-ms 1000
72- :transcription/language :en
73- :transcription/encoding :mulaw
74- :transcription/sample-rate sample-rate}}
75- :user-context-aggregator {:proc context/user-aggregator-process
76- :args {:llm/context llm-context}}
77- :assistant-context-aggregator {:proc context/assistant-context-aggregator
78- :args {:llm/context llm-context
79- :debug? true
80- :llm/registered-tools {" get_weather" {:async false
81- :tool (fn [{:keys [town]}] (str " The weather in " town " is 17 degrees celsius" ))}}}}
82- :llm {:proc llm/openai-llm-process
83- :args {:openai/api-key (secret [:openai :new-api-sk ])
84- :llm/model " gpt-4o-mini" }}
85-
86- :llm-sentence-assembler {:proc (flow/step-process #'context/sentence-assembler)}
87- :tts {:proc tts/elevenlabs-tts-process
88- :args {:elevenlabs/api-key (secret [:elevenlabs :api-key ])
89- :elevenlabs/model-id " eleven_flash_v2_5"
90- :elevenlabs/voice-id " 7sJPxFeMXAVWZloGIqg2"
91- :voice/stability 0.5
92- :voice/similarity-boost 0.8
93- :voice/use-speaker-boost? true
94- :flow/language :en
95- :audio.out/encoding encoding
96- :audio.out/sample-rate sample-rate}}
97- :transport-out {:proc transport/realtime-transport-out-processor
98- :args {:transport/out-chan out}}}
99-
100- :conns [[[:transport-in :sys-out ] [:deepgram-transcriptor :sys-in ]]
101- [[:transport-in :out ] [:deepgram-transcriptor :in ]]
102- [[:deepgram-transcriptor :out ] [:user-context-aggregator :in ]]
103- [[:user-context-aggregator :out ] [:llm :in ]]
104- [[:llm :out ] [:assistant-context-aggregator :in ]]
105-
106- ;; cycle so that context aggregators are in sync
107- [[:assistant-context-aggregator :out ] [:user-context-aggregator :in ]]
108- [[:user-context-aggregator :out ] [:assistant-context-aggregator :in ]]
109-
110- [[:llm :out ] [:llm-sentence-assembler :in ]]
111- [[:llm-sentence-assembler :out ] [:tts :in ]]
112-
113- [[:tts :out ] [:transport-out :in ]]
114- [[:transport-in :sys-out ] [:transport-out :sys-in ]]
115- [[:audio-splitter :out ] [:realtime-out :in ]]]}))
116-
117- (defn start-flow []
118- (let [in (a/chan 1024 )
119- out (a/chan 1024 )
120- flow (flow/create-flow (make-twilio-flow in out))]
121- (flow/start flow)
122- {:in in :out out :flow flow}))
123-
124- (defn stop-flow [{:keys [flow in out]}]
125- (flow/stop flow)
126- (a/close! in)
127- (a/close! out))
42+ (ns voice-fn-examples.local
43+ (:require
44+ [clojure.core.async :as a]
45+ [clojure.core.async.flow :as flow]
46+ [taoensso.telemere :as t]
47+ [voice-fn.processors.deepgram :as asr]
48+ [voice-fn.processors.elevenlabs :as tts]
49+ [voice-fn.processors.llm-context-aggregator :as context]
50+ [voice-fn.processors.openai :as llm]
51+ [voice-fn.secrets :refer [secret]]
52+ [voice-fn.transport :as transport]
53+ [voice-fn.utils.core :as u]))
54+
55+ (defn make-local-flow
56+ " This example showcases a voice AI agent for the local computer. Audio is
57+ usually encoded as PCM at 16kHz frequency (sample rate) and it is mono (1
58+ channel).
59+
60+ :transport-in & :transport-out don't specify the audio configuration because
61+ these are the defaults. See each process for details
62+ "
63+ ([] (make-local-flow {}))
64+ ([{:keys [llm-context extra-procs extra-conns encoding debug?
65+ sample-rate language sample-size-bits channels chunk-duration-ms]
66+ :or {llm-context {:messages [{:role " system"
67+ :content " You are a helpful assistant " }]}
68+ encoding :pcm-signed
69+ sample-rate 16000
70+ sample-size-bits 16
71+ channels 1
72+ chunk-duration-ms 20
73+ language :en
74+ debug? false
75+ extra-procs {}
76+ extra-conns []}}]
77+
78+ (flow/create-flow
79+ {:procs
80+ (u/deep-merge
81+ {;; Capture audio from microphone and send raw-audio-input frames further in the pipeline
82+ :transport-in {:proc transport/microphone-transport-in
83+ :args {:audio-in/sample-rate sample-rate
84+ :audio-in/channels channels
85+ :audio-in/sample-size-bits sample-size-bits}}
86+ ;; raw-audio-input -> transcription frames
87+ :transcriptor {:proc asr/deepgram-processor
88+ :args {:transcription/api-key (secret [:deepgram :api-key ])
89+ :transcription/interim-results? true
90+ :transcription/punctuate? false
91+ :transcription/vad-events? true
92+ :transcription/smart-format? true
93+ :transcription/model :nova-2
94+ :transcription/utterance-end-ms 1000
95+ :transcription/language language
96+ :transcription/encoding encoding
97+ :transcription/sample-rate sample-rate}}
98+
99+ ;; user transcription & llm message frames -> llm-context frames
100+ ;; responsible for keeping the full conversation history
101+ :context-aggregator {:proc context/context-aggregator
102+ :args {:llm/context llm-context
103+ :aggregator/debug? debug?}}
104+
105+ ;; Takes llm-context frames and produces new llm-text-chunk & llm-tool-call-chunk frames
106+ :llm {:proc llm/openai-llm-process
107+ :args {:openai/api-key (secret [:openai :new-api-sk ])
108+ :llm/model " gpt-4o-mini" }}
109+
110+ ;; llm-text-chunk & llm-tool-call-chunk -> llm-context-messages-append frames
111+ :assistant-context-assembler {:proc context/assistant-context-assembler
112+ :args {:debug? debug?}}
113+
114+ ;; llm-text-chunk -> sentence speak frames (faster for text to speech)
115+ :llm-sentence-assembler {:proc context/llm-sentence-assembler}
116+
117+ ;; speak-frames -> audio-output-raw frames
118+ :tts {:proc tts/elevenlabs-tts-process
119+ :args {:elevenlabs/api-key (secret [:elevenlabs :api-key ])
120+ :elevenlabs/model-id " eleven_flash_v2_5"
121+ :elevenlabs/voice-id " 7sJPxFeMXAVWZloGIqg2"
122+ :voice/stability 0.5
123+ :voice/similarity-boost 0.8
124+ :voice/use-speaker-boost? true
125+ :flow/language language
126+ :audio.out/encoding encoding
127+ :audio.out/sample-rate sample-rate}}
128+
129+ ;; audio-output-raw -> smaller audio-output-raw frames (used for sending audio in realtime)
130+ :audio-splitter {:proc transport/audio-splitter
131+ :args {:audio.out/sample-rate sample-rate
132+ :audio.out/sample-size-bits sample-size-bits
133+ :audio.out/channels channels
134+ :audio.out/duration-ms chunk-duration-ms}}
135+
136+ ;; speakers out
137+ :transport-out {:proc transport/realtime-speakers-out-processor
138+ :args {:audio.out/sample-rate sample-rate
139+ :audio.out/sample-size-bits sample-size-bits
140+ :audio.out/channels channels
141+ :audio.out/duration-ms chunk-duration-ms}}}
142+ extra-procs)
143+ :conns (concat
144+ [[[:transport-in :out ] [:transcriptor :in ]]
145+
146+ [[:transcriptor :out ] [:context-aggregator :in ]]
147+ [[:context-aggregator :out ] [:llm :in ]]
148+
149+ ;; Aggregate full context
150+ [[:llm :out ] [:assistant-context-assembler :in ]]
151+ [[:assistant-context-assembler :out ] [:context-aggregator :in ]]
152+
153+ ;; Assemble sentence by sentence for fast speech
154+ [[:llm :out ] [:llm-sentence-assembler :in ]]
155+ [[:llm-sentence-assembler :out ] [:tts :in ]]
156+
157+ [[:tts :out ] [:audio-splitter :in ]]
158+ [[:audio-splitter :out ] [:transport-out :in ]]]
159+ extra-conns)})))
160+
161+ (def local-ai (make-local-flow ))
162+
163+ (comment
164+
165+ ;; Start local ai flow - starts paused
166+ (let [{:keys [report-chan error-chan]} (flow/start local-ai)]
167+ (a/go-loop []
168+ (when-let [[msg c] (a/alts! [report-chan error-chan])]
169+ (when (map? msg)
170+ (t/log! {:level :debug :id (if (= c error-chan) :error :report )} msg))
171+ (recur ))))
172+
173+ ;; Resume local ai -> you can now speak with the AI
174+ (flow/resume local-ai)
175+
176+ ;; Stop the conversation
177+ (flow/stop local-ai)
178+
179+ ,)
128180```
129181
130182Which roughly translates to:
0 commit comments