|
13 | 13 | [voice-fn.processors.llm-context-aggregator :as ca] |
14 | 14 | [voice-fn.processors.openai :as openai :refer [OpenAILLMConfigSchema]] |
15 | 15 | [voice-fn.secrets :refer [secret]] |
| 16 | + [voice-fn.transport.async :refer [mono-time]] |
| 17 | + [voice-fn.transport.protocols :as tp] |
16 | 18 | [voice-fn.transport.serializers :refer [make-twilio-serializer]] |
| 19 | + [voice-fn.utils.audio :as au] |
17 | 20 | [voice-fn.utils.core :as u]) |
18 | 21 | (:import |
19 | 22 | (java.nio HeapCharBuffer))) |
20 | 23 |
|
21 | 24 | (t/set-min-level! :debug) |
22 | 25 |
|
23 | | -(def transport-in |
| 26 | +(def twilio-transport-in |
24 | 27 | (flow/process |
25 | | - {:describe (fn [] {:ins {:in "Channel for audio input "} |
26 | | - :outs {:sys-out "Channel for system messages that have priority" |
27 | | - :out "Channel on which audio frames are put"}}) |
| 28 | + {:describe (fn [] {:outs {:sys-out "Channel for system messages that have priority" |
| 29 | + :out "Channel on which audio frames are put"} |
| 30 | + :params {:transport/in-ch "Channel from which input comes"}}) |
| 31 | + |
| 32 | + :init (fn [{:transport/keys [in-ch]}] |
| 33 | + {::flow/in-ports {:twilio-in in-ch}}) |
28 | 34 |
|
29 | 35 | :transform (fn [state _ input] |
30 | 36 | (let [data (u/parse-if-json input)] |
|
101 | 107 | ::flow/out-ports {:ws-write ws-write-chan}})) |
102 | 108 |
|
103 | 109 | ;; Close ws when pipeline stops |
104 | | - :transition (fn [{:websocket/keys [conn] :as state} transition] |
| 110 | + :transition (fn [{:websocket/keys [conn] |
| 111 | + ::flow/keys [in-ports out-ports] :as state} transition] |
105 | 112 | (t/log! {:level :debug} ["TRANSITION" transition]) |
106 | 113 | (when (= transition ::flow/stop) |
107 | 114 | (t/log! {:id :deepgram-transcriptor :level :info} "Closing transcription websocket connection") |
108 | 115 | (reset! (:websocket/alive? state) false) |
109 | 116 | (when conn |
110 | 117 | (ws/send! conn deepgram/close-connection-payload) |
111 | 118 | (ws/close! conn)) |
112 | | - |
| 119 | + (doseq [port (concat (vals in-ports) (vals out-ports))] |
| 120 | + (a/close! port)) |
113 | 121 | state) |
114 | 122 | state) |
115 | 123 |
|
|
181 | 189 | :websocket/alive? alive? |
182 | 190 | ::flow/in-ports {:ws-read ws-read} |
183 | 191 | ::flow/out-ports {:ws-write ws-write}})) |
184 | | - :transition (fn [{:websocket/keys [conn] :as state} transition] |
| 192 | + :transition (fn [{:websocket/keys [conn] |
| 193 | + ::flow/keys [in-ports out-ports] |
| 194 | + :as state} transition] |
185 | 195 | (when (= transition ::flow/stop) |
186 | 196 | (t/log! {:id :elevenlabs :level :info} "Closing tts websocket connection") |
187 | 197 | (reset! (:websocket/alive? state) false) |
188 | 198 | (when conn |
189 | 199 | (ws/send! conn xi/close-stream-message) |
190 | | - (ws/close! conn))) |
| 200 | + (ws/close! conn)) |
| 201 | + (doseq [port (concat (vals in-ports) (vals out-ports))] |
| 202 | + (a/close! port))) |
191 | 203 | state) |
192 | 204 |
|
193 | 205 | :transform (fn [{:audio/keys [acc] :as state} in-name msg] |
|
235 | 247 | :llm/max-completion-tokens "Optional Max tokens in completion" |
236 | 248 | :llm/extra "Optional extra model parameters"} |
237 | 249 | :workload :io |
| 250 | + :transition (fn [{::flow/keys [in-ports out-ports]} transition] |
| 251 | + (when (= transition ::flow/stop) |
| 252 | + (doseq [port (concat (vals in-ports) (vals out-ports))] |
| 253 | + (a/close! port)))) |
238 | 254 | :init (fn [params] |
239 | 255 | (let [state (m/decode OpenAILLMConfigSchema params mt/default-value-transformer) |
240 | 256 | llm-write (a/chan 100) |
|
269 | 285 | [{:acc accumulator} {:out [(frame/speak-frame sentence)]}] |
270 | 286 | [{:acc accumulator}]))))) |
271 | 287 |
|
(def audio-splitter
  "Splits raw audio frames into chunks of :audio.out/chunk-size bytes so that
  downstream transports can stream them at a steady pace. The chunk size is
  either given directly or computed from sample-rate/sample-size/channels and
  the desired chunk duration."
  (flow/process
    {:describe (fn [] {:ins {:in "Channel for raw audio frames"}
                       :outs {:out "Channel for audio frames split by chunk size"}
                       ;; NOTE(review): :params belongs inside the :describe
                       ;; return map (as in the other processors in this file);
                       ;; a top-level :params key is ignored by flow/process.
                       :params {:audio.out/chunk-size "The chunk size by which to split each audio
frame. Specify either this or the other parameters so that chunk size can be computed"
                                :audio.out/sample-rate "Sample rate of the output audio"
                                :audio.out/sample-size-bits "Size in bits for each sample"
                                :audio.out/channels "Number of channels. 1 or 2 (mono or stereo audio)"
                                :audio.out/duration-ms "Duration in ms of each chunk that will be streamed to output"}})
     :init (fn [{:audio.out/keys [chunk-size sample-rate sample-size-bits channels duration-ms]}]
             (assert (or chunk-size (and sample-rate sample-size-bits channels duration-ms))
                     "Either provide :audio.out/chunk-size or sample-rate, sample-size-bits, channels and chunk duration for the size to be computed")
             {:audio.out/chunk-size (or chunk-size (au/audio-chunk-size {:sample-rate sample-rate
                                                                         :sample-size-bits sample-size-bits
                                                                         :channels channels
                                                                         :duration-ms duration-ms}))})
     :transform (fn [{:audio.out/keys [chunk-size] :as state} _ frame]
                  (cond
                    (frame/audio-output-raw? frame)
                    ;; Slice the audio by offset instead of re-copying the
                    ;; remainder on every iteration (O(n) instead of O(n²)).
                    ;; Empty audio produces no output frames.
                    (let [audio ^bytes (:frame/data frame)
                          total (alength audio)]
                      (loop [offset 0
                             chunks []]
                        (if (< offset total)
                          (let [end (min (+ offset (long chunk-size)) total)
                                chunk (java.util.Arrays/copyOfRange audio offset end)]
                            (recur end (conj chunks (frame/audio-output-raw chunk))))
                          [state {:out chunks}])))

                    :else [state]))}))
| 325 | + |
(def realtime-transport-out-processor
  "Processor that streams audio out in real time so we can account for
  interruptions. Audio frames are serialized (when a serializer is set) and
  handed to a dedicated pacing loop that pushes them to :transport/out-chan
  at half-chunk-duration intervals."
  (flow/process
    {:describe (fn [] {:ins {:in "Channel for audio output frames "}
                       :outs {:out "Channel on which serialized buffered output is put"}
                       ;; NOTE(review): :params belongs inside the :describe
                       ;; return map (as in the other processors in this file);
                       ;; a top-level :params key is ignored by flow/process.
                       :params {:transport/out-chan "Channel on which to put buffered serialized audio"
                                :audio.out/duration-ms "Duration of each audio chunk. Defaults to 20ms"
                                :transport/supports-interrupt? "Whether the processor supports interrupt or not"}})
     ;; Close the pacing loop's port on stop. A transition fn must return the
     ;; (possibly updated) state — the other processors in this file do the
     ;; same; returning the nil result of `when` would wipe the state.
     :transition (fn [{::flow/keys [in-ports out-ports] :as state} transition]
                   (when (= transition ::flow/stop)
                     (doseq [port (concat (vals in-ports) (vals out-ports))]
                       (a/close! port)))
                   state)
     :init (fn [{:audio.out/keys [duration-ms]
                 :transport/keys [out-chan]}]
             (assert out-chan "Required :transport/out-chan for sending output")
             (let [;; send every duration/2 ms (10ms by default) to account for network
                   duration (or duration-ms 20)
                   sending-interval (/ duration 2)
                   next-send-time (atom (mono-time))

                   audio-write-c (a/chan 1024)
                   ;; Blocking loop: waits until the scheduled send time, then
                   ;; forwards the next chunk. Exits when audio-write-c closes.
                   realtime-loop #(loop []
                                    (when-let [msg (a/<!! audio-write-c)]
                                      (let [now (mono-time)]
                                        (a/<!! (a/timeout (- @next-send-time now)))
                                        (a/put! out-chan msg)
                                        (reset! next-send-time (+ now sending-interval)))
                                      (recur)))]
               ;; Run the pacing loop on the :io executor, outside the flow graph.
               ((flow/futurize realtime-loop :exec :io))
               {::flow/out-ports {:audio-write audio-write-c}}))

     :transform (fn [{:transport/keys [serializer] :as state} _ msg]
                  (cond
                    ;; Raw audio: serialize if a serializer is configured and
                    ;; hand off to the pacing loop via the :audio-write port.
                    (frame/audio-output-raw? msg)
                    [state {:audio-write [(if serializer
                                            (tp/serialize-frame serializer msg)
                                            msg)]}]

                    ;; Config change frames may carry a new serializer
                    ;; (e.g. a twilio serializer once the stream sid is known).
                    (frame/system-config-change? msg)
                    (if-let [serializer (:transport/serializer (:frame/data msg))]
                      [(assoc state :transport/serializer serializer)]
                      [state])

                    :else [state]))}))
| 371 | + |
272 | 372 | (def gdef |
273 | 373 | {:procs |
274 | | - {:transport-in {:proc transport-in} |
| 374 | + {:transport-in {:proc twilio-transport-in} |
275 | 375 | :deepgram-transcriptor {:proc deepgram-processor |
276 | 376 | :args {:transcription/api-key (secret [:deepgram :api-key]) |
277 | 377 | :transcription/interim-results? true |
|
312 | 412 | :flow/language :en |
313 | 413 | :audio.out/encoding :ulaw |
314 | 414 | :audio.out/sample-rate 8000}} |
| 415 | + :audio-splitter {:proc audio-splitter |
| 416 | + :args {:audio.out/sample-rate 8000 |
| 417 | + :audio.out/sample-size-bits 8 |
| 418 | + :audio.out/channels 1 |
| 419 | + :audio.out/duration-ms 20}} |
| 420 | + :realtime-out {:proc realtime-transport-out-processor |
| 421 | + :args {:transport/out-chan (a/chan 1024)}} |
315 | 422 |
|
316 | 423 | :print-sink {:proc (flow/process |
317 | 424 | {:describe (fn [] {:ins {:in "Channel for receiving transcriptions"}}) |
|
332 | 439 | [[:llm :out] [:llm-sentence-assembler :in]] |
333 | 440 | [[:llm-sentence-assembler :out] [:tts :in]] |
334 | 441 |
|
335 | | - [[:tts :out] [:print-sink :in]]]}) |
| 442 | + [[:tts :out] [:audio-splitter :in]] |
| 443 | + [[:audio-splitter :out] [:realtime-out :in]]]}) |
336 | 444 |
|
337 | 445 | (comment |
338 | 446 | (datafy (:proc (:deepgram-transcriptor (:procs gdef)))) |
|
0 commit comments