diff --git a/README.md b/README.md index d1ccb8d..69b6090 100755 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ Several sample applications demonstrating different scenarios are available in t - **LocalFunctions** – showcases the local function calling feature. - **GetPaid** – extends local functions to simulate payment requests. - **GetStartedDI** – illustrates using the library with .NET Dependency Injection. +- **GetStartedSIP** – demonstrates how to create a SIP-to-OpenAI WebRTC gateway that receives SIP (VoIP) calls and bridges to OpenAI. - **ASP.NET Get Started** – ASP.NET application bridging a browser WebRTC client to OpenAI. - **ASP.NET Local Function** – ASP.NET application that builds on the Get Started example and adds a local function to tailor OpenAI responses. diff --git a/examples/GetStartedSIP/GetStartedSIP.csproj b/examples/GetStartedSIP/GetStartedSIP.csproj new file mode 100755 index 0000000..05314f8 --- /dev/null +++ b/examples/GetStartedSIP/GetStartedSIP.csproj @@ -0,0 +1,31 @@ + + + + Exe + net8.0 + 12.0 + enable + true + true + + + + true + + + + + + + + + + + + + + + + + + diff --git a/examples/GetStartedSIP/Program.cs b/examples/GetStartedSIP/Program.cs new file mode 100755 index 0000000..3f5d7ec --- /dev/null +++ b/examples/GetStartedSIP/Program.cs @@ -0,0 +1,319 @@ +//----------------------------------------------------------------------------- +// Filename: Program.cs +// +// Description: An example showing how to use SIPSorcery with OpenAI's WebRTC endpoint. +// This demo shows the concept of how you could bridge SIP calls to OpenAI, though +// a complete implementation would require additional SIP handling logic. +// +// Usage: +// set OPENAI_API_KEY=your_openai_key +// dotnet run +// +// Author(s): +// Aaron Clauson (aaron@sipsorcery.com) +// +// History: +// 09 Aug 2025 Aaron Clauson Created, Dublin, Ireland. 
//
// License:
// BSD 3-Clause "New" or "Revised" License and the additional
// BDS BY-NC-SA restriction, see included LICENSE.md file.
//-----------------------------------------------------------------------------

using Microsoft.Extensions.Logging;
using Serilog;
using Serilog.Extensions.Logging;
using SIPSorcery.Net;
using SIPSorcery.OpenAIWebRTC;
using SIPSorcery.OpenAIWebRTC.Models;
using SIPSorcery.SIP;
using SIPSorcery.SIP.App;
using SIPSorceryMedia.Abstractions;
using System;
using System.Collections.Concurrent;
using System.Linq;
using System.Net;
using System.Threading.Tasks;

namespace demo;

/// <summary>
/// Ties the SIP leg of a bridged call to its OpenAI WebRTC leg.
/// </summary>
/// <param name="ua">The SIP user agent that answered the call.</param>
/// <param name="voip">The RTP session carrying the SIP call's audio.</param>
/// <param name="webrtc">The OpenAI WebRTC end point; null until it has been created for the call.</param>
record SIPToOpenAiCall(SIPUserAgent ua, RTPSession voip, WebRTCEndPoint? webrtc);

class Program
{
    private const int SIP_LISTEN_PORT = 5060;

    /// <summary>
    /// Keeps track of the currently active calls, keyed by SIP Call-ID.
    /// </summary>
    private static readonly ConcurrentDictionary<string, SIPToOpenAiCall> _calls = new();

    /// <summary>
    /// Configures logging, starts listening for SIP requests on UDP port <see cref="SIP_LISTEN_PORT"/>
    /// and then waits for ctrl-c before shutting the SIP transport down.
    /// </summary>
    static async Task Main()
    {
        Log.Logger = new LoggerConfiguration()
            .MinimumLevel.Debug()
            .Enrich.FromLogContext()
            .WriteTo.Console()
            .CreateLogger();

        var loggerFactory = new SerilogLoggerFactory(Log.Logger);
        SIPSorcery.LogFactory.Set(loggerFactory);

        Log.Logger.Information("SIP-to-WebRTC OpenAI Demo Program");

        var openAiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY");

        if (string.IsNullOrWhiteSpace(openAiKey))
        {
            Log.Logger.Error("Please provide your OpenAI key as an environment variable. For example: set OPENAI_API_KEY=your_openai_key");
            return;
        }

        var sipTransport = new SIPTransport();
        sipTransport.EnableTraceLogs();
        sipTransport.AddSIPChannel(new SIPUDPChannel(new IPEndPoint(IPAddress.Any, SIP_LISTEN_PORT)));
        sipTransport.SIPTransportRequestReceived += (lep, rep, req) => OnRequest(lep, rep, req, sipTransport, openAiKey);

        Console.WriteLine("Wait for ctrl-c to indicate user exit.");

        // Continuations are forced onto the thread pool so the awaiting Main method does not
        // resume inline on the console's ctrl-c handler thread.
        var exitTcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
        Console.CancelKeyPress += (s, e) =>
        {
            // Cancel the default process termination so the clean up below gets a chance to run.
            e.Cancel = true;
            exitTcs.TrySetResult();
        };

        await exitTcs.Task;

        Log.Information("Exiting...");

        // Release the SIP listening socket and flush any buffered log events before the process ends.
        sipTransport.Shutdown();
        Log.CloseAndFlush();
    }

    /// <summary>
    /// Handles SIP requests that arrive outside of an established dialog. INVITE requests are
    /// answered and bridged to OpenAI; BYE, SUBSCRIBE, OPTIONS and REGISTER get a fixed response;
    /// all other methods are ignored.
    /// </summary>
    /// <param name="localSIPEndPoint">The local end point the request was received on.</param>
    /// <param name="remoteEndPoint">The remote end point the request came from.</param>
    /// <param name="sipRequest">The SIP request received.</param>
    /// <param name="sipTransport">The transport used to send responses and to create user agents.</param>
    /// <param name="openAiKey">The OpenAI API key used when bridging an answered call.</param>
    private static async Task OnRequest(SIPEndPoint localSIPEndPoint, SIPEndPoint remoteEndPoint, SIPRequest sipRequest, SIPTransport sipTransport, string openAiKey)
    {
        try
        {
            if (sipRequest.Header.From?.FromTag != null && sipRequest.Header.To?.ToTag != null)
            {
                // This is an in-dialog request that will be handled directly by a user agent instance.
                return;
            }

            switch (sipRequest.Method)
            {
                case SIPMethodsEnum.INVITE:
                    await AnswerIncomingCall(localSIPEndPoint, remoteEndPoint, sipRequest, sipTransport, openAiKey);
                    break;

                case SIPMethodsEnum.BYE:
                    // A BYE without both dialog tags cannot belong to a call tracked here.
                    await SendResponse(sipTransport, sipRequest, SIPResponseStatusCodesEnum.CallLegTransactionDoesNotExist);
                    break;

                case SIPMethodsEnum.SUBSCRIBE:
                    await SendResponse(sipTransport, sipRequest, SIPResponseStatusCodesEnum.MethodNotAllowed);
                    break;

                case SIPMethodsEnum.OPTIONS:
                case SIPMethodsEnum.REGISTER:
                    await SendResponse(sipTransport, sipRequest, SIPResponseStatusCodesEnum.Ok);
                    break;
            }
        }
        catch (Exception reqExcp)
        {
            // Pass the exception object as well so the stack trace is not lost.
            Log.Warning(reqExcp, $"Exception handling {sipRequest.Method}. {reqExcp.Message}");
        }
    }

    /// <summary>
    /// Sends a response with the given status code for a request.
    /// </summary>
    private static async Task SendResponse(SIPTransport sipTransport, SIPRequest sipRequest, SIPResponseStatusCodesEnum status)
    {
        SIPResponse response = SIPResponse.GetResponse(sipRequest, status, null);
        await sipTransport.SendResponseAsync(response);
    }

    /// <summary>
    /// Answers an incoming INVITE with a new user agent and RTP session and, if the call
    /// becomes active, records it in <see cref="_calls"/> and starts the OpenAI WebRTC session.
    /// </summary>
    private static async Task AnswerIncomingCall(SIPEndPoint localSIPEndPoint, SIPEndPoint remoteEndPoint, SIPRequest sipRequest, SIPTransport sipTransport, string openAiKey)
    {
        Log.Information($"Incoming call request: {localSIPEndPoint}<-{remoteEndPoint} {sipRequest.URI}.");

        SIPUserAgent ua = new SIPUserAgent(sipTransport, null);
        ua.OnCallHungup += OnHangup;
        ua.ServerCallCancelled += (cancelledUas, cancelReq) => Log.Debug("Incoming call cancelled by remote party.");
        ua.OnDtmfTone += (key, duration) => OnDtmfTone(ua, key, duration);
        ua.OnRtpEvent += (evt, hdr) => Log.Debug($"rtp event {evt.EventID}, duration {evt.Duration}, end of event {evt.EndOfEvent}, timestamp {hdr.Timestamp}, marker {hdr.MarkerBit}.");
        ua.ServerCallRingTimeout += (timedOutUas) =>
        {
            Log.Warning($"Incoming call timed out in {timedOutUas.ClientTransaction.TransactionState} state waiting for client ACK, terminating.");
            ua.Hangup();
        };

        var uas = ua.AcceptCall(sipRequest);
        var rtpSession = CreateRtpSession(ua);

        await ua.Answer(uas, rtpSession);

        if (!ua.IsCallActive)
        {
            return;
        }

        await rtpSession.Start();

        // Read the call ID once: the dialogue belongs to the user agent and this method
        // continues across awaits during which the call could end.
        string callId = ua.Dialogue.CallId;
        _calls.TryAdd(callId, new SIPToOpenAiCall(ua, rtpSession, null));

        Log.Information($"Call answered, call ID {callId}.");

        // Create a WebRTC session to OpenAI.
        await CreateOpenAIWebRTCSession(new SerilogLoggerFactory(Log.Logger), openAiKey, callId, rtpSession);
    }

    /// <summary>
    /// Example of how to create a basic RTP session object and hook up the event handlers.
    /// </summary>
    /// <param name="ua">The user agent the RTP session is being created for.</param>
    /// <returns>A new RTP session object.</returns>
    private static RTPSession CreateRtpSession(SIPUserAgent ua)
    {
        var rtpSession = new RTPSession(false, false, false);
        rtpSession.addTrack(new MediaStreamTrack(AudioCommonlyUsedFormats.OpusWebRTC));
        rtpSession.AcceptRtpFromAny = true;

        // To inspect raw RTP packets from the remote party, hook up the handler below.
        //rtpSession.OnRtpPacketReceived += (ep, type, rtp) => OnRtpPacketReceived(ua, ep, type, rtp);
        rtpSession.OnTimeout += (mediaType) =>
        {
            if (ua?.Dialogue != null)
            {
                Log.Warning($"RTP timeout on call with {ua.Dialogue.RemoteTarget}, hanging up.");
            }
            else
            {
                Log.Warning($"RTP timeout on incomplete call, closing RTP session.");
            }

            ua?.Hangup();
        };

        return rtpSession;
    }

    /// <summary>
    /// Creates the OpenAI WebRTC end point for an answered SIP call, records it against the call
    /// in <see cref="_calls"/> and starts the connection. Once the peer connection is up the
    /// call's RTP session is bridged to it and the conversation is triggered.
    /// </summary>
    /// <param name="loggerFactory">Factory used to create the logger passed to the WebRTC end point.</param>
    /// <param name="openAiKey">The OpenAI API key.</param>
    /// <param name="sipCallID">The Call-ID of the SIP call being bridged.</param>
    /// <param name="rtpSession">The RTP session of the SIP call.</param>
    private static async Task CreateOpenAIWebRTCSession(ILoggerFactory loggerFactory, string openAiKey, string sipCallID, RTPSession rtpSession)
    {
        var logger = loggerFactory.CreateLogger<Program>();
        var webrtcEndPoint = new WebRTCEndPoint(openAiKey, logger);

        // Record the end point against the call so OnHangup can close it. If the call is already
        // gone (hung up in the meantime) do not start a connection that nothing would ever close.
        if (!_calls.TryGetValue(sipCallID, out var existing) ||
            !_calls.TryUpdate(sipCallID, existing with { webrtc = webrtcEndPoint }, existing))
        {
            Log.Warning($"Call {sipCallID} is no longer active, not connecting to OpenAI.");
            return;
        }

        // Handlers are attached before starting the connection so that an event raised while
        // StartConnect is still being awaited cannot be missed.
        webrtcEndPoint.OnPeerConnectionConnected += () =>
        {
            Log.Logger.Information("WebRTC peer connection established.");

            webrtcEndPoint.ConnectRTPSession(rtpSession, AudioCommonlyUsedFormats.OpusWebRTC);

            var voice = RealtimeVoicesEnum.shimmer;

            // Optionally send a session update message to adjust the session parameters.
            var sessionUpdateResult = webrtcEndPoint.DataChannelMessenger.SendSessionUpdate(
                voice,
                "Keep it short.",
                transcriptionModel: TranscriptionModelEnum.Whisper1);

            if (sessionUpdateResult.IsLeft)
            {
                Log.Logger.Error($"Failed to send session update message: {sessionUpdateResult.LeftAsEnumerable().First()}");
            }

            // Trigger the conversation by sending a response create message.
            var result = webrtcEndPoint.DataChannelMessenger.SendResponseCreate(voice, "Say Hi!");
            if (result.IsLeft)
            {
                Log.Logger.Error($"Failed to send response create message: {result.LeftAsEnumerable().First()}");
            }
        };

        webrtcEndPoint.OnDataChannelMessage += (dc, message) =>
        {
            // Only session updates and completed transcripts are logged; every other server
            // event (including the incremental transcript deltas) is ignored.
            var log = message switch
            {
                RealtimeServerEventSessionUpdated sessionUpdated => $"Session updated: {sessionUpdated.ToJson()}",
                RealtimeServerEventConversationItemInputAudioTranscriptionCompleted inputTranscript => $"ME ✅: {inputTranscript.Transcript?.Trim()}",
                RealtimeServerEventResponseAudioTranscriptDone responseTranscript => $"AI ✅: {responseTranscript.Transcript?.Trim()}",
                _ => string.Empty
            };

            if (log.Length > 0)
            {
                Log.Information(log);
            }
        };

        var negotiateConnectResult = await webrtcEndPoint.StartConnect();

        if (negotiateConnectResult.IsLeft)
        {
            Log.Logger.Error($"Failed to negotiate connection to OpenAI Realtime WebRTC endpoint: {negotiateConnectResult.LeftAsEnumerable().First()}");

            // Without the OpenAI leg the caller would be left connected to silence, end the SIP call.
            existing.ua.Hangup();
        }
    }

    /// <summary>
    /// Event handler for receiving RTP packets. Not hooked up by default, see <see cref="CreateRtpSession"/>.
    /// </summary>
    /// <param name="ua">The SIP user agent associated with the RTP session.</param>
    /// <param name="remoteEp">The remote end point the packet was received from.</param>
    /// <param name="type">The media type of the RTP packet (audio or video).</param>
    /// <param name="rtpPacket">The RTP packet received from the remote party.</param>
    private static void OnRtpPacketReceived(SIPUserAgent ua, IPEndPoint remoteEp, SDPMediaTypesEnum type, RTPPacket rtpPacket)
    {
        // The raw audio data is available in rtpPacket.Payload.
        Log.Verbose($"OnRtpPacketReceived from {remoteEp}.");
    }

    /// <summary>
    /// Event handler for receiving a DTMF tone.
    /// </summary>
    /// <param name="ua">The user agent that received the DTMF tone.</param>
    /// <param name="key">The DTMF tone.</param>
    /// <param name="duration">The duration in milliseconds of the tone.</param>
    private static void OnDtmfTone(SIPUserAgent ua, byte key, int duration)
    {
        // Guard against the dialogue not being set on the user agent.
        string? callID = ua.Dialogue?.CallId;
        Log.Information($"Call {callID} received DTMF tone {key}, duration {duration}ms.");
    }

    /// <summary>
    /// Remove call from the active calls list and close both legs of the bridge.
    /// </summary>
    /// <param name="dialogue">The dialogue that was hungup.</param>
    private static void OnHangup(SIPDialogue dialogue)
    {
        if (dialogue == null)
        {
            return;
        }

        string callID = dialogue.CallId;

        // TryRemove is atomic, so a separate ContainsKey check is not needed.
        if (_calls.TryRemove(callID, out var call))
        {
            Log.Information($"Call {callID} removed.");

            // This app only uses each SIP user agent once so here the agent is
            // explicitly closed to prevent it responding to any new SIP requests.
            call.ua.Close();
            call.webrtc?.Close();
        }
    }
}
\ No newline at end of file diff --git a/examples/GetStartedSIP/README.md b/examples/GetStartedSIP/README.md new file mode 100755 index 0000000..0945676 --- /dev/null +++ b/examples/GetStartedSIP/README.md @@ -0,0 +1,99 @@ +# OpenAI WebRTC SIP Gateway Example + +This example demonstrates how to create a SIP-to-OpenAI WebRTC gateway that receives incoming SIP calls and bridges the audio to OpenAI's real-time API. The caller can have a voice conversation with OpenAI through their SIP client or phone. + +## Features + +- **SIP Server**: Listens for incoming SIP calls on UDP port 5060 +- **Audio Bridging**: Routes audio from SIP caller to OpenAI and responses back to caller +- **Real-time Conversation**: Enables natural voice conversations between SIP callers and OpenAI +- **Call Management**: Handles call setup, teardown, and proper resource cleanup +- **Transcription Logging**: Displays conversation transcripts in real-time + +## How it Works + +1. The application starts a SIP server listening on UDP port 5060 +2. When a SIP call is received, it's automatically answered +3. A WebRTC connection is established with OpenAI's real-time endpoint +4. Audio is bridged bidirectionally: + - Caller's voice → OpenAI (for processing and response generation) + - OpenAI's response → Caller (through the SIP call) +5. Conversation transcripts are logged to the console +6. 
When the caller hangs up, connections are properly cleaned up + +## Requirements + +- Windows OS (due to media dependencies) +- [.NET 8.0 SDK](https://dotnet.microsoft.com/en-us/download/dotnet/8.0) +- OpenAI API key with access to the Realtime API +- SIP client or softphone for testing +- Network access on UDP port 5060 + +## Usage + +1. **Set your OpenAI API key**: +```bash +set OPENAI_API_KEY=your_openai_key +``` + +2. **Run the application**: +```bash +dotnet run +``` + +3. **Test with a SIP call**: + - Use any SIP client (like [microSIP](https://www.microsip.org/) or a hardware SIP phone) + - Call: `sip:test@{your_computer_ip}:5060` + - Start speaking once the call connects + +## Example Session + +``` +[14:30:15 INF] SIP user agent listening on UDP:*:5060... +Waiting for incoming SIP calls... +To test, call sip:test@192.168.1.100:5060 + +[14:30:45 INF] Incoming SIP call from sip:user@192.168.1.50:5060 +[14:30:46 INF] SIP call answered, connecting to OpenAI... +[14:30:47 INF] OpenAI WebRTC peer connection established. +[14:30:48 INF] Audio bridge established between SIP call and OpenAI. +[14:30:49 INF] AI: Hello! How can I help you today? +[14:30:52 INF] CALLER: Hi, can you tell me about the weather? +[14:30:54 INF] AI: I'd be happy to help with weather information, but I don't have access to current weather data... +``` + +## Configuration + +The demo uses these default settings: +- **SIP Port**: UDP 5060 +- **OpenAI Voice**: shimmer +- **Instructions**: "Keep it short." +- **Transcription**: Enabled with Whisper-1 model + +## Network Requirements + +- Ensure UDP port 5060 is accessible for incoming SIP calls +- If testing from external networks, configure firewall/router appropriately +- For RTP audio, ensure UDP ports in the range 10000-20000 are accessible (default SIPSorcery range) + +## Testing + +### Local Testing +1. 
Install a softphone like [microSIP](https://www.microsip.org/) +2. Configure it to call `sip:test@127.0.0.1:5060` +3. Make the call and start speaking + +### Network Testing +1. Find your computer's IP address: `ipconfig` (Windows) or `ifconfig` (Linux/Mac) +2. Call `sip:test@{your_ip}:5060` from any SIP client on the network +3. Ensure firewall allows UDP 5060 and RTP ports + +## Limitations + +- The only supported audio codec is Opus. +- No authentication or call routing (accepts all incoming calls) +- Designed for demonstration purposes + +## License + +BSD 3-Clause "New" or "Revised" License and the additional BDS BY-NC-SA restriction. See `LICENSE.md` for details. \ No newline at end of file diff --git a/examples/SIPSorcery.OpenAI.WebRTC.Examples.sln b/examples/SIPSorcery.OpenAI.WebRTC.Examples.sln index 364ca97..2ae96c2 100755 --- a/examples/SIPSorcery.OpenAI.WebRTC.Examples.sln +++ b/examples/SIPSorcery.OpenAI.WebRTC.Examples.sln @@ -19,6 +19,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AspNetLocalFunction", "AspN EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GetStartedDI", "GetStartedDI\GetStartedDI.csproj", "{C0F1E5F2-826B-CD2D-581F-28D774181BD7}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GetStartedSIP", "GetStartedSIP\GetStartedSIP.csproj", "{C9C1C709-F445-D075-41F8-21F55FFD4654}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -57,6 +59,10 @@ Global {C0F1E5F2-826B-CD2D-581F-28D774181BD7}.Debug|Any CPU.Build.0 = Debug|Any CPU {C0F1E5F2-826B-CD2D-581F-28D774181BD7}.Release|Any CPU.ActiveCfg = Release|Any CPU {C0F1E5F2-826B-CD2D-581F-28D774181BD7}.Release|Any CPU.Build.0 = Release|Any CPU + {C9C1C709-F445-D075-41F8-21F55FFD4654}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C9C1C709-F445-D075-41F8-21F55FFD4654}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C9C1C709-F445-D075-41F8-21F55FFD4654}.Release|Any CPU.ActiveCfg = Release|Any CPU + 
{C9C1C709-F445-D075-41F8-21F55FFD4654}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/Extensions/WebRTCExtensions.cs b/src/Extensions/WebRTCExtensions.cs index 706ce57..edd1649 100755 --- a/src/Extensions/WebRTCExtensions.cs +++ b/src/Extensions/WebRTCExtensions.cs @@ -61,7 +61,7 @@ public static IServiceCollection AddOpenAIRealtimeWebRTC(this IServiceCollection } /// - /// Connects an audio endpoint to a WebRTC end point. The standard use case is to connect the audio from the OpeNAI end point + /// Connects an audio endpoint to a WebRTC end point. The standard use case is to connect the audio from the OpenAI end point /// to local audio devices (speakers and/or microphone). /// /// The WebRTC end point to connect. @@ -93,6 +93,43 @@ public static void ConnectAudioEndPoint(this IWebRTCEndPoint webRTCEndPoint, IAu }; } + /// + /// Connects an RTPSession to a WebRTC end point. The standard use case is to connect the audio packets from an RTP session established by a SIP call + /// to an OpenAI WebRTC endpoint. + /// + /// The WebRTC end point to connect. + /// The RTPSession to connect. + /// The audio format being used for both audio streams. 
+ public static void ConnectRTPSession(this IWebRTCEndPoint webRTCEndPoint, RTPSession rtpSession, AudioFormat audioFormat) + { + rtpSession.OnAudioFrameReceived += (frame) => SendAudioToWebRTCEndPoint(frame, webRTCEndPoint, audioFormat); + webRTCEndPoint.OnAudioFrameReceived += (frame) => SendAudioToRTPSession(frame, rtpSession, audioFormat); + + webRTCEndPoint.OnPeerConnectionFailed += () => + { + rtpSession.OnAudioFrameReceived -= (frame) => SendAudioToWebRTCEndPoint(frame, webRTCEndPoint, audioFormat); + webRTCEndPoint.OnAudioFrameReceived -= (frame) => SendAudioToRTPSession(frame, rtpSession, audioFormat); + }; + + webRTCEndPoint.OnPeerConnectionClosed += () => + { + rtpSession.OnAudioFrameReceived -= (frame) => SendAudioToWebRTCEndPoint(frame, webRTCEndPoint, audioFormat); + webRTCEndPoint.OnAudioFrameReceived -= (frame) => SendAudioToRTPSession(frame, rtpSession, audioFormat); + }; + } + + private static void SendAudioToWebRTCEndPoint(EncodedAudioFrame frame, IWebRTCEndPoint webRTCEndPoint, AudioFormat audioFormat) + { + webRTCEndPoint.SendAudio( + RtpTimestampExtensions.ToRtpUnits(frame.DurationMilliSeconds, audioFormat.RtpClockRate), frame.EncodedAudio); + } + + private static void SendAudioToRTPSession(EncodedAudioFrame frame, RTPSession rtpSession, AudioFormat audioFormat) + { + rtpSession.SendAudio( + RtpTimestampExtensions.ToRtpUnits(frame.DurationMilliSeconds, audioFormat.RtpClockRate), frame.EncodedAudio); + } + /// /// Pipes encoded audio frames from the to the /// by directly forwarding RTP audio packets.