diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index eb8c16d1f..edd0d898b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -84,7 +84,5 @@ jobs:
           enable-cache: true
       - name: Install dependencies
         run: make sync
-      - name: Install Python 3.9 dependencies
-        run: UV_PROJECT_ENVIRONMENT=.venv_39 uv sync --all-extras --all-packages --group dev
       - name: Run tests
         run: make old_version_tests
diff --git a/.gitignore b/.gitignore
index 2e9b92379..c0c4b3254 100644
--- a/.gitignore
+++ b/.gitignore
@@ -100,7 +100,8 @@ celerybeat.pid
 *.sage.py
 
 # Environments
-.env
+.python-version
+.env*
 .venv
 env/
 venv/
diff --git a/Makefile b/Makefile
index 470d97c14..506f198a9 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,8 @@ snapshots-create:
 	uv run pytest --inline-snapshot=create 
 
 .PHONY: old_version_tests
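+# Provision the Python 3.9 venv here so CI no longer needs a separate install step.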
-old_version_tests: 
+old_version_tests:
+	UV_PROJECT_ENVIRONMENT=.venv_39 uv sync --python 3.9 --all-extras --all-packages --group dev
 	UV_PROJECT_ENVIRONMENT=.venv_39 uv run --python 3.9 -m pytest
 
 .PHONY: build-docs
diff --git a/docs/realtime/guide.md b/docs/realtime/guide.md
index b3cc6d982..3e36a6b1f 100644
--- a/docs/realtime/guide.md
+++ b/docs/realtime/guide.md
@@ -48,7 +48,7 @@ Key differences from regular agents:
 
 ### Model settings
 
-The session configuration allows you to control the underlying realtime model behavior. You can configure the model name (such as `gpt-4o-realtime-preview`), voice selection (alloy, echo, fable, onyx, nova, shimmer), and supported modalities (text and/or audio). Audio formats can be set for both input and output, with PCM16 being the default.
+The session configuration allows you to control the underlying realtime model behavior. You can configure the model name (such as `gpt-realtime`), voice selection (alloy, echo, fable, onyx, nova, shimmer), and supported modalities (text and/or audio). Audio formats can be set for both input and output, with PCM16 being the default.
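+
+For example, a minimal configuration (mirroring the quickstart) looks like this:
+
+```python
+config = {
+    "model_settings": {
+        "model_name": "gpt-realtime",
+        "voice": "alloy",
+        "modalities": ["text", "audio"],
+    }
+}
+```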
 
 ### Audio configuration
 
diff --git a/docs/realtime/quickstart.md b/docs/realtime/quickstart.md
index 2cee550ea..bb50b0642 100644
--- a/docs/realtime/quickstart.md
+++ b/docs/realtime/quickstart.md
@@ -44,7 +44,7 @@ runner = RealtimeRunner(
     starting_agent=agent,
     config={
         "model_settings": {
-            "model_name": "gpt-4o-realtime-preview",
+            "model_name": "gpt-realtime",
             "voice": "alloy",
             "modalities": ["text", "audio"],
         }
@@ -95,7 +95,7 @@ async def main():
         starting_agent=agent,
         config={
             "model_settings": {
-                "model_name": "gpt-4o-realtime-preview",
+                "model_name": "gpt-realtime",
                 "voice": "alloy",
                 "modalities": ["text", "audio"],
                 "input_audio_transcription": {
@@ -135,7 +135,7 @@ if __name__ == "__main__":
 
 ### Model settings
 
--   `model_name`: Choose from available realtime models (e.g., `gpt-4o-realtime-preview`)
+-   `model_name`: Choose from available realtime models (e.g., `gpt-realtime`)
 -   `voice`: Select voice (`alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`)
 -   `modalities`: Enable text and/or audio (`["text", "audio"]`)
 
diff --git a/examples/realtime/app/README.md b/examples/realtime/app/README.md
index cb5519a79..420134bba 100644
--- a/examples/realtime/app/README.md
+++ b/examples/realtime/app/README.md
@@ -29,14 +29,19 @@ To use the same UI with your own agents, edit `agent.py` and ensure get_starting
 1. Click **Connect** to establish a realtime session
 2. Audio capture starts automatically - just speak naturally
 3. Click the **Mic On/Off** button to mute/unmute your microphone
-4. Watch the conversation unfold in the left pane
-5. Monitor raw events in the right pane (click to expand/collapse)
-6. Click **Disconnect** when done
+4. To send an image, enter an optional prompt, click **🖼️ Send Image**, and choose a file
+5. Watch the conversation unfold in the left pane (image thumbnails are shown)
+6. Monitor raw events in the right pane (click to expand/collapse)
+7. Click **Disconnect** when done
 
 ## Architecture
 
 -   **Backend**: FastAPI server with WebSocket connections for real-time communication
 -   **Session Management**: Each connection gets a unique session with the OpenAI Realtime API
+-   **Image Inputs**: The UI uploads images (chunked over the WebSocket) and the
+    server forwards a `conversation.item.create` event with `input_image` (plus
+    optional `input_text`), followed by `response.create` to start the model
+    response. The messages pane renders image bubbles for `input_image` content;
+    see the sketch below.
 -   **Audio Processing**: 24kHz mono audio capture and playback
 -   **Event Handling**: Full event stream processing with transcript generation
 -   **Frontend**: Vanilla JavaScript with clean, responsive CSS
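+
+For reference, a sketch of the structured user message the server builds for an
+image turn in `server.py` (`data_url` and `prompt_text` come from the upload;
+chunked uploads are reassembled into `data_url` first):
+
+```python
+user_msg = {
+    "type": "message",
+    "role": "user",
+    "content": [
+        {"type": "input_image", "image_url": data_url, "detail": "high"},
+        {"type": "input_text", "text": prompt_text},
+    ],
+}
+```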
diff --git a/examples/realtime/app/server.py b/examples/realtime/app/server.py
index 26c544dd2..d4ff47e80 100644
--- a/examples/realtime/app/server.py
+++ b/examples/realtime/app/server.py
@@ -12,6 +12,8 @@
 from typing_extensions import assert_never
 
 from agents.realtime import RealtimeRunner, RealtimeSession, RealtimeSessionEvent
+from agents.realtime.config import RealtimeUserInputMessage
+from agents.realtime.model_inputs import RealtimeModelSendRawMessage
 
 # Import TwilioHandler class - handle both module and package use cases
 if TYPE_CHECKING:
@@ -64,6 +66,34 @@ async def send_audio(self, session_id: str, audio_bytes: bytes):
         if session_id in self.active_sessions:
             await self.active_sessions[session_id].send_audio(audio_bytes)
 
+    async def send_client_event(self, session_id: str, event: dict[str, Any]):
+        """Send a raw client event to the underlying realtime model."""
+        session = self.active_sessions.get(session_id)
+        if not session:
+            return
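+        # The event "type" is sent as-is; all remaining fields are passed
+        # through to the model via "other_data".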
+        await session.model.send_event(
+            RealtimeModelSendRawMessage(
+                message={
+                    "type": event["type"],
+                    "other_data": {k: v for k, v in event.items() if k != "type"},
+                }
+            )
+        )
+
+    async def send_user_message(self, session_id: str, message: RealtimeUserInputMessage):
+        """Send a structured user message via the higher-level API (supports input_image)."""
+        session = self.active_sessions.get(session_id)
+        if not session:
+            return
+        await session.send_message(message)  # delegates to RealtimeModelSendUserInput path
+
+    async def interrupt(self, session_id: str) -> None:
+        """Interrupt current model playback/response for a session."""
+        session = self.active_sessions.get(session_id)
+        if not session:
+            return
+        await session.interrupt()
+
     async def _process_events(self, session_id: str):
         try:
             session = self.active_sessions[session_id]
@@ -101,7 +131,11 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
         elif event.type == "history_updated":
             base_event["history"] = [item.model_dump(mode="json") for item in event.history]
         elif event.type == "history_added":
-            pass
+            # Provide the added item so the UI can render incrementally.
+            try:
+                base_event["item"] = event.item.model_dump(mode="json")
+            except Exception:
+                base_event["item"] = None
         elif event.type == "guardrail_tripped":
             base_event["guardrail_results"] = [
                 {"name": result.guardrail.name} for result in event.guardrail_results
@@ -134,6 +168,7 @@ async def lifespan(app: FastAPI):
 @app.websocket("/ws/{session_id}")
 async def websocket_endpoint(websocket: WebSocket, session_id: str):
     await manager.connect(websocket, session_id)
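+    # Chunked image uploads in flight, keyed by the client-chosen image id.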
+    image_buffers: dict[str, dict[str, Any]] = {}
     try:
         while True:
             data = await websocket.receive_text()
@@ -144,6 +179,124 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
                 int16_data = message["data"]
                 audio_bytes = struct.pack(f"{len(int16_data)}h", *int16_data)
                 await manager.send_audio(session_id, audio_bytes)
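+            # Single-message image upload; the bundled UI streams larger images
+            # through the chunked image_start/image_chunk/image_end flow below.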
+            elif message["type"] == "image":
+                logger.info("Received image message from client (session %s).", session_id)
+                # Build a structured user message with input_image and an input_text prompt
+                data_url = message.get("data_url")
+                prompt_text = message.get("text") or "Please describe this image."
+                if data_url:
+                    logger.info(
+                        "Forwarding image (structured message) to Realtime API (len=%d).",
+                        len(data_url),
+                    )
+                    # prompt_text always has a value (a default is set above), so
+                    # the message always carries both the image and a text prompt.
+                    user_msg: RealtimeUserInputMessage = {
+                        "type": "message",
+                        "role": "user",
+                        "content": [
+                            {"type": "input_image", "image_url": data_url, "detail": "high"},
+                            {"type": "input_text", "text": prompt_text},
+                        ],
+                    }
+                    await manager.send_user_message(session_id, user_msg)
+                    # Acknowledge to client UI
+                    await websocket.send_text(
+                        json.dumps(
+                            {
+                                "type": "client_info",
+                                "info": "image_enqueued",
+                                "size": len(data_url),
+                            }
+                        )
+                    )
+                else:
+                    await websocket.send_text(
+                        json.dumps(
+                            {
+                                "type": "error",
+                                "error": "No data_url for image message.",
+                            }
+                        )
+                    )
+            elif message["type"] == "commit_audio":
+                # Force close the current input audio turn
+                await manager.send_client_event(session_id, {"type": "input_audio_buffer.commit"})
+            elif message["type"] == "image_start":
+                img_id = str(message.get("id"))
+                image_buffers[img_id] = {
+                    "text": message.get("text") or "Please describe this image.",
+                    "chunks": [],
+                }
+                await websocket.send_text(
+                    json.dumps({"type": "client_info", "info": "image_start_ack", "id": img_id})
+                )
+            elif message["type"] == "image_chunk":
+                img_id = str(message.get("id"))
+                chunk = message.get("chunk", "")
+                if img_id in image_buffers:
+                    image_buffers[img_id]["chunks"].append(chunk)
+                    if len(image_buffers[img_id]["chunks"]) % 10 == 0:
+                        await websocket.send_text(
+                            json.dumps(
+                                {
+                                    "type": "client_info",
+                                    "info": "image_chunk_ack",
+                                    "id": img_id,
+                                    "count": len(image_buffers[img_id]["chunks"]),
+                                }
+                            )
+                        )
+            elif message["type"] == "image_end":
+                img_id = str(message.get("id"))
+                buf = image_buffers.pop(img_id, None)
+                if buf is None:
+                    await websocket.send_text(
+                        json.dumps({"type": "error", "error": "Unknown image id for image_end."})
+                    )
+                else:
+                    data_url = "".join(buf["chunks"]) if buf["chunks"] else None
+                    prompt_text = buf["text"]
+                    if data_url:
+                        logger.info(
+                            "Forwarding chunked image (structured message) to Realtime API (len=%d).",
+                            len(data_url),
+                        )
+                        # buf["text"] was given a default at image_start, so the
+                        # content always includes both the image and a prompt.
+                        user_msg2: RealtimeUserInputMessage = {
+                            "type": "message",
+                            "role": "user",
+                            "content": [
+                                {"type": "input_image", "image_url": data_url, "detail": "high"},
+                                {"type": "input_text", "text": prompt_text},
+                            ],
+                        }
+                        await manager.send_user_message(session_id, user_msg2)
+                        await websocket.send_text(
+                            json.dumps(
+                                {
+                                    "type": "client_info",
+                                    "info": "image_enqueued",
+                                    "id": img_id,
+                                    "size": len(data_url),
+                                }
+                            )
+                        )
+                    else:
+                        await websocket.send_text(
+                            json.dumps({"type": "error", "error": "Empty image."})
+                        )
+            elif message["type"] == "interrupt":
+                await manager.interrupt(session_id)
 
     except WebSocketDisconnect:
         await manager.disconnect(session_id)
@@ -160,4 +313,10 @@ async def read_index():
 if __name__ == "__main__":
     import uvicorn
 
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8000,
+        # Increased WebSocket frame size to comfortably handle image data URLs.
+        ws_max_size=16 * 1024 * 1024,
+    )
diff --git a/examples/realtime/app/static/app.js b/examples/realtime/app/static/app.js
index 3ec8fcc99..6858428c6 100644
--- a/examples/realtime/app/static/app.js
+++ b/examples/realtime/app/static/app.js
@@ -8,26 +8,33 @@ class RealtimeDemo {
         this.processor = null;
         this.stream = null;
         this.sessionId = this.generateSessionId();
-        
+
         // Audio playback queue
         this.audioQueue = [];
         this.isPlayingAudio = false;
         this.playbackAudioContext = null;
         this.currentAudioSource = null;
-        
+        this.currentAudioGain = null; // per-chunk gain for smooth fades
+        this.playbackFadeSec = 0.02; // ~20ms fade to reduce clicks
+        this.messageNodes = new Map(); // item_id -> DOM node
+        this.seenItemIds = new Set(); // item_id set for append-only syncing
+
         this.initializeElements();
         this.setupEventListeners();
     }
-    
+
     initializeElements() {
         this.connectBtn = document.getElementById('connectBtn');
         this.muteBtn = document.getElementById('muteBtn');
+        this.imageBtn = document.getElementById('imageBtn');
+        this.imageInput = document.getElementById('imageInput');
+        this.imagePrompt = document.getElementById('imagePrompt');
         this.status = document.getElementById('status');
         this.messagesContent = document.getElementById('messagesContent');
         this.eventsContent = document.getElementById('eventsContent');
         this.toolsContent = document.getElementById('toolsContent');
     }
-    
+
     setupEventListeners() {
         this.connectBtn.addEventListener('click', () => {
             if (this.isConnected) {
@@ -36,52 +43,99 @@ class RealtimeDemo {
                 this.connect();
             }
         });
-        
+
         this.muteBtn.addEventListener('click', () => {
             this.toggleMute();
         });
+
+        // Image upload
+        this.imageBtn.addEventListener('click', (e) => {
+            e.preventDefault();
+            e.stopPropagation();
+            console.log('Send Image clicked');
+            // Programmatically open the hidden file input
+            this.imageInput.click();
+        });
+
+        this.imageInput.addEventListener('change', async (e) => {
+            console.log('Image input change fired');
+            const file = e.target.files && e.target.files[0];
+            if (!file) return;
+            await this._handlePickedFile(file);
+            this.imageInput.value = '';
+        });
+
+        this._handlePickedFile = async (file) => {
+            try {
+                const dataUrl = await this.prepareDataURL(file);
+                const promptText = (this.imagePrompt && this.imagePrompt.value) || '';
+                // Send to server; server forwards to Realtime API.
+                // Use chunked frames to avoid WS frame limits.
+                if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+                    console.log('Interrupting and sending image (chunked) to server WebSocket');
+                    // Stop any current audio locally and tell model to interrupt
+                    this.stopAudioPlayback();
+                    this.ws.send(JSON.stringify({ type: 'interrupt' }));
+                    const id = 'img_' + Math.random().toString(36).slice(2);
+                    const CHUNK = 60_000; // ~60KB per frame
+                    this.ws.send(JSON.stringify({ type: 'image_start', id, text: promptText }));
+                    for (let i = 0; i < dataUrl.length; i += CHUNK) {
+                        const chunk = dataUrl.slice(i, i + CHUNK);
+                        this.ws.send(JSON.stringify({ type: 'image_chunk', id, chunk }));
+                    }
+                    this.ws.send(JSON.stringify({ type: 'image_end', id }));
+                } else {
+                    console.warn('Not connected; image will not be sent. Click Connect first.');
+                }
+                // Add to UI immediately for better feedback
+                console.log('Adding local user image bubble');
+                this.addUserImageMessage(dataUrl, promptText);
+            } catch (err) {
+                console.error('Failed to process image:', err);
+            }
+        };
     }
-    
+
     generateSessionId() {
         return 'session_' + Math.random().toString(36).substr(2, 9);
     }
-    
+
     async connect() {
         try {
             this.ws = new WebSocket(`ws://localhost:8000/ws/${this.sessionId}`);
-            
+
             this.ws.onopen = () => {
                 this.isConnected = true;
                 this.updateConnectionUI();
                 this.startContinuousCapture();
             };
-            
+
             this.ws.onmessage = (event) => {
                 const data = JSON.parse(event.data);
                 this.handleRealtimeEvent(data);
             };
-            
+
             this.ws.onclose = () => {
                 this.isConnected = false;
                 this.updateConnectionUI();
             };
-            
+
             this.ws.onerror = (error) => {
                 console.error('WebSocket error:', error);
             };
-            
+
         } catch (error) {
             console.error('Failed to connect:', error);
         }
     }
-    
+
     disconnect() {
         if (this.ws) {
             this.ws.close();
         }
         this.stopContinuousCapture();
     }
-    
+
     updateConnectionUI() {
         if (this.isConnected) {
             this.connectBtn.textContent = 'Disconnect';
@@ -97,12 +151,12 @@ class RealtimeDemo {
             this.muteBtn.disabled = true;
         }
     }
-    
+
     toggleMute() {
         this.isMuted = !this.isMuted;
         this.updateMuteUI();
     }
-    
+
     updateMuteUI() {
         if (this.isMuted) {
             this.muteBtn.textContent = '🔇 Mic Off';
@@ -115,90 +169,128 @@ class RealtimeDemo {
             }
         }
     }
-    
+
+    readFileAsDataURL(file) {
+        return new Promise((resolve, reject) => {
+            const reader = new FileReader();
+            reader.onload = () => resolve(reader.result);
+            reader.onerror = reject;
+            reader.readAsDataURL(file);
+        });
+    }
+
+    async prepareDataURL(file) {
+        const original = await this.readFileAsDataURL(file);
+        try {
+            const img = new Image();
+            img.decoding = 'async';
+            const loaded = new Promise((res, rej) => {
+                img.onload = () => res();
+                img.onerror = rej;
+            });
+            img.src = original;
+            await loaded;
+
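+            // Downscale so the longest side is at most maxDim, preserving aspect ratio.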
+            const maxDim = 1024;
+            const maxSide = Math.max(img.width, img.height);
+            const scale = maxSide > maxDim ? (maxDim / maxSide) : 1;
+            const w = Math.max(1, Math.round(img.width * scale));
+            const h = Math.max(1, Math.round(img.height * scale));
+
+            const canvas = document.createElement('canvas');
+            canvas.width = w; canvas.height = h;
+            const ctx = canvas.getContext('2d');
+            ctx.drawImage(img, 0, 0, w, h);
+            return canvas.toDataURL('image/jpeg', 0.85);
+        } catch (e) {
+            console.warn('Image resize failed; sending original', e);
+            return original;
+        }
+    }
+
     async startContinuousCapture() {
         if (!this.isConnected || this.isCapturing) return;
-        
+
         // Check if getUserMedia is available
         if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
             throw new Error('getUserMedia not available. Please use HTTPS or localhost.');
         }
-        
+
         try {
-            this.stream = await navigator.mediaDevices.getUserMedia({ 
+            this.stream = await navigator.mediaDevices.getUserMedia({
                 audio: {
                     sampleRate: 24000,
                     channelCount: 1,
                     echoCancellation: true,
                     noiseSuppression: true
-                } 
+                }
             });
-            
-            this.audioContext = new AudioContext({ sampleRate: 24000 });
+
+            this.audioContext = new AudioContext({ sampleRate: 24000, latencyHint: 'interactive' });
             const source = this.audioContext.createMediaStreamSource(this.stream);
-            
+
             // Create a script processor to capture audio data
             this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
             source.connect(this.processor);
             this.processor.connect(this.audioContext.destination);
-            
+
             this.processor.onaudioprocess = (event) => {
                 if (!this.isMuted && this.ws && this.ws.readyState === WebSocket.OPEN) {
                     const inputBuffer = event.inputBuffer.getChannelData(0);
                     const int16Buffer = new Int16Array(inputBuffer.length);
-                    
+
                     // Convert float32 to int16
                     for (let i = 0; i < inputBuffer.length; i++) {
                         int16Buffer[i] = Math.max(-32768, Math.min(32767, inputBuffer[i] * 32768));
                     }
-                    
+
                     this.ws.send(JSON.stringify({
                         type: 'audio',
                         data: Array.from(int16Buffer)
                     }));
                 }
             };
-            
+
             this.isCapturing = true;
             this.updateMuteUI();
-            
+
         } catch (error) {
             console.error('Failed to start audio capture:', error);
         }
     }
-    
+
     stopContinuousCapture() {
         if (!this.isCapturing) return;
-        
+
         this.isCapturing = false;
-        
+
         if (this.processor) {
             this.processor.disconnect();
             this.processor = null;
         }
-        
+
         if (this.audioContext) {
             this.audioContext.close();
             this.audioContext = null;
         }
-        
+
         if (this.stream) {
             this.stream.getTracks().forEach(track => track.stop());
             this.stream = null;
         }
-        
+
         this.updateMuteUI();
     }
-    
+
     handleRealtimeEvent(event) {
         // Add to raw events pane
         this.addRawEvent(event);
-        
+
         // Add to tools panel if it's a tool or handoff event
         if (event.type === 'tool_start' || event.type === 'tool_end' || event.type === 'handoff') {
             this.addToolEvent(event);
         }
-        
+
         // Handle specific event types
         switch (event.type) {
             case 'audio':
@@ -207,115 +299,214 @@ class RealtimeDemo {
             case 'audio_interrupted':
                 this.stopAudioPlayback();
                 break;
+            case 'input_audio_timeout_triggered':
+                // Ask server to commit the input buffer to expedite model response
+                if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+                    this.ws.send(JSON.stringify({ type: 'commit_audio' }));
+                }
+                break;
             case 'history_updated':
-                this.updateMessagesFromHistory(event.history);
+                this.syncMissingFromHistory(event.history);
+                this.updateLastMessageFromHistory(event.history);
+                break;
+            case 'history_added':
+                // Append just the new item without clearing the thread.
+                if (event.item) {
+                    this.addMessageFromItem(event.item);
+                }
                 break;
         }
     }
-    
-    
-    updateMessagesFromHistory(history) {
-        console.log('updateMessagesFromHistory called with:', history);
-        
-        // Clear all existing messages
-        this.messagesContent.innerHTML = '';
-        
-        // Add messages from history
-        if (history && Array.isArray(history)) {
-            console.log('Processing history array with', history.length, 'items');
-            history.forEach((item, index) => {
-                console.log(`History item ${index}:`, item);
-                if (item.type === 'message') {
-                    const role = item.role;
-                    let content = '';
-                    
-                    console.log(`Message item - role: ${role}, content:`, item.content);
-                    
-                    if (item.content && Array.isArray(item.content)) {
-                        // Extract text from content array
-                        item.content.forEach(contentPart => {
-                            console.log('Content part:', contentPart);
-                            if (contentPart.type === 'text' && contentPart.text) {
-                                content += contentPart.text;
-                            } else if (contentPart.type === 'input_text' && contentPart.text) {
-                                content += contentPart.text;
-                            } else if (contentPart.type === 'input_audio' && contentPart.transcript) {
-                                content += contentPart.transcript;
-                            } else if (contentPart.type === 'audio' && contentPart.transcript) {
-                                content += contentPart.transcript;
-                            }
-                        });
-                    }
-                    
-                    console.log(`Final content for ${role}:`, content);
-                    
-                    if (content.trim()) {
-                        this.addMessage(role, content.trim());
-                        console.log(`Added message: ${role} - ${content.trim()}`);
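+
+    // Update the newest message bubble in place as transcript deltas stream in.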
+    updateLastMessageFromHistory(history) {
+        if (!history || !Array.isArray(history) || history.length === 0) return;
+        // Find the last message item in history
+        let last = null;
+        for (let i = history.length - 1; i >= 0; i--) {
+            const it = history[i];
+            if (it && it.type === 'message') { last = it; break; }
+        }
+        if (!last) return;
+        const itemId = last.item_id;
+
+        // Extract a text representation (for assistant transcript updates)
+        let text = '';
+        if (Array.isArray(last.content)) {
+            for (const part of last.content) {
+                if (!part || typeof part !== 'object') continue;
+                if (part.type === 'text' && part.text) text += part.text;
+                else if (part.type === 'input_text' && part.text) text += part.text;
+                else if ((part.type === 'input_audio' || part.type === 'audio') && part.transcript) text += part.transcript;
+            }
+        }
+
+        const node = this.messageNodes.get(itemId);
+        if (!node) {
+            // If we haven't rendered this item yet, append it now.
+            this.addMessageFromItem(last);
+            return;
+        }
+
+        // Update only the text content of the bubble, preserving any images already present.
+        const bubble = node.querySelector('.message-bubble');
+        if (bubble && text && text.trim()) {
+            // If there's an , keep it and only update the trailing caption/text node.
+            const hasImg = !!bubble.querySelector('img');
+            if (hasImg) {
+                // Ensure there is a caption div after the image
+                let cap = bubble.querySelector('.image-caption');
+                if (!cap) {
+                    cap = document.createElement('div');
+                    cap.className = 'image-caption';
+                    cap.style.marginTop = '0.5rem';
+                    bubble.appendChild(cap);
+                }
+                cap.textContent = text.trim();
+            } else {
+                bubble.textContent = text.trim();
+            }
+            this.scrollToBottom();
+        }
+    }
+
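+    // Append-only sync: render any history message items not yet shown in the thread.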
+    syncMissingFromHistory(history) {
+        if (!history || !Array.isArray(history)) return;
+        for (const item of history) {
+            if (!item || item.type !== 'message') continue;
+            const id = item.item_id;
+            if (!id) continue;
+            if (!this.seenItemIds.has(id)) {
+                this.addMessageFromItem(item);
+            }
+        }
+    }
+
+    addMessageFromItem(item) {
+        try {
+            if (!item || item.type !== 'message') return;
+            const role = item.role;
+            let content = '';
+            let imageUrls = [];
+
+            if (Array.isArray(item.content)) {
+                for (const contentPart of item.content) {
+                    if (!contentPart || typeof contentPart !== 'object') continue;
+                    if (contentPart.type === 'text' && contentPart.text) {
+                        content += contentPart.text;
+                    } else if (contentPart.type === 'input_text' && contentPart.text) {
+                        content += contentPart.text;
+                    } else if (contentPart.type === 'input_audio' && contentPart.transcript) {
+                        content += contentPart.transcript;
+                    } else if (contentPart.type === 'audio' && contentPart.transcript) {
+                        content += contentPart.transcript;
+                    } else if (contentPart.type === 'input_image') {
+                        const url = contentPart.image_url || contentPart.url;
+                        if (typeof url === 'string' && url) imageUrls.push(url);
                     }
-                } else {
-                    console.log(`Skipping non-message item of type: ${item.type}`);
                 }
-            });
-        } else {
-            console.log('History is not an array or is null/undefined');
+            }
+
+            let node = null;
+            if (imageUrls.length > 0) {
+                for (const url of imageUrls) {
+                    node = this.addImageMessage(role, url, content.trim());
+                }
+            } else if (content && content.trim()) {
+                node = this.addMessage(role, content.trim());
+            }
+            if (node && item.item_id) {
+                this.messageNodes.set(item.item_id, node);
+                this.seenItemIds.add(item.item_id);
+            }
+        } catch (e) {
+            console.error('Failed to add message from item:', e, item);
         }
-        
-        this.scrollToBottom();
     }
-    
+
     addMessage(type, content) {
         const messageDiv = document.createElement('div');
         messageDiv.className = `message ${type}`;
-        
+
         const bubbleDiv = document.createElement('div');
         bubbleDiv.className = 'message-bubble';
         bubbleDiv.textContent = content;
-        
+
         messageDiv.appendChild(bubbleDiv);
         this.messagesContent.appendChild(messageDiv);
         this.scrollToBottom();
-        
+
         return messageDiv;
     }
-    
+
+    addImageMessage(role, imageUrl, caption = '') {
+        const messageDiv = document.createElement('div');
+        messageDiv.className = `message ${role}`;
+
+        const bubbleDiv = document.createElement('div');
+        bubbleDiv.className = 'message-bubble';
+
+        const img = document.createElement('img');
+        img.src = imageUrl;
+        img.alt = 'Uploaded image';
+        img.style.maxWidth = '220px';
+        img.style.borderRadius = '8px';
+        img.style.display = 'block';
+
+        bubbleDiv.appendChild(img);
+        if (caption) {
+            const cap = document.createElement('div');
+            cap.textContent = caption;
+            cap.style.marginTop = '0.5rem';
+            bubbleDiv.appendChild(cap);
+        }
+
+        messageDiv.appendChild(bubbleDiv);
+        this.messagesContent.appendChild(messageDiv);
+        this.scrollToBottom();
+
+        return messageDiv;
+    }
+
+    addUserImageMessage(imageUrl, caption = '') {
+        return this.addImageMessage('user', imageUrl, caption);
+    }
+
     addRawEvent(event) {
         const eventDiv = document.createElement('div');
         eventDiv.className = 'event';
-        
+
         const headerDiv = document.createElement('div');
         headerDiv.className = 'event-header';
         headerDiv.innerHTML = `
             <span>${event.type}</span>
             <span>▼</span>
         `;
-        
+
         const contentDiv = document.createElement('div');
         contentDiv.className = 'event-content collapsed';
         contentDiv.textContent = JSON.stringify(event, null, 2);
-        
+
         headerDiv.addEventListener('click', () => {
             const isCollapsed = contentDiv.classList.contains('collapsed');
             contentDiv.classList.toggle('collapsed');
             headerDiv.querySelector('span:last-child').textContent = isCollapsed ? '▲' : '▼';
         });
-        
+
         eventDiv.appendChild(headerDiv);
         eventDiv.appendChild(contentDiv);
         this.eventsContent.appendChild(eventDiv);
-        
+
         // Auto-scroll events pane
         this.eventsContent.scrollTop = this.eventsContent.scrollHeight;
     }
-    
+
     addToolEvent(event) {
         const eventDiv = document.createElement('div');
         eventDiv.className = 'event';
-        
+
         let title = '';
         let description = '';
         let eventClass = '';
-        
+
         if (event.type === 'handoff') {
             title = `🔄 Handoff`;
             description = `From ${event.from} to ${event.to}`;
@@ -329,7 +520,7 @@ class RealtimeDemo {
             description = `${event.tool}: ${event.output || 'No output'}`;
             eventClass = 'tool';
         }
-        
+
         eventDiv.innerHTML = `