Skip to content

Commit b824ac5

Browse files
committed
Add support for GPT-4 Vision
1 parent 058a5a7 commit b824ac5

File tree

11 files changed

+542
-38
lines changed

11 files changed

+542
-38
lines changed

OpenAI_API/Chat/ChatEndpoint.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,23 @@ public Task<ChatResult> CreateChatCompletionAsync(params ChatMessage[] messages)
125125
/// <returns>The <see cref="ChatResult"/> with the API response.</returns>
126126
public Task<ChatResult> CreateChatCompletionAsync(params string[] userMessages) => CreateChatCompletionAsync(userMessages.Select(m => new ChatMessage(ChatMessageRole.User, m)).ToArray());
127127

128+
129+
/// <summary>
130+
/// Ask the API to complete the request using the specified message and image(s). Any parameters will fall back to default values specified in <see cref="DefaultChatRequestArgs"/> if present, except for <see cref="ChatRequest.Model"/>, which will default to <see cref="Model.GPT4_Vision"/>.
131+
/// </summary>
132+
/// <param name="userMessage">The user message text to use in the generation.</param>
133+
/// <param name="images">The images to use in the generation.</param>
134+
/// <returns>The <see cref="ChatResult"/> with the API response.</returns>
135+
public Task<ChatResult> CreateChatCompletionAsync(string userMessage, params ChatMessage.ImageInput[] images)
136+
{
137+
ChatRequest request = new ChatRequest(DefaultChatRequestArgs)
138+
{
139+
Model = Model.GPT4_Vision,
140+
Messages = new ChatMessage[] { new ChatMessage(ChatMessageRole.User, userMessage, images) },
141+
};
142+
return CreateChatCompletionAsync(request);
143+
}
144+
128145
#endregion
129146

130147
#region Streaming

OpenAI_API/Chat/ChatMessage.cs

Lines changed: 280 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Linq;
23
using System;
34
using System.Collections.Generic;
5+
using System.IO;
46
using System.Text;
7+
using static System.Net.WebRequestMethods;
58

69
namespace OpenAI_API.Chat
710
{
@@ -22,11 +25,24 @@ public ChatMessage()
2225
/// Constructor for a new Chat Message
2326
/// </summary>
2427
/// <param name="role">The role of the message, which can be "system", "assistant" or "user"</param>
25-
/// <param name="content">The text to send in the message</param>
26-
public ChatMessage(ChatMessageRole role, string content)
28+
/// <param name="text">The text to send in the message</param>
29+
public ChatMessage(ChatMessageRole role, string text)
2730
{
2831
this.Role = role;
29-
this.Content = content;
32+
this.TextContent = text;
33+
}
34+
35+
/// <summary>
36+
/// Constructor for a new Chat Message with text and one or more images
37+
/// </summary>
38+
/// <param name="role">The role of the message, which can be "system", "assistant" or "user"</param>
39+
/// <param name="text">The text to send in the message. May be null if only sending image(s).</param>
40+
/// <param name="imageInputs">Optionally add one or more images to the message if using a GPT Vision model. Consider using <see cref="ImageInput.FromFile(string, string)"/> to load an image from a local file, or <see cref="ImageInput.FromImageUrl(string, string)"/> to point to an image via URL. Please see <seealso href="https://platform.openai.com/docs/guides/vision"/> for more information and limitations.</param>
41+
public ChatMessage(ChatMessageRole role, string text, params ImageInput[] imageInputs)
42+
{
43+
this.Role = role;
44+
this.TextContent = text;
45+
this.Images.AddRange(imageInputs);
3046
}
3147

3248
[JsonProperty("role")]
@@ -49,15 +65,274 @@ public ChatMessageRole Role
4965
}
5066

5167
/// <summary>
52-
/// The content of the message
68+
/// The text content of the message.
69+
/// </summary>
70+
[JsonIgnore]
71+
public string TextContent { get; set; }
72+
73+
/// <summary>
74+
/// To support multi-modal messages, this property has been renamed to <see cref="TextContent"/>. Please use that instead.
75+
/// </summary>
76+
[Obsolete("This property has been renamed to TextContent.")]
77+
[JsonIgnore]
78+
public string Content { get => TextContent; set => TextContent = value; }
79+
80+
/// <summary>
81+
/// This is only used for serializing the request into JSON, do not use it directly.
5382
/// </summary>
5483
[JsonProperty("content")]
55-
public string Content { get; set; }
84+
[JsonConverter(typeof(ContentDataConverter))]
85+
internal IList<ContentItem> ContentItems
86+
{
87+
get
88+
{
89+
List<ContentItem> items = new List<ContentItem>();
90+
if (!string.IsNullOrEmpty(TextContent))
91+
{
92+
items.Add(new ContentItem(TextContent));
93+
}
94+
if (Images != null && Images.Count > 0)
95+
{
96+
foreach (var image in Images)
97+
{
98+
items.Add(new ContentItem(image));
99+
}
100+
}
101+
102+
return items;
103+
}
104+
set
105+
{
106+
foreach (var item in value)
107+
{
108+
if (item.Type == "text")
109+
{
110+
TextContent = item.Text;
111+
}
112+
else if (item.Type == "image_url")
113+
{
114+
Images.Add(item.Image);
115+
}
116+
}
117+
}
118+
}
56119

57120
/// <summary>
58121
/// An optional name of the user in a multi-user chat
59122
/// </summary>
60123
[JsonProperty("name")]
61124
public string Name { get; set; }
125+
126+
/// <summary>
127+
/// Optionally add one or more images to the message if using a GPT Vision model. Please see <seealso href="https://platform.openai.com/docs/guides/vision"/> for more information and limitations.
128+
/// </summary>
129+
[JsonIgnore]
130+
public List<ImageInput> Images { get; set; } = new List<ImageInput>();
131+
132+
/// <summary>
133+
/// This is a helper class to serialize the content of the message to JSON
134+
/// </summary>
135+
internal class ContentItem
136+
{
137+
private string text;
138+
private ImageInput image;
139+
140+
/// <summary>
141+
/// The type of content to send to the API. This can be "text" or "image_url".
142+
/// </summary>
143+
[JsonProperty("type")]
144+
public string Type { get; set; } = "text";
145+
146+
/// <summary>
147+
/// Sends text to the API. This is the default type.
148+
/// </summary>
149+
[JsonProperty("text")]
150+
public string Text
151+
{
152+
get
153+
{
154+
if (Type == "text")
155+
return text;
156+
else
157+
return null;
158+
}
159+
160+
set
161+
{
162+
text = value;
163+
image = null;
164+
Type = "text";
165+
}
166+
}
167+
168+
/// <summary>
169+
/// Send an image to GPT Vision. Please see <seealso href="https://platform.openai.com/docs/guides/vision"/> for more information and limitations.
170+
/// </summary>
171+
[JsonProperty("image_url")]
172+
public ImageInput Image
173+
{
174+
get
175+
{
176+
if (Type == "image_url")
177+
return image;
178+
else
179+
return null;
180+
}
181+
182+
set
183+
{
184+
image = value;
185+
text = null;
186+
Type = "image_url";
187+
}
188+
}
189+
190+
/// <summary>
191+
/// Creates an empty <see cref="ContentItem"/>
192+
/// </summary>
193+
public ContentItem()
194+
{
195+
196+
}
197+
198+
/// <summary>
199+
/// Creates a new <see cref="ContentItem"/> with the given text
200+
/// </summary>
201+
/// <param name="text">The text to send to the API</param>
202+
public ContentItem(string text)
203+
{
204+
this.Text = text;
205+
this.Type = "text";
206+
}
207+
208+
/// <summary>
209+
/// Creates a new <see cref="ContentItem"/> with the given image
210+
/// </summary>
211+
/// <param name="image">The image to send to the API. Consider using <see cref="ImageInput.FromFile(string, string)"/> to load an image from a local file, or <see cref="ImageInput.FromImageUrl(string, string)"/> to point to an image via URL.</param>
212+
public ContentItem(ImageInput image)
213+
{
214+
this.Image = image;
215+
this.Type = "image_url";
216+
}
217+
}
218+
219+
/// <summary>
220+
/// Represents an image to send to the API in a chat message as part of GPT Vision.
221+
/// </summary>
222+
public class ImageInput
223+
{
224+
/// <summary>
225+
/// Either a URL of the image or the base64 encoded image data
226+
/// </summary>
227+
[JsonProperty("url")]
228+
public string Url { get; set; }
229+
230+
/// <summary>
231+
/// By controlling the detail parameter, which has three options, low, high, or auto, you have control over how the model processes the image and generates its textual understanding.
232+
/// </summary>
233+
[JsonProperty("detail")]
234+
public string Detail { get; set; } = "auto";
235+
236+
/// <summary>
237+
/// Instantiates a new ImageInput object with the given url
238+
/// </summary>
239+
/// <param name="url">A link to the image</param>
240+
/// <param name="detail">By controlling the detail parameter, which has three options, low, high, or auto, you have control over how the model processes the image and generates its textual understanding</param>
241+
public ImageInput(string url, string detail = "auto")
242+
{
243+
this.Url = url;
244+
this.Detail = detail;
245+
}
246+
247+
/// <summary>
248+
/// Instantiates a new ImageInput object with the given image data bytes
249+
/// </summary>
250+
/// <param name="imageData">The image as bytes to be base64 encoded. OpenAI currently supports PNG (.png), JPEG (.jpeg and .jpg), WEBP (.webp), and non-animated GIF (.gif)</param>
251+
/// <param name="detail">By controlling the detail parameter, which has three options, low, high, or auto, you have control over how the model processes the image and generates its textual understanding</param>
252+
public ImageInput(byte[] imageData, string detail = "auto")
253+
{
254+
this.Url = "data:image/jpeg;base64," + Convert.ToBase64String(imageData);
255+
this.Detail = detail;
256+
}
257+
258+
/// <summary>
259+
/// Instantiates a new ImageInput object with the given image loaded from disk
260+
/// </summary>
261+
/// <param name="filePath">The local file path of the image. OpenAI currently supports PNG (.png), JPEG (.jpeg and .jpg), WEBP (.webp), and non-animated GIF (.gif)</param>
262+
/// <param name="detail">By controlling the detail parameter, which has three options, low, high, or auto, you have control over how the model processes the image and generates its textual understanding</param>
263+
/// <returns></returns>
264+
public static ImageInput FromFile(string filePath, string detail = "auto")
265+
{
266+
return new ImageInput(System.IO.File.ReadAllBytes(filePath), detail);
267+
}
268+
269+
/// <summary>
270+
/// Instantiates a new ImageInput object with the given image data bytes
271+
/// </summary>
272+
/// <param name="imageData">The image as bytes to be base64 encoded</param>
273+
/// <param name="detail">By controlling the detail parameter, which has three options, low, high, or auto, you have control over how the model processes the image and generates its textual understanding</param>
274+
/// <returns></returns>
275+
public static ImageInput FromImageBytes(byte[] imageData, string detail = "auto")
276+
{
277+
return new ImageInput(imageData, detail);
278+
}
279+
280+
/// <summary>
281+
/// Instantiates a new ImageInput object with the given url
282+
/// </summary>
283+
/// <param name="url">A link to the image</param>
284+
/// <param name="detail">By controlling the detail parameter, which has three options, low, high, or auto, you have control over how the model processes the image and generates its textual understanding</param>
285+
/// <returns></returns>
286+
public static ImageInput FromImageUrl(string url, string detail = "auto")
287+
{
288+
return new ImageInput(url, detail);
289+
}
290+
291+
/// <summary>
292+
/// By default, the model will use the auto setting which will look at the image input size and decide if it should use the low or high setting.
293+
/// </summary>
294+
public const string DetailAuto = "auto";
295+
/// <summary>
296+
/// low will disable the “high res” model. The model will receive a low-res 512px x 512px version of the image, and represent the image with a budget of 65 tokens. This allows the API to return faster responses and consume fewer input tokens for use cases that do not require high detail.
297+
/// </summary>
298+
public const string DetailLow = "low";
299+
/// <summary>
300+
/// high will enable “high res” mode, which first allows the model to see the low res image and then creates detailed crops of input images as 512px squares based on the input image size. Each of the detailed crops uses twice the token budget (65 tokens) for a total of 129 tokens.
301+
/// </summary>
302+
public const string DetailHigh = "high";
303+
}
304+
305+
internal class ContentDataConverter : JsonConverter
306+
{
307+
public override bool CanConvert(Type objectType)
308+
{
309+
return true;
310+
}
311+
312+
public override object ReadJson(JsonReader reader, Type objectType, object existingValue, JsonSerializer serializer)
313+
{
314+
JToken token = JToken.Load(reader);
315+
if (token.Type == JTokenType.Object)
316+
{
317+
return token.ToObject<IList<ContentItem>>();
318+
}
319+
else if (token.Type == JTokenType.String)
320+
{
321+
List<ContentItem> content = new List<ContentItem>();
322+
content.Add(new ContentItem(token.ToObject<string>()));
323+
return content;
324+
}
325+
else
326+
{
327+
return null;
328+
}
329+
}
330+
331+
public override void WriteJson(JsonWriter writer, object value, JsonSerializer serializer)
332+
{
333+
serializer.Serialize(writer, value);
334+
}
335+
}
336+
62337
}
63338
}

OpenAI_API/Chat/ChatRequest.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ public string StopSequence
134134
/// This is only used for serializing the request into JSON, do not use it directly.
135135
/// </summary>
136136
[JsonProperty("response_format", DefaultValueHandling=DefaultValueHandling.Ignore)]
137-
public Dictionary<string, string> ResponseFormatRaw
137+
internal Dictionary<string, string> ResponseFormatRaw
138138
{
139139
get
140140
{

OpenAI_API/Chat/ChatResult.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@ public class ChatChoice
7777
/// <returns>The content of the message in this response, not including <see cref="ChatMessageRole"/>.</returns>
7878
public override string ToString()
7979
{
80-
return Message.Content;
80+
if (Message == null && Delta != null)
81+
return Delta.TextContent;
82+
else
83+
return Message.TextContent;
8184
}
8285
}
8386

0 commit comments

Comments
 (0)