diff --git a/CHANGELOG.md b/CHANGELOG.md index 458d65b..ef3da6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +# 2.1.0 +- Added Mistral OCR. See the README.md for more details. + # 2.0.0 - **BREAKING**: Major refactor of message handling for chat completions: diff --git a/README.md b/README.md index b950127..0558c64 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ **Mistral-java-client** is a Java client for the [Mistral AI](https://mistral.ai/) API. It allows you to easily interact with the Mistral AI API from your Java application. -Supports all chat completion and embedding models available in the API. +Supports all chat completion, OCR, and embedding models available in the API. New models or models not listed here may be already supported without any updates to the library. @@ -15,6 +15,7 @@ Mistral-java-client is built against version 0.0.2 of the [Mistral AI API](https - [Chat Completion](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post) - [List Models](https://docs.mistral.ai/api/#tag/models/operation/list_models_v1_models_get) - [Embeddings](https://docs.mistral.ai/api/#tag/embeddings/operation/embeddings_v1_embeddings_post) +- [OCR](https://docs.mistral.ai/api/#tag/ocr) # Requirements @@ -33,7 +34,7 @@ repositories { } dependencies { - implementation 'com.github.Dannyj1:mistral-java-client:2.0.0' + implementation 'com.github.Dannyj1:mistral-java-client:2.1.0' } ``` @@ -51,7 +52,7 @@ dependencies { <dependency> <groupId>com.github.Dannyj1</groupId> <artifactId>mistral-java-client</artifactId> - <version>2.0.0</version> + <version>2.1.0</version> </dependency> ``` @@ -397,11 +398,74 @@ Example output: [-0.02015686, 0.04272461, 0.05529785, ... , -0.006855011, 0.009529114, -0.016448975] ``` + +## OCR Completion + +This example shows how to use the Mistral AI API to perform OCR on a document. 
+ +```java +import nl.dannyj.mistral.MistralClient; +import nl.dannyj.mistral.models.completion.content.DocumentURLChunk; +import nl.dannyj.mistral.models.ocr.OCRRequest; +import nl.dannyj.mistral.models.ocr.OCRResponse; +import nl.dannyj.mistral.models.ocr.OCRPageObject; + +import java.net.URI; + +public class MinimalOcrExample { + + public static void main(String[] args) { + // Replace "C:\\path\\to\\file.pdf" with the actual path to your document + String filePath = "C:\\\\path\\\\to\\\\file.pdf"; + File documentFile = new File(filePath); + + // Replace "YOUR_API_KEY" with your actual Mistral AI API key + // Or set the MISTRAL_API_KEY environment variable + MistralClient client = new MistralClient("YOUR_API_KEY"); + + // Convert document to base64 + byte[] documentBytes = Files.readAllBytes(documentFile.toPath()); + String documentBase64 = Base64.getEncoder().encodeToString(documentBytes); + URI documentUrl = URI.create("data:application/pdf;base64," + documentBase64); + + DocumentURLChunk documentChunk = DocumentURLChunk.builder() + .documentUrl(documentUrl) + .documentName("your_document.pdf") // Replace with your document name + .build(); + + OCRRequest request = OCRRequest.builder() + .model("mistral-ocr-latest") // Or another supported OCR model + .document(documentChunk) + .build(); + + try { + System.out.println("Performing OCR..."); + OCRResponse response = client.performOcr(request); + + System.out.println("OCR Results:"); + if (response.getPages() != null && !response.getPages().isEmpty()) { + // Print markdown content of the first page + OCRPageObject firstPage = response.getPages().get(0); + System.out.println("--- Page " + firstPage.getIndex() + " ---"); + System.out.println("Markdown Content:"); + System.out.println(firstPage.getMarkdown()); + } else { + System.out.println("No pages processed or results found."); + } + + } catch (Exception e) { + System.err.println("An error occurred: " + e.getMessage()); + e.printStackTrace(); + } + } +} 
+``` + # Roadmap - [ ] Make multi-modal usage more convenient (through builders, etc.) - [ ] Make JSON schemas for function calling more developer-friendly -- [ ] Add support for all missing features (e.g. OCR) +- [ ] Add support for all missing features (e.g. Codestral) - [ ] Handle rate limits - [ ] Unit tests diff --git a/build.gradle b/build.gradle index fbc2998..b6ee355 100644 --- a/build.gradle +++ b/build.gradle @@ -6,7 +6,7 @@ plugins { } group = "nl.dannyj" -version = "2.0.0" +version = "2.1.0" repositories { mavenCentral() diff --git a/src/main/java/nl/dannyj/mistral/MistralClient.java b/src/main/java/nl/dannyj/mistral/MistralClient.java index 1e130b0..51ecc5f 100644 --- a/src/main/java/nl/dannyj/mistral/MistralClient.java +++ b/src/main/java/nl/dannyj/mistral/MistralClient.java @@ -30,11 +30,14 @@ import nl.dannyj.mistral.models.embedding.EmbeddingRequest; import nl.dannyj.mistral.models.embedding.EmbeddingResponse; import nl.dannyj.mistral.models.model.ListModelsResponse; +import nl.dannyj.mistral.models.ocr.OCRRequest; +import nl.dannyj.mistral.models.ocr.OCRResponse; import nl.dannyj.mistral.net.ChatCompletionChunkCallback; import nl.dannyj.mistral.services.HttpService; import nl.dannyj.mistral.services.MistralService; import okhttp3.OkHttpClient; +import java.util.Objects; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; @@ -72,7 +75,7 @@ public MistralClient(@NonNull String apiKey) { * Default constructor that initializes the MistralClient with the API key from the environment variable "MISTRAL_API_KEY". 
 */ public MistralClient() { - this.apiKey = System.getenv(API_KEY_ENV_VAR); + this.apiKey = Objects.requireNonNull(System.getenv(API_KEY_ENV_VAR), "API key not found in environment variable " + API_KEY_ENV_VAR); this.httpClient = buildHttpClient(120, 10, 10); this.objectMapper = buildObjectMapper(); this.mistralService = buildMistralService(); @@ -134,7 +137,7 @@ public MistralClient(@NonNull String apiKey, int readTimeoutSeconds, int connect } /** - * Default constructor that initializes the MistralClient with the API key from the environment variable "MISTRAL_API_KEY" and custom timeouts. + * Constructor that initializes the MistralClient with the API key from the environment variable "MISTRAL_API_KEY" and custom timeouts. * * @param readTimeoutSeconds The read timeout in seconds * @param connectTimeoutSeconds The connect timeout in seconds @@ -241,6 +244,32 @@ public CompletableFuture listModelsAsync() { return mistralService.listModelsAsync(); } + /** + * Use the Mistral AI API to perform OCR on a document. + * This is a blocking method. + * + * @param request The request to perform OCR. See {@link OCRRequest}. + * @return The response from the Mistral AI API containing the OCR results. See {@link OCRResponse}. + * @throws ConstraintViolationException if the request does not pass validation + * @throws UnexpectedResponseException if an unexpected response is received from the Mistral AI API + */ + public OCRResponse performOcr(@NonNull OCRRequest request) { + return mistralService.performOcr(request); + } + + /** + * Use the Mistral AI API to perform OCR on a document. + * This is a non-blocking/asynchronous method. + * + * @param request The request to perform OCR. See {@link OCRRequest}. + * @return A CompletableFuture that will complete with the OCR results from the Mistral AI API. See {@link OCRResponse}. 
+ * @throws ConstraintViolationException if the request does not pass validation + * @throws UnexpectedResponseException if an unexpected response is received from the Mistral AI API + */ + public CompletableFuture<OCRResponse> performOcrAsync(@NonNull OCRRequest request) { + return mistralService.performOcrAsync(request); + } + public void createChatCompletionStream(@NonNull ChatCompletionRequest request, @NonNull ChatCompletionChunkCallback callback) { mistralService.createChatCompletionStream(request, callback); } diff --git a/src/main/java/nl/dannyj/mistral/models/completion/content/DocumentURLChunk.java b/src/main/java/nl/dannyj/mistral/models/completion/content/DocumentURLChunk.java index d66d4b3..7107756 100644 --- a/src/main/java/nl/dannyj/mistral/models/completion/content/DocumentURLChunk.java +++ b/src/main/java/nl/dannyj/mistral/models/completion/content/DocumentURLChunk.java @@ -21,6 +21,7 @@ import jakarta.annotation.Nullable; import jakarta.validation.constraints.NotNull; import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.Getter; import lombok.NoArgsConstructor; @@ -31,6 +32,7 @@ */ @NoArgsConstructor @AllArgsConstructor +@Builder public class DocumentURLChunk implements ContentChunk { /** diff --git a/src/main/java/nl/dannyj/mistral/models/ocr/OCRImageObject.java b/src/main/java/nl/dannyj/mistral/models/ocr/OCRImageObject.java new file mode 100644 index 0000000..04ed44f --- /dev/null +++ b/src/main/java/nl/dannyj/mistral/models/ocr/OCRImageObject.java @@ -0,0 +1,77 @@ +package nl.dannyj.mistral.models.ocr; + +import com.fasterxml.jackson.annotation.JsonProperty; +import jakarta.annotation.Nullable; +import jakarta.validation.constraints.NotNull; +import jakarta.validation.constraints.PositiveOrZero; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * Represents an extracted image object within an OCR page. 
+ */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OCRImageObject { + + /** + * Image ID for extracted image in a page. + * + * @return The image ID. + */ + @NotNull + private String id; + + /** + * X coordinate of top-left corner of the extracted image. + * + * @return The top-left X coordinate. + */ + @NotNull + @PositiveOrZero + @JsonProperty("top_left_x") + private Integer topLeftX; + + /** + * Y coordinate of top-left corner of the extracted image. + * + * @return The top-left Y coordinate. + */ + @NotNull + @PositiveOrZero + @JsonProperty("top_left_y") + private Integer topLeftY; + + /** + * X coordinate of bottom-right corner of the extracted image. + * + * @return The bottom-right X coordinate. + */ + @NotNull + @PositiveOrZero + @JsonProperty("bottom_right_x") + private Integer bottomRightX; + + /** + * Y coordinate of bottom-right corner of the extracted image. + * + * @return The bottom-right Y coordinate. + */ + @NotNull + @PositiveOrZero + @JsonProperty("bottom_right_y") + private Integer bottomRightY; + + /** + * Base64 string of the extracted image. + * + * @return The Base64 image string. + */ + @Nullable + @JsonProperty("image_base64") + private String imageBase64; +} diff --git a/src/main/java/nl/dannyj/mistral/models/ocr/OCRPageDimensions.java b/src/main/java/nl/dannyj/mistral/models/ocr/OCRPageDimensions.java new file mode 100644 index 0000000..ba3e000 --- /dev/null +++ b/src/main/java/nl/dannyj/mistral/models/ocr/OCRPageDimensions.java @@ -0,0 +1,45 @@ +package nl.dannyj.mistral.models.ocr; + +import jakarta.validation.constraints.NotNull; +import jakarta.validation.constraints.PositiveOrZero; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * Represents the dimensions of a PDF page's screenshot image. + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OCRPageDimensions { + + /** + * Dots per inch of the page-image. 
+ * + * @return The DPI of the page image. + */ + @NotNull + @PositiveOrZero + private Integer dpi; + + /** + * Height of the image in pixels. + * + * @return The height of the image. + */ + @NotNull + @PositiveOrZero + private Integer height; + + /** + * Width of the image in pixels. + * + * @return The width of the image. + */ + @NotNull + @PositiveOrZero + private Integer width; +} diff --git a/src/main/java/nl/dannyj/mistral/models/ocr/OCRPageObject.java b/src/main/java/nl/dannyj/mistral/models/ocr/OCRPageObject.java new file mode 100644 index 0000000..bbd1ad6 --- /dev/null +++ b/src/main/java/nl/dannyj/mistral/models/ocr/OCRPageObject.java @@ -0,0 +1,53 @@ +package nl.dannyj.mistral.models.ocr; + +import jakarta.validation.constraints.NotNull; +import jakarta.validation.constraints.PositiveOrZero; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.List; + +/** + * Represents the OCR information for a single page. + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OCRPageObject { + + /** + * The page index in a PDF document starting from 0. + * + * @return The page index. + */ + @NotNull + @PositiveOrZero + private Integer index; + + /** + * The markdown string response of the page. + * + * @return The markdown string response. + */ + @NotNull + private String markdown; + + /** + * List of all extracted images in the page. + * + * @return The list of extracted images. + */ + @NotNull + private List<OCRImageObject> images; + + /** + * The dimensions of the PDF Page's screenshot image. + * + * @return The dimensions of the page. 
+ */ + @NotNull + private OCRPageDimensions dimensions; +} diff --git a/src/main/java/nl/dannyj/mistral/models/ocr/OCRRequest.java b/src/main/java/nl/dannyj/mistral/models/ocr/OCRRequest.java new file mode 100644 index 0000000..d987514 --- /dev/null +++ b/src/main/java/nl/dannyj/mistral/models/ocr/OCRRequest.java @@ -0,0 +1,92 @@ +package nl.dannyj.mistral.models.ocr; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import jakarta.validation.constraints.NotNull; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import nl.dannyj.mistral.models.Request; +import nl.dannyj.mistral.models.completion.content.ContentChunk; +import nl.dannyj.mistral.models.completion.content.DocumentURLChunk; +import nl.dannyj.mistral.models.completion.content.ImageURLChunk; + +import java.util.List; + +/** + * Represents the request body for the OCR API endpoint (`/v1/ocr`). + */ +@Data +@AllArgsConstructor +@NoArgsConstructor +@Builder +public class OCRRequest implements Request { + + /** + * ID of the model to use. + * + * @param model The model's ID. Can't be null. + * @return The model's ID. + */ + @NotNull + private String model; + + /** + * Optional ID for the request. + * + * @param id The optional ID for the request. + * @return The optional ID for the request. + */ + private String id; + + /** + * Document to run OCR on. Can be a DocumentURLChunk or an ImageURLChunk. + * + * @param document The document to run OCR on. Can be a {@link DocumentURLChunk} or an {@link ImageURLChunk}. Can't be null. + * @return The document to run OCR on. 
+ */ + @NotNull + @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.EXISTING_PROPERTY, property = "type", visible = false) + @JsonSubTypes({ + @JsonSubTypes.Type(value = DocumentURLChunk.class, name = "document_url"), + @JsonSubTypes.Type(value = ImageURLChunk.class, name = "image_url") + }) + private ContentChunk document; + + /** + * Specific pages user wants to process in various formats: single number, range, or list of both. Starts from 0. + * + * @param pages A list of specific page indices to process. Starts from 0. Null to process all pages. + * @return A list of specific page indices to process. + */ + private List<Integer> pages; + + /** + * Include image URLs in response. + * + * @param includeImageBase64 Whether to include image URLs in the response. Null for default behavior. + * @return Whether to include image URLs in the response. + */ + @JsonProperty("include_image_base64") + private Boolean includeImageBase64; + + /** + * Maximum images to extract. + * + * @param imageLimit The maximum number of images to extract. Null for default behavior. + * @return The maximum number of images to extract. + */ + @JsonProperty("image_limit") + private Integer imageLimit; + + /** + * Minimum height and width of image to extract. + * + * @param imageMinSize The minimum height and width of images to extract. Null for default behavior. + * @return The minimum height and width of images to extract. 
+ */ + @JsonProperty("image_min_size") + private Integer imageMinSize; +} diff --git a/src/main/java/nl/dannyj/mistral/models/ocr/OCRResponse.java b/src/main/java/nl/dannyj/mistral/models/ocr/OCRResponse.java new file mode 100644 index 0000000..296175f --- /dev/null +++ b/src/main/java/nl/dannyj/mistral/models/ocr/OCRResponse.java @@ -0,0 +1,42 @@ +package nl.dannyj.mistral.models.ocr; + +import com.fasterxml.jackson.annotation.JsonProperty; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import nl.dannyj.mistral.models.Response; + +import java.util.List; + +/** + * Represents the response body from the OCR API endpoint (`/v1/ocr`). + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OCRResponse implements Response { + + /** + * List of OCR info for pages. + * + * @return The list of OCR info for pages. + */ + private List<OCRPageObject> pages; + + /** + * The model used to generate the OCR. + * + * @return The model used to generate the OCR. + */ + private String model; + + /** + * Usage info for the OCR request. + * + * @return The usage info for the OCR request. + */ + @JsonProperty("usage_info") + private OCRUsageInfo usageInfo; +} diff --git a/src/main/java/nl/dannyj/mistral/models/ocr/OCRUsageInfo.java b/src/main/java/nl/dannyj/mistral/models/ocr/OCRUsageInfo.java new file mode 100644 index 0000000..2df9c18 --- /dev/null +++ b/src/main/java/nl/dannyj/mistral/models/ocr/OCRUsageInfo.java @@ -0,0 +1,40 @@ +package nl.dannyj.mistral.models.ocr; + +import com.fasterxml.jackson.annotation.JsonProperty; +import jakarta.annotation.Nullable; +import jakarta.validation.constraints.NotNull; +import jakarta.validation.constraints.PositiveOrZero; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * Represents the usage information for an OCR request. 
+ */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class OCRUsageInfo { + + /** + * Number of pages processed. + * + * @return The number of pages processed. + */ + @NotNull + @PositiveOrZero + @JsonProperty("pages_processed") + private Integer pagesProcessed; + + /** + * Document size in bytes. + * + * @return The document size in bytes. + */ + @Nullable + @PositiveOrZero + @JsonProperty("doc_size_bytes") + private Integer docSizeBytes; +} diff --git a/src/main/java/nl/dannyj/mistral/services/MistralService.java b/src/main/java/nl/dannyj/mistral/services/MistralService.java index a28da8e..a2c91b3 100644 --- a/src/main/java/nl/dannyj/mistral/services/MistralService.java +++ b/src/main/java/nl/dannyj/mistral/services/MistralService.java @@ -37,6 +37,8 @@ import nl.dannyj.mistral.models.embedding.EmbeddingRequest; import nl.dannyj.mistral.models.embedding.EmbeddingResponse; import nl.dannyj.mistral.models.model.ListModelsResponse; +import nl.dannyj.mistral.models.ocr.OCRRequest; +import nl.dannyj.mistral.models.ocr.OCRResponse; import nl.dannyj.mistral.net.ChatCompletionChunkCallback; import okhttp3.Call; import okhttp3.Callback; @@ -60,7 +62,7 @@ public class MistralService { /** * Constructor that initializes the MistralService with a provided HttpService and ObjectMapper. * - * @param httpService The HttpService to be used for making HTTP requests to the Mistral AI API + * @param httpService The HttpService to be used for making HTTP requests to the Mistral AI API * @param objectMapper The ObjectMapper to be used for converting objects to and from JSON */ public MistralService(@NonNull HttpService httpService, @NonNull ObjectMapper objectMapper) { @@ -198,11 +200,40 @@ public EmbeddingResponse createEmbedding(@NonNull EmbeddingRequest request) { * * @param request The request to create an embedding. See {@link EmbeddingRequest}. * @return A CompletableFuture that will complete with the generated embedding from the Mistral AI API. 
 See {@link EmbeddingResponse}. + * @throws ConstraintViolationException if the request does not pass validation + * @throws UnexpectedResponseException if an unexpected response is received from the Mistral AI API */ public CompletableFuture<EmbeddingResponse> createEmbeddingAsync(@NonNull EmbeddingRequest request) { return CompletableFuture.supplyAsync(() -> createEmbedding(request)); } + /** + * Use the Mistral AI API to perform OCR on a document. + * This is a blocking method. + * + * @param request The request to perform OCR. See {@link OCRRequest}. + * @return The response from the Mistral AI API containing the OCR results. See {@link OCRResponse}. + * @throws ConstraintViolationException if the request does not pass validation + * @throws UnexpectedResponseException if an unexpected response is received from the Mistral AI API + */ + public OCRResponse performOcr(@NonNull OCRRequest request) { + validateRequest(request); + return postRequest("/ocr", request, OCRResponse.class); + } + + /** + * Use the Mistral AI API to perform OCR on a document. + * This is a non-blocking/asynchronous method. + * + * @param request The request to perform OCR. See {@link OCRRequest}. + * @return A CompletableFuture that will complete with the OCR results from the Mistral AI API. See {@link OCRResponse}. + * @throws ConstraintViolationException if the request does not pass validation + * @throws UnexpectedResponseException if an unexpected response is received from the Mistral AI API + */ + public CompletableFuture<OCRResponse> performOcrAsync(@NonNull OCRRequest request) { + return CompletableFuture.supplyAsync(() -> performOcr(request)); + } + /** * This method is used to validate the request using the provided validator. * If there are any constraint violations, it throws a ConstraintViolationException.