Skip to content

Commit 427c332

Browse files
committed
cloud providers
1 parent a305ecc commit 427c332

37 files changed

+3999
-71
lines changed

README.md

Lines changed: 211 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -378,16 +378,14 @@ Console.WriteLine(urlResult.Title);
378378

379379
```csharp
380380
using System;
381-
using Azure;
382381
using MarkItDown;
382+
using MarkItDown.Intelligence;
383+
using MarkItDown.Intelligence.Providers.Azure;
384+
using MarkItDown.Intelligence.Providers.Google;
385+
using MarkItDown.Intelligence.Providers.Aws;
383386

384387
var options = new MarkItDownOptions
385388
{
386-
// Plug in your own services (Azure AI, OpenAI, etc.)
387-
ImageCaptioner = async (bytes, info, token) =>
388-
await myCaptionService.DescribeAsync(bytes, info, token),
389-
AudioTranscriber = async (bytes, info, token) =>
390-
await speechClient.TranscribeAsync(bytes, info, token),
391389
Segments = new SegmentOptions
392390
{
393391
IncludeSegmentMetadataInMarkdown = true,
@@ -396,10 +394,68 @@ var options = new MarkItDownOptions
396394
SegmentDuration = TimeSpan.FromMinutes(2)
397395
}
398396
},
399-
DocumentIntelligence = new DocumentIntelligenceOptions
397+
398+
// Cloud providers can be wired in through the intelligence options.
399+
AzureIntelligence = new AzureIntelligenceOptions
400+
{
401+
DocumentIntelligence = new AzureDocumentIntelligenceOptions
402+
{
403+
Endpoint = "https://<your-document-intelligence>.cognitiveservices.azure.com/",
404+
ApiKey = "<document-intelligence-key>",
405+
ModelId = "prebuilt-layout"
406+
},
407+
Vision = new AzureVisionOptions
408+
{
409+
Endpoint = "https://<your-vision>.cognitiveservices.azure.com/",
410+
ApiKey = "<vision-key>"
411+
},
412+
Media = new AzureMediaIntelligenceOptions
413+
{
414+
AccountId = "<video-indexer-account-id>",
415+
Location = "trial",
416+
SubscriptionId = "<subscription-id>",
417+
ResourceGroup = "<resource-group>"
418+
}
419+
},
420+
421+
GoogleIntelligence = new GoogleIntelligenceOptions
422+
{
423+
DocumentIntelligence = new GoogleDocumentIntelligenceOptions
424+
{
425+
ProjectId = "my-project",
426+
Location = "us",
427+
ProcessorId = "<processor-id>",
428+
CredentialsPath = "google-sa.json"
429+
},
430+
Vision = new GoogleVisionOptions
431+
{
432+
CredentialsPath = "google-sa.json",
433+
MaxLabels = 5
434+
},
435+
Media = new GoogleMediaIntelligenceOptions
436+
{
437+
CredentialsPath = "google-sa.json",
438+
LanguageCode = "en-US"
439+
}
440+
},
441+
442+
AwsIntelligence = new AwsIntelligenceOptions
400443
{
401-
Endpoint = "https://<your-resource>.cognitiveservices.azure.com/",
402-
Credential = new AzureKeyCredential("<document-intelligence-key>")
444+
DocumentIntelligence = new AwsDocumentIntelligenceOptions
445+
{
446+
Region = "us-east-1"
447+
},
448+
Vision = new AwsVisionOptions
449+
{
450+
Region = "us-east-1",
451+
MinConfidence = 80f
452+
},
453+
Media = new AwsMediaIntelligenceOptions
454+
{
455+
Region = "us-east-1",
456+
InputBucketName = "my-transcribe-input",
457+
OutputBucketName = "my-transcribe-output"
458+
}
403459
}
404460
};
405461

@@ -697,6 +753,8 @@ public class DocumentConversionFunction
697753
- **`StreamInfo`** - Metadata about the input stream (MIME type, extension, charset, etc.)
698754
- **`ConverterRegistration`** - Associates converters with priority for selection
699755

756+
> ℹ️ MIME detection and normalization rely on [ManagedCode.MimeTypes](https://github.com/managedcode/MimeTypes).
757+
700758
### Built-in Converters
701759

702760
MarkItDown includes these converters in priority order:
@@ -741,6 +799,134 @@ foreach (var segment in result.Segments)
741799

742800
Runtime behaviour is controlled through `SegmentOptions` on `MarkItDownOptions`. Enabling `IncludeSegmentMetadataInMarkdown` emits inline annotations like `[page:1]`, `[sheet:Sales]`, or `[timecode:00:01:00-00:02:00]` directly in the Markdown stream. Audio transcripts honour `Segments.Audio.SegmentDuration`, while still collapsing short transcripts into a single, time-aware slice.
743801

802+
### Cloud Intelligence Providers
803+
804+
MarkItDown exposes optional abstractions for running documents through cloud services:
805+
806+
- `IDocumentIntelligenceProvider` – structured page, table, and layout extraction.
807+
- `IImageUnderstandingProvider` – OCR, captioning, and object detection for embedded images.
808+
- `IMediaTranscriptionProvider` – timed transcripts for audio and video inputs.
809+
810+
The `AzureIntelligenceOptions`, `GoogleIntelligenceOptions`, and `AwsIntelligenceOptions` helpers wire up the respective cloud Document AI/Vision/Speech stacks without forcing those dependencies on consumers. You can still bring your own implementation by assigning the provider interfaces directly on `MarkItDownOptions`.
811+
812+
#### Azure AI setup (keys and managed identity)
813+
814+
- **Docs**: [Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/), [Computer Vision Image Analysis](https://learn.microsoft.com/azure/ai-services/computer-vision/overview-image-analysis), [Video Indexer authentication](https://learn.microsoft.com/azure/azure-video-indexer/video-indexer-get-started/connect-to-azure).
815+
- **API keys / connection strings**: store your Cognitive Services key in configuration (for example `appsettings.json` or an Azure App Configuration connection string) and hydrate the options:
816+
817+
```csharp
818+
var configuration = host.Services.GetRequiredService<IConfiguration>();
819+
820+
var azureOptions = new AzureIntelligenceOptions
821+
{
822+
DocumentIntelligence = new AzureDocumentIntelligenceOptions
823+
{
824+
Endpoint = configuration["Azure:DocumentIntelligence:Endpoint"],
825+
ApiKey = configuration.GetConnectionString("AzureDocumentIntelligenceKey"),
826+
ModelId = "prebuilt-layout"
827+
},
828+
Vision = new AzureVisionOptions
829+
{
830+
Endpoint = configuration["Azure:Vision:Endpoint"],
831+
ApiKey = configuration.GetConnectionString("AzureVisionKey")
832+
},
833+
Media = new AzureMediaIntelligenceOptions
834+
{
835+
AccountId = configuration["Azure:VideoIndexer:AccountId"],
836+
Location = configuration["Azure:VideoIndexer:Location"],
837+
SubscriptionId = configuration["Azure:VideoIndexer:SubscriptionId"],
838+
ResourceGroup = configuration["Azure:VideoIndexer:ResourceGroup"],
839+
ArmAccessToken = configuration.GetConnectionString("AzureVideoIndexerArmToken")
840+
}
841+
};
842+
```
843+
844+
- **Managed identity**: omit the `ApiKey`/`ArmAccessToken` properties and the providers automatically fall back to `DefaultAzureCredential`. Assign the managed identity the *Cognitive Services User* role for Document Intelligence and Vision, and follow the [Video Indexer managed identity instructions](https://learn.microsoft.com/azure/azure-video-indexer/video-indexer-use-azure-ad) to authorize uploads.
845+
846+
```csharp
847+
var azureOptions = new AzureIntelligenceOptions
848+
{
849+
DocumentIntelligence = new AzureDocumentIntelligenceOptions
850+
{
851+
Endpoint = "https://contoso.cognitiveservices.azure.com/"
852+
},
853+
Vision = new AzureVisionOptions
854+
{
855+
Endpoint = "https://contoso.cognitiveservices.azure.com/"
856+
},
857+
Media = new AzureMediaIntelligenceOptions
858+
{
859+
AccountId = "<video-indexer-account-id>",
860+
Location = "trial"
861+
}
862+
};
863+
```
864+
865+
#### Google Cloud setup
866+
867+
- **Docs**: [Document AI](https://cloud.google.com/document-ai/docs), [Vision API](https://cloud.google.com/vision/docs), [Speech-to-Text](https://cloud.google.com/speech-to-text/docs).
868+
- **Service account JSON / ADC**: place your service account JSON on disk or load it from Secret Manager, then point the options at it (or provide a `GoogleCredential` instance). If `CredentialsPath`/`JsonCredentials`/`Credential` are omitted the providers use [Application Default Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-key):
869+
870+
```csharp
871+
var googleOptions = new GoogleIntelligenceOptions
872+
{
873+
DocumentIntelligence = new GoogleDocumentIntelligenceOptions
874+
{
875+
ProjectId = "my-project",
876+
Location = "us",
877+
ProcessorId = "processor-id",
878+
CredentialsPath = Environment.GetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS")
879+
},
880+
Vision = new GoogleVisionOptions
881+
{
882+
JsonCredentials = Environment.GetEnvironmentVariable("GOOGLE_VISION_JSON")
883+
},
884+
Media = new GoogleMediaIntelligenceOptions
885+
{
886+
Credential = GoogleCredential.GetApplicationDefault(),
887+
LanguageCode = "en-US"
888+
}
889+
};
890+
```
891+
892+
- **Workload identity / managed identities**: host the app on GKE, Cloud Run, or Cloud Functions with [Workload Identity Federation](https://cloud.google.com/iam/docs/workload-identity-federation). The Google SDK's automatic credential chain picks up the ambient identity, so the providers work without JSON keys.
893+
894+
#### AWS setup
895+
896+
- **Docs**: [Textract](https://docs.aws.amazon.com/textract/latest/dg/what-is.html), [Rekognition](https://docs.aws.amazon.com/rekognition/latest/dg/what-is.html), [Transcribe](https://docs.aws.amazon.com/transcribe/latest/dg/what-is-transcribe.html), [.NET credential management](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/net-dg-config-creds.html).
897+
- **Access keys / connection strings**: populate the options directly from configuration when you must supply static credentials (for example from AWS Secrets Manager or an encrypted connection string):
898+
899+
```csharp
900+
var awsOptions = new AwsIntelligenceOptions
901+
{
902+
DocumentIntelligence = new AwsDocumentIntelligenceOptions
903+
{
904+
AccessKeyId = configuration["AWS:AccessKeyId"],
905+
SecretAccessKey = configuration["AWS:SecretAccessKey"],
906+
Region = configuration.GetValue<string>("AWS:Region")
907+
},
908+
Vision = new AwsVisionOptions
909+
{
910+
AccessKeyId = configuration["AWS:AccessKeyId"],
911+
SecretAccessKey = configuration["AWS:SecretAccessKey"],
912+
Region = configuration.GetValue<string>("AWS:Region"),
913+
MinConfidence = 80f
914+
},
915+
Media = new AwsMediaIntelligenceOptions
916+
{
917+
AccessKeyId = configuration["AWS:AccessKeyId"],
918+
SecretAccessKey = configuration["AWS:SecretAccessKey"],
919+
Region = configuration.GetValue<string>("AWS:Region"),
920+
InputBucketName = configuration["AWS:Transcribe:InputBucket"],
921+
OutputBucketName = configuration["AWS:Transcribe:OutputBucket"]
922+
}
923+
};
924+
```
925+
926+
- **IAM roles (AWS's managed-identity equivalent)**: leave the credential fields null to use the default AWS credential chain (environment variables, shared credentials file, EC2/ECS/EKS IAM roles, or AWS SSO). Ensure the execution role has permissions for `textract:AnalyzeDocument`, `rekognition:DetectLabels`, `rekognition:DetectText`, `transcribe:StartTranscriptionJob`, and S3 access to the specified buckets.
927+
928+
For LLM-style post-processing, assign `MarkItDownOptions.AiModels` with an `IAiModelProvider`. The built-in `StaticAiModelProvider` accepts `Microsoft.Extensions.AI` clients (chat models, speech-to-text, etc.), enabling you to share application-wide model builders.
929+
744930
### Converter Priority & Detection
745931

746932
- Priority-based dispatch (lower values processed first)
@@ -1007,34 +1193,25 @@ var markItDown = new MarkItDown(options);
10071193
using Azure;
10081194
using OpenAI;
10091195

1196+
var openAIChatClient = new MyChatClient(); // IChatClient from Microsoft.Extensions.AI
1197+
var whisperSpeechClient = new MySpeechToTextClient(); // ISpeechToTextClient from Microsoft.Extensions.AI
1198+
10101199
var options = new MarkItDownOptions
10111200
{
1012-
// Azure AI Vision for image captions
1013-
ImageCaptioner = async (bytes, info, token) =>
1014-
{
1015-
var client = new VisionServiceClient("your-endpoint", new AzureKeyCredential("your-key"));
1016-
var result = await client.AnalyzeImageAsync(bytes, token);
1017-
return $"Image: {result.Description?.Captions?.FirstOrDefault()?.Text ?? "Visual content"}";
1018-
},
1019-
1020-
// OpenAI Whisper for audio transcription
1021-
AudioTranscriber = async (bytes, info, token) =>
1022-
{
1023-
var client = new OpenAIClient("your-api-key");
1024-
using var stream = new MemoryStream(bytes);
1025-
var result = await client.AudioEndpoint.CreateTranscriptionAsync(
1026-
stream,
1027-
Path.GetFileName(info.FileName) ?? "audio",
1028-
cancellationToken: token);
1029-
return result.Text;
1030-
},
1031-
1032-
// Azure Document Intelligence for enhanced PDF/form processing
1033-
DocumentIntelligence = new DocumentIntelligenceOptions
1201+
AiModels = new StaticAiModelProvider(openAIChatClient, whisperSpeechClient),
1202+
1203+
AzureIntelligence = new AzureIntelligenceOptions
10341204
{
1035-
Endpoint = "https://your-resource.cognitiveservices.azure.com/",
1036-
Credential = new AzureKeyCredential("your-document-intelligence-key"),
1037-
ApiVersion = "2023-10-31-preview"
1205+
DocumentIntelligence = new AzureDocumentIntelligenceOptions
1206+
{
1207+
Endpoint = "https://your-document-intelligence.cognitiveservices.azure.com/",
1208+
ApiKey = "<document-intelligence-key>"
1209+
},
1210+
Vision = new AzureVisionOptions
1211+
{
1212+
Endpoint = "https://your-computervision.cognitiveservices.azure.com/",
1213+
ApiKey = "<vision-key>"
1214+
}
10381215
}
10391216
};
10401217

0 commit comments

Comments
 (0)