diff --git a/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift b/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift
index 66f386c49c9e..743331fd00a1 100644
--- a/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift
+++ b/Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift
@@ -1,5 +1,7 @@
 import Foundation
 import FoundationModels
+import Vision
+import CoreImage
 
 @available(iOS 26, *)
 public actor IntelligenceService {
@@ -17,6 +19,72 @@ public actor IntelligenceService {
 
     public init() {}
 
+    /// Analyzes an image using Vision framework to extract visual information.
+    ///
+    /// Runs scene classification, animal recognition and fast text recognition in one
+    /// pass and folds the top results into a single summary string.
+    ///
+    /// - Parameter cgImage: The image to analyze.
+    /// - Returns: A `"; "`-separated description of what's in the image, or an empty
+    ///   string when nothing was recognized.
+    /// - Throws: Any error thrown by `VNImageRequestHandler.perform(_:)`.
+    public func analyzeImage(_ cgImage: CGImage) async throws -> String {
+        let startTime = CFAbsoluteTimeGetCurrent()
+
+        var analysisResults: [String] = []
+
+        // 1. Scene classification
+        let sceneRequest = VNClassifyImageRequest()
+
+        // 2. Animal recognition
+        let objectRequest = VNRecognizeAnimalsRequest()
+
+        // 3. Text detection
+        let textRequest = VNRecognizeTextRequest()
+        textRequest.recognitionLevel = .fast
+
+        // Perform all requests
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+        try handler.perform([sceneRequest, objectRequest, textRequest])
+
+        // Process scene classifications
+        if let sceneResults = sceneRequest.results {
+            let topScenes = sceneResults
+                .prefix(3)
+                .filter { $0.confidence > 0.3 }
+                .map { "\($0.identifier) (\(Int($0.confidence * 100))%)" }
+            if !topScenes.isEmpty {
+                analysisResults.append("Scenes: \(topScenes.joined(separator: ", "))")
+            }
+        }
+
+        // Process animal recognition
+        if let animalResults = objectRequest.results {
+            let animals = animalResults
+                .filter { $0.confidence > 0.5 }
+                .compactMap { $0.labels.first?.identifier }
+            if !animals.isEmpty {
+                analysisResults.append("Animals: \(animals.joined(separator: ", "))")
+            }
+        }
+
+        // Process text recognition
+        if let textResults = textRequest.results {
+            let recognizedText = textResults
+                .prefix(5)
+                .compactMap { $0.topCandidates(1).first?.string }
+                .filter { !$0.isEmpty }
+            if !recognizedText.isEmpty {
+                analysisResults.append("Text: \(recognizedText.joined(separator: ", "))")
+            }
+        }
+
+        WPLogInfo("IntelligenceService.analyzeImage executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")
+
+        // `joined` already yields "" for an empty array, so no special case is needed.
+        return analysisResults.joined(separator: "; ")
+    }
+
     /// Suggests tags for a WordPress post.
     ///
     /// - Parameters:
@@ -155,6 +223,185 @@ public actor IntelligenceService {
         let postSizeLimit = Double(IntelligenceService.contextSizeLimit) * ratio
         return String((extract ?? post).prefix(Int(postSizeLimit)))
     }
+
+    /// Metadata for generating alt text and captions.
+    ///
+    /// Declared `Sendable` so values can be passed into this actor from other isolation domains.
+    public struct MediaMetadata: Sendable {
+        public let filename: String?
+        public let title: String?
+        public let caption: String?
+        public let description: String?
+        public let altText: String?
+        public let fileType: String?
+        public let dimensions: String?
+        public let imageAnalysis: String?
+
+        public init(filename: String? = nil, title: String? = nil, caption: String? = nil, description: String? = nil, altText: String? = nil, fileType: String? = nil, dimensions: String? = nil, imageAnalysis: String? = nil) {
+            self.filename = filename
+            self.title = title
+            self.caption = caption
+            self.description = description
+            self.altText = altText
+            self.fileType = fileType
+            self.dimensions = dimensions
+            self.imageAnalysis = imageAnalysis
+        }
+
+        /// Whether at least one field is non-`nil` and non-empty.
+        var hasContent: Bool {
+            return [filename, title, caption, description, altText, fileType, dimensions, imageAnalysis]
+                .contains(where: { !($0?.isEmpty ?? true) })
+        }
+    }
+
+    /// Generates alt text for a media item based on available metadata.
+    ///
+    /// - Parameter metadata: The media metadata to use for generation.
+    /// - Returns: Generated alt text, trimmed of surrounding whitespace.
+    /// - Throws: An `NSError` (domain `IntelligenceService`) when `metadata` has no content, or any error thrown by the language model session.
+    public func generateAltText(metadata: MediaMetadata) async throws -> String {
+        guard metadata.hasContent else {
+            throw NSError(domain: "IntelligenceService", code: -1, userInfo: [
+                NSLocalizedDescriptionKey: "Insufficient metadata to generate alt text. Please add a filename, title, or description first."
+            ])
+        }
+
+        let startTime = CFAbsoluteTimeGetCurrent()
+
+        let instructions = """
+        You are helping a WordPress user generate alt text for an image.
+        Alt text should be concise, descriptive, and accessible for screen readers.
+
+        **Parameters**
+        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
+        - FILENAME: the image filename
+        - FILE_TYPE: the file type/extension
+        - DIMENSIONS: the image dimensions
+        - TITLE: the image title (if available)
+        - CAPTION: the image caption (if available)
+        - DESCRIPTION: the image description (if available)
+
+        **Requirements**
+        - Generate concise alt text (1-2 sentences, max 125 characters)
+        - Prioritize IMAGE_ANALYSIS when describing what's in the image
+        - Focus on what the image depicts, not decorative elements
+        - Use simple, clear language
+        - Do not include phrases like "image of" or "picture of"
+        - Only output the alt text, nothing else
+        """
+
+        let session = LanguageModelSession(
+            model: .init(guardrails: .permissiveContentTransformations),
+            instructions: instructions
+        )
+
+        let contextParts = Self.makeContextParts([
+            ("IMAGE_ANALYSIS", metadata.imageAnalysis),
+            ("FILENAME", metadata.filename),
+            ("FILE_TYPE", metadata.fileType),
+            ("DIMENSIONS", metadata.dimensions),
+            ("TITLE", metadata.title),
+            ("CAPTION", metadata.caption),
+            ("DESCRIPTION", metadata.description)
+        ])
+
+        let prompt = """
+        Generate alt text for an image with the following information:
+
+        \(contextParts.joined(separator: "\n"))
+        """
+
+        WPLogInfo("IntelligenceService.generateAltText prompt:\n\(prompt)")
+
+        let response = try await session.respond(
+            to: prompt,
+            options: GenerationOptions(temperature: 0.7)
+        )
+
+        WPLogInfo("IntelligenceService.generateAltText executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")
+
+        return response.content.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Generates a caption for a media item based on available metadata.
+    ///
+    /// - Parameter metadata: The media metadata to use for generation.
+    /// - Returns: Generated caption, trimmed of surrounding whitespace.
+    /// - Throws: An `NSError` (domain `IntelligenceService`) when `metadata` has no content, or any error thrown by the language model session.
+    public func generateCaption(metadata: MediaMetadata) async throws -> String {
+        guard metadata.hasContent else {
+            throw NSError(domain: "IntelligenceService", code: -1, userInfo: [
+                NSLocalizedDescriptionKey: "Insufficient metadata to generate caption. Please add a filename, title, or description first."
+            ])
+        }
+
+        let startTime = CFAbsoluteTimeGetCurrent()
+
+        let instructions = """
+        You are helping a WordPress user generate a caption for an image.
+        Captions should be engaging, informative, and complement the image.
+
+        **Parameters**
+        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
+        - FILENAME: the image filename
+        - FILE_TYPE: the file type/extension
+        - DIMENSIONS: the image dimensions
+        - TITLE: the image title (if available)
+        - ALT_TEXT: the image alt text (if available)
+        - DESCRIPTION: the image description (if available)
+
+        **Requirements**
+        - Generate an engaging caption (1-2 sentences)
+        - Prioritize IMAGE_ANALYSIS to understand what's actually in the image
+        - Can be more creative and conversational than alt text
+        - May include context, emotion, or storytelling elements
+        - Only output the caption, nothing else
+        """
+
+        let session = LanguageModelSession(
+            model: .init(guardrails: .permissiveContentTransformations),
+            instructions: instructions
+        )
+
+        let contextParts = Self.makeContextParts([
+            ("IMAGE_ANALYSIS", metadata.imageAnalysis),
+            ("FILENAME", metadata.filename),
+            ("FILE_TYPE", metadata.fileType),
+            ("DIMENSIONS", metadata.dimensions),
+            ("TITLE", metadata.title),
+            ("ALT_TEXT", metadata.altText),
+            ("DESCRIPTION", metadata.description)
+        ])
+
+        let prompt = """
+        Generate a caption for an image with the following information:
+
+        \(contextParts.joined(separator: "\n"))
+        """
+
+        WPLogInfo("IntelligenceService.generateCaption prompt:\n\(prompt)")
+
+        let response = try await session.respond(
+            to: prompt,
+            options: GenerationOptions(temperature: 0.8)
+        )
+
+        WPLogInfo("IntelligenceService.generateCaption executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")
+
+        return response.content.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Formats the non-empty fields as `KEY: 'value'` prompt lines, preserving their order.
+    ///
+    /// - Parameter fields: Prompt keys paired with optional values; `nil` and empty values are skipped.
+    /// - Returns: One formatted line per remaining field.
+    private static func makeContextParts(_ fields: [(key: String, value: String?)]) -> [String] {
+        fields.compactMap { field -> String? in
+            guard let value = field.value, !value.isEmpty else { return nil }
+            return "\(field.key): '\(value)'"
+        }
+    }
 }
 
 private extension Array where Element: Hashable {
diff --git a/WordPress/Classes/ViewRelated/Media/MediaItemViewController.swift b/WordPress/Classes/ViewRelated/Media/MediaItemViewController.swift
index bf8166cea368..c73024a1a224 100644
--- a/WordPress/Classes/ViewRelated/Media/MediaItemViewController.swift
+++ b/WordPress/Classes/ViewRelated/Media/MediaItemViewController.swift
@@ -22,6 +22,29 @@ final class MediaItemViewController: UITableViewController {
 
     private let headerView = MediaItemHeaderView()
     private lazy var headerMaxHeightConstraint = headerView.heightAnchor.constraint(lessThanOrEqualToConstant: 320)
+    /// Backing storage for `textGenerationController`. Typed as `AnyObject` because the
+    /// concrete type is only available on iOS 26 and later.
+    private var _textGenerationController: AnyObject?
+
+    /// Controller that adds text generation to the caption and alt text editors; created on first access.
+    @available(iOS 26, *)
+    private var textGenerationController: MediaTextGenerationController {
+        if let existing = _textGenerationController as? MediaTextGenerationController {
+            return existing
+        }
+        let controller = MediaTextGenerationController(media: media) { [weak self] type, generatedText in
+            guard let self else { return }
+            switch type {
+            case .altText:
+                self.mediaMetadata.alt = generatedText
+            case .caption:
+                self.mediaMetadata.caption = generatedText
+            }
+            self.reloadViewModel()
+        }
+        _textGenerationController = controller
+        return controller
+    }
 
     init(media: Media) {
         self.media = media
@@ -327,11 +350,14 @@ final class MediaItemViewController: UITableViewController {
     private func editCaption() -> ((ImmuTableRow) -> ()) {
         return { [weak self] row in
             let editableRow = row as! EditableTextRow
-            self?.pushSettingsController(for: editableRow, hint: Strings.Hints.imageCaption,
+            let controller = self?.pushSettingsController(for: editableRow, hint: Strings.Hints.imageCaption,
                                          onValueChanged: { value in
                 self?.mediaMetadata.caption = value
                 self?.reloadViewModel()
             })
+            if #available(iOS 26, *), let self, let controller {
+                self.textGenerationController.configure(controller, for: .caption)
+            }
         }
     }
 
@@ -349,15 +375,19 @@ final class MediaItemViewController: UITableViewController {
     private func editAlt() -> ((ImmuTableRow) -> ()) {
         return { [weak self] row in
             let editableRow = row as! EditableTextRow
-            self?.pushSettingsController(for: editableRow, hint: Strings.Hints.imageAlt,
+            let controller = self?.pushSettingsController(for: editableRow, hint: Strings.Hints.imageAlt,
                                          onValueChanged: { value in
                 self?.mediaMetadata.alt = value
                 self?.reloadViewModel()
             })
+            if #available(iOS 26, *), let self, let controller {
+                self.textGenerationController.configure(controller, for: .altText)
+            }
         }
     }
 
-    private func pushSettingsController(for row: EditableTextRow, hint: String? = nil, onValueChanged: @escaping SettingsTextChanged) {
+    @discardableResult
+    private func pushSettingsController(for row: EditableTextRow, hint: String? = nil, onValueChanged: @escaping SettingsTextChanged) -> SettingsTextViewController {
         let title = row.title
         let value = row.value
         let controller = SettingsTextViewController(text: value, placeholder: "\(title)...", hint: hint)
@@ -366,6 +396,7 @@ final class MediaItemViewController: UITableViewController {
         controller.onValueChanged = onValueChanged
 
         navigationController?.pushViewController(controller, animated: true)
+        return controller
     }
 
     // MARK: - Sharing Logic
@@ -417,7 +448,7 @@ extension MediaItemViewController {
 /// Provides some extra formatting for a Media asset's metadata, used
 /// to present it in the MediaItemViewController
 ///
-private struct MediaMetadataPresenter {
+struct MediaMetadataPresenter {
     let media: Media
 
     /// A String containing the pixel size of the asset (width X height)
diff --git a/WordPress/Classes/ViewRelated/Media/MediaTextGenerationController.swift b/WordPress/Classes/ViewRelated/Media/MediaTextGenerationController.swift
new file mode 100644
index 000000000000..481736a620f2
--- /dev/null
+++ b/WordPress/Classes/ViewRelated/Media/MediaTextGenerationController.swift
@@ -0,0 +1,152 @@
+import UIKit
+import SVProgressHUD
+import WordPressData
+import WordPressShared
+
+/// Adds a "generate" button to the alt text and caption editors of a media item and
+/// fills the editor with text produced by `IntelligenceService`.
+@available(iOS 26, *)
+@MainActor
+final class MediaTextGenerationController {
+
+    /// The metadata field to generate text for.
+    enum GenerationType {
+        case altText
+        case caption
+    }
+
+    private let media: Media
+    private let service = IntelligenceService()
+    private let onMetadataUpdated: (GenerationType, String) -> Void
+
+    /// - Parameters:
+    ///   - media: The media item whose metadata and image are used as input.
+    ///   - onMetadataUpdated: Called with the generated text once it has been applied to the editor.
+    init(media: Media, onMetadataUpdated: @escaping (GenerationType, String) -> Void) {
+        self.media = media
+        self.onMetadataUpdated = onMetadataUpdated
+    }
+
+    /// Configures a settings controller with a generate button.
+    ///
+    /// Does nothing when `IntelligenceService.isSupported` is `false`.
+    func configure(_ controller: SettingsTextViewController, for type: GenerationType) {
+        guard IntelligenceService.isSupported else { return }
+
+        let button = makeGenerateButton(for: controller, type: type)
+        controller.navigationItem.rightBarButtonItem = button
+    }
+
+    /// Creates the bar button that triggers generation for `controller`.
+    private func makeGenerateButton(for controller: SettingsTextViewController, type: GenerationType) -> UIBarButtonItem {
+        let button = UIBarButtonItem(
+            image: UIImage(systemName: "sparkles"),
+            style: .plain,
+            target: nil,
+            action: nil
+        )
+        button.accessibilityLabel = Strings.generate
+        button.primaryAction = UIAction { [weak self, weak controller, weak button] _ in
+            guard let self, let controller, let button else { return }
+            self.handleGenerate(controller: controller, button: button, type: type)
+        }
+        return button
+    }
+
+    /// Generates text of the given type and applies it to `controller`, showing a
+    /// spinner in place of `button` while the request is running.
+    private func handleGenerate(controller: SettingsTextViewController, button: UIBarButtonItem, type: GenerationType) {
+        setGenerating(true, button: button)
+
+        Task {
+            do {
+                let generatedText = try await generateText(for: type)
+                // An empty response must not wipe out text the user already entered.
+                if !generatedText.isEmpty {
+                    controller.text = generatedText
+                    onMetadataUpdated(type, generatedText)
+                }
+            } catch {
+                SVProgressHUD.showError(withStatus: error.localizedDescription)
+            }
+            setGenerating(false, button: button)
+        }
+    }
+
+    /// Analyzes the image, then asks the service for alt text or a caption.
+    private func generateText(for type: GenerationType) async throws -> String {
+        let imageAnalysis = await analyzeImage()
+        let metadata = buildMetadata(imageAnalysis: imageAnalysis)
+
+        switch type {
+        case .altText:
+            return try await service.generateAltText(metadata: metadata)
+        case .caption:
+            return try await service.generateCaption(metadata: metadata)
+        }
+    }
+
+    /// Runs Vision analysis on the local thumbnail (or the local original as a fallback).
+    ///
+    /// - Returns: The analysis summary, or `nil` when no local image could be loaded or
+    ///   the analysis failed. Failures are logged and never thrown.
+    private func analyzeImage() async -> String? {
+        guard let imageURL = media.absoluteThumbnailLocalURL ?? media.absoluteLocalURL else {
+            return nil
+        }
+
+        // `Data(contentsOf:)` is a blocking read; do it off the main actor so a large
+        // original does not stall the UI. `URL` and `UIImage` are both `Sendable`.
+        let image = await Task.detached(priority: .userInitiated) {
+            (try? Data(contentsOf: imageURL)).flatMap { UIImage(data: $0) }
+        }.value
+        guard let cgImage = image?.cgImage else {
+            return nil
+        }
+
+        do {
+            let analysis = try await service.analyzeImage(cgImage)
+            WPLogInfo("Image analysis: \(analysis)")
+            return analysis
+        } catch {
+            WPLogError("Failed to analyze image: \(error)")
+            return nil
+        }
+    }
+
+    /// Collects the media item's existing metadata plus the optional image analysis.
+    private func buildMetadata(imageAnalysis: String?) -> IntelligenceService.MediaMetadata {
+        let presenter = MediaMetadataPresenter(media: media)
+        return IntelligenceService.MediaMetadata(
+            filename: media.filename,
+            title: media.title,
+            caption: media.caption,
+            description: media.desc,
+            altText: media.alt,
+            fileType: presenter.fileType,
+            dimensions: presenter.dimensions,
+            imageAnalysis: imageAnalysis
+        )
+    }
+
+    /// Swaps the button for a spinner while generating and restores it afterwards.
+    private func setGenerating(_ isGenerating: Bool, button: UIBarButtonItem) {
+        if isGenerating {
+            let indicator = UIActivityIndicatorView()
+            indicator.startAnimating()
+            indicator.frame = CGRect(origin: .zero, size: CGSize(width: 24, height: 24))
+            button.customView = indicator
+        } else {
+            button.customView = nil
+        }
+        button.isEnabled = !isGenerating
+    }
+}
+
+private enum Strings {
+    static let generate = NSLocalizedString(
+        "media.textGeneration.generate",
+        value: "Generate",
+        comment: "Accessibility label for the generate button in media alt text/caption editor"
+    )
+}