diff --git a/charts/dm-store/Chart.yaml b/charts/dm-store/Chart.yaml index 566b7ea2a..e527c9c9b 100644 --- a/charts/dm-store/Chart.yaml +++ b/charts/dm-store/Chart.yaml @@ -2,7 +2,7 @@ description: Helm chart for the HMCTS CDM Document Management APO apiVersion: v2 name: dm-store home: https://github.com/hmcts/document-management-store-app -version: 2.3.5 +version: 2.3.6 maintainers: - name: HMCTS Evidence Management Team email: EvidenceManagement@HMCTS.NET diff --git a/charts/dm-store/values.yaml b/charts/dm-store/values.yaml index b194d1ab1..f9ee3ca95 100644 --- a/charts/dm-store/values.yaml +++ b/charts/dm-store/values.yaml @@ -82,7 +82,10 @@ java: CASE_DOCUMENTS_DELETION_NO_OF_ITERATIONS: 1 CASE_DOCUMENTS_DELETION_BATCH_SIZE: 1 CASE_DOCUMENTS_DELETION_SERVICE_NAME: ccd_case_disposer - + # mimetype update job + MIME_TYPE_UPDATE_THREAD_LIMIT: 1 + MIME_TYPE_UPDATE_NO_OF_ITERATIONS: 1 + MIME_TYPE_UPDATE_BATCH_SIZE: 300 blobstorage: resourceGroup: dm-store-aso-preview-rg teamName: "CCD" diff --git a/src/main/java/uk/gov/hmcts/dm/config/batch/MimeTypeUpdateTask.java b/src/main/java/uk/gov/hmcts/dm/config/batch/MimeTypeUpdateTask.java new file mode 100644 index 000000000..4caae74d8 --- /dev/null +++ b/src/main/java/uk/gov/hmcts/dm/config/batch/MimeTypeUpdateTask.java @@ -0,0 +1,106 @@ +package uk.gov.hmcts.dm.config.batch; + +import com.microsoft.applicationinsights.core.dependencies.google.common.collect.Lists; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.time.StopWatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.data.domain.PageRequest; +import org.springframework.data.domain.Pageable; +import org.springframework.stereotype.Service; +import uk.gov.hmcts.dm.repository.DocumentContentVersionRepository; +import uk.gov.hmcts.dm.service.DocumentContentVersionService; + +import java.util.List; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * This task periodically checks for Document Content Versions where the mimeTypeUpdated flag is false. + * It will then read the blob from storage, detect the correct MIME type, and update the database record. + */ +@Service +public class MimeTypeUpdateTask implements Runnable { + + private static final Logger log = LoggerFactory.getLogger(MimeTypeUpdateTask.class); + + private final DocumentContentVersionService documentContentVersionService; + private final DocumentContentVersionRepository documentContentVersionRepository; + + @Value("${spring.batch.mimeTypeUpdate.batchSize}") + private int batchSize; + + @Value("${spring.batch.mimeTypeUpdate.noOfIterations}") + private int noOfIterations; + + @Value("${spring.batch.mimeTypeUpdate.threadLimit}") + private int threadLimit; + + public MimeTypeUpdateTask(DocumentContentVersionService documentContentVersionService, + DocumentContentVersionRepository documentContentVersionRepository) { + this.documentContentVersionService = documentContentVersionService; + this.documentContentVersionRepository = documentContentVersionRepository; + } + + @Override + public void run() { + log.info("Started MIME Type Update job."); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + try { + log.info("threadLimit: {}, noOfIterations: {}, batchSize: {}", threadLimit, noOfIterations, batchSize); + + for (int i = 0; i < noOfIterations; i++) { + if (!getAndUpdateMimeTypes(i)) { + // Stop iterating if a run finds no records to process + log.info("No records found in iteration {}. Stopping job.", i); + break; + } + } + + } catch (Exception e) { + log.error("MIME Type Update job failed with Error message: {}", e.getMessage(), e); + } finally { + stopWatch.stop(); + log.info("MIME Type Update job finished and took {} ms", stopWatch.getDuration().toMillis()); + } + } + + private boolean getAndUpdateMimeTypes(int iteration) { + StopWatch iterationStopWatch = new StopWatch(); + iterationStopWatch.start(); + + Pageable pageable = PageRequest.of(0, batchSize); + + List documentIds = documentContentVersionRepository + .findDocumentContentVersionIdsForMimeTypeUpdate(pageable); + + if (CollectionUtils.isEmpty(documentIds)) { + iterationStopWatch.stop(); + log.info("Iteration {}: No records found for MIME type update. Total time: {} ms", + iteration, iterationStopWatch.getDuration().toMillis()); + return false; // Indicates no records were found + } + + log.info("Iteration {}: Found {} records to process for MIME type update.", iteration, documentIds.size()); + + int batchCommitSize = 500; // Define the batch size for committing to the DB + List> batches = Lists.partition(documentIds, batchCommitSize); + + try (ExecutorService executorService = Executors.newFixedThreadPool(threadLimit)) { + batches.forEach( + batch -> executorService.submit(() -> + documentContentVersionService.updateMimeType(batch)) + ); + } + + + iterationStopWatch.stop(); + log.info("Time taken to complete iteration number: {} was : {} ms", iteration, + iterationStopWatch.getDuration().toMillis()); + return true; // Indicates records were processed + } +} diff --git a/src/main/java/uk/gov/hmcts/dm/repository/DocumentContentVersionRepository.java b/src/main/java/uk/gov/hmcts/dm/repository/DocumentContentVersionRepository.java index 2448a89f2..d5e85db73 100644 --- a/src/main/java/uk/gov/hmcts/dm/repository/DocumentContentVersionRepository.java +++ b/src/main/java/uk/gov/hmcts/dm/repository/DocumentContentVersionRepository.java @@ -1,5 +1,6 @@ package uk.gov.hmcts.dm.repository; +import org.springframework.data.domain.Pageable; import org.springframework.data.jpa.repository.Modifying; import org.springframework.data.jpa.repository.Query; import org.springframework.data.repository.CrudRepository; @@ -29,4 +30,18 @@ void updateContentUriAndContentCheckSum(@Param("id") UUID id, @Query("select dcv from DocumentContentVersion dcv where dcv.storedDocument.id = :storedDocumentId") List findAllByStoredDocumentId(@Param("storedDocumentId") UUID storedDocumentId); + @Query(""" + SELECT dcv.id FROM DocumentContentVersion dcv + WHERE dcv.mimeTypeUpdated = false + ORDER BY dcv.createdOn DESC""") + List findDocumentContentVersionIdsForMimeTypeUpdate(Pageable pageable); + + + @Modifying + @Query("UPDATE DocumentContentVersion d SET d.mimeType = :mimeType, d.mimeTypeUpdated = true where d.id = :id") + void updateMimeType(@Param("id") UUID id, @Param("mimeType") String mimeType); + + @Modifying + @Query("UPDATE DocumentContentVersion d SET d.mimeTypeUpdated = true WHERE d.id = :id") + void markMimeTypeUpdated(@Param("id") UUID id); } diff --git a/src/main/java/uk/gov/hmcts/dm/service/BlobStorageReadService.java b/src/main/java/uk/gov/hmcts/dm/service/BlobStorageReadService.java index 61c0ed82e..65dc3d92a 100644 --- a/src/main/java/uk/gov/hmcts/dm/service/BlobStorageReadService.java +++ b/src/main/java/uk/gov/hmcts/dm/service/BlobStorageReadService.java @@ -18,6 +18,7 @@ import uk.gov.hmcts.dm.exception.InvalidRangeRequestException; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.util.Enumeration; import java.util.Objects; @@ -42,6 +43,18 @@ public BlobStorageReadService(BlobContainerClient cloudBlobContainer, ToggleConf this.toggleConfiguration = toggleConfiguration; } + + /** + * Gets an InputStream for a given document/blob UUID. + * This is useful for internal processing like MIME type detection. + * + * @param documentId The UUID of the document content version. + * @return An InputStream of the blob's content. + */ + public InputStream getInputStream(UUID documentId) { + return blockBlobClient(documentId.toString()).openInputStream(); + } + /** * Streams the content of a document from blob storage. Processes a Range request when possible */ diff --git a/src/main/java/uk/gov/hmcts/dm/service/DocumentContentVersionService.java b/src/main/java/uk/gov/hmcts/dm/service/DocumentContentVersionService.java index 61b1780f0..14e5b0fa6 100644 --- a/src/main/java/uk/gov/hmcts/dm/service/DocumentContentVersionService.java +++ b/src/main/java/uk/gov/hmcts/dm/service/DocumentContentVersionService.java @@ -1,39 +1,75 @@ package uk.gov.hmcts.dm.service; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; import uk.gov.hmcts.dm.domain.DocumentContentVersion; import uk.gov.hmcts.dm.domain.StoredDocument; import uk.gov.hmcts.dm.repository.DocumentContentVersionRepository; import uk.gov.hmcts.dm.repository.StoredDocumentRepository; +import java.util.List; import java.util.Optional; import java.util.UUID; -@Transactional @Service public class DocumentContentVersionService { - private final DocumentContentVersionRepository documentContentVersionRepository; + private static final Logger log = LoggerFactory.getLogger(DocumentContentVersionService.class); + private final DocumentContentVersionRepository documentContentVersionRepository; private final StoredDocumentRepository storedDocumentRepository; + private final MimeTypeDetectionService mimeTypeDetectionService; // New dependency @Autowired public DocumentContentVersionService(DocumentContentVersionRepository documentContentVersionRepository, - StoredDocumentRepository storedDocumentRepository) { + StoredDocumentRepository storedDocumentRepository, + MimeTypeDetectionService mimeTypeDetectionService) { // Injected here this.documentContentVersionRepository = documentContentVersionRepository; this.storedDocumentRepository = storedDocumentRepository; + this.mimeTypeDetectionService = mimeTypeDetectionService; } public Optional findById(UUID id) { return documentContentVersionRepository.findById(id); } + @Transactional public Optional findMostRecentDocumentContentVersionByStoredDocumentId(UUID id) { return storedDocumentRepository - .findByIdAndDeleted(id, false) - .map(StoredDocument::getMostRecentDocumentContentVersion); + .findByIdAndDeleted(id, false) + .map(StoredDocument::getMostRecentDocumentContentVersion); } + @Transactional(propagation = Propagation.REQUIRES_NEW) + public void updateMimeType(List documentVersionIdList) { + + for (UUID documentVersionId : documentVersionIdList) { + log.info("Processing MIME type update for ID: {}", documentVersionId); + + String detectedMimeType = mimeTypeDetectionService.detectMimeType(documentVersionId); + + if (detectedMimeType == null) { + log.warn( + "Could not detect MIME type for {}. Marking as processed to prevent retries.", + documentVersionId + ); + documentContentVersionRepository.markMimeTypeUpdated(documentVersionId); + continue; + } + log.info("Updating MIME type for document {}. New: [{}].", + documentVersionId, detectedMimeType); + + documentContentVersionRepository.updateMimeType(documentVersionId, detectedMimeType); + + log.info("Updated documentVersion id:{}, mimeType:{}", + documentVersionId, + detectedMimeType + ); + } + } } + diff --git a/src/main/java/uk/gov/hmcts/dm/service/MimeTypeDetectionService.java b/src/main/java/uk/gov/hmcts/dm/service/MimeTypeDetectionService.java new file mode 100644 index 000000000..56a58c531 --- /dev/null +++ b/src/main/java/uk/gov/hmcts/dm/service/MimeTypeDetectionService.java @@ -0,0 +1,59 @@ +package uk.gov.hmcts.dm.service; + +import org.apache.commons.io.input.BoundedInputStream; +import org.apache.tika.Tika; +import org.apache.tika.metadata.Metadata; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.io.InputStream; +import java.util.UUID; + +/** + * Service to detect the MIME type of a document stored in blob storage. + */ +@Service +public class MimeTypeDetectionService { + + private static final Logger log = LoggerFactory.getLogger(MimeTypeDetectionService.class); + private static final int MAX_BYTES_TO_READ = 2 * 1024 * 1024; // 2 MB is sufficient for Tika to detect type + + private final BlobStorageReadService blobStorageReadService; + + public MimeTypeDetectionService(BlobStorageReadService blobStorageReadService) { + this.blobStorageReadService = blobStorageReadService; + } + + /** + * Detects the MIME type of a document version by reading the first few bytes from its blob. + * + * @param documentVersionId The UUID of the document version. + * @return The detected MIME type as a String, or null if detection fails. + */ + public String detectMimeType(UUID documentVersionId) { + log.debug("Attempting to detect MIME type for document version ID: {}", documentVersionId); + try (InputStream inputStream = blobStorageReadService.getInputStream(documentVersionId); + BoundedInputStream limitedStream = BoundedInputStream.builder() + .setInputStream(inputStream) + .setMaxCount(MAX_BYTES_TO_READ) + .get()) { + + Tika tika = new Tika(); + Metadata metadata = new Metadata(); + String mimeType = tika.detect(limitedStream, metadata); + log.info("Detected MIME type for {} as: {}", documentVersionId, mimeType); + return mimeType; + + } catch (IOException e) { + log.error("Failed to read blob stream for MIME type detection on document version {}", + documentVersionId); + return null; + } catch (Exception e) { + log.error("An unexpected error occurred during MIME type detection for document version {}", + documentVersionId); + return null; + } + } +} diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml index a31f38355..2b605136d 100755 --- a/src/main/resources/application.yaml +++ b/src/main/resources/application.yaml @@ -66,7 +66,10 @@ spring: noOfIterations: ${CASE_DOCUMENTS_DELETION_NO_OF_ITERATIONS:1} batchSize: ${CASE_DOCUMENTS_DELETION_BATCH_SIZE:10} serviceName: ${CASE_DOCUMENTS_DELETION_SERVICE_NAME:ccd_case_disposer} - + mimeTypeUpdate: + threadLimit: ${MIME_TYPE_UPDATE_THREAD_LIMIT:1} + noOfIterations: ${MIME_TYPE_UPDATE_NO_OF_ITERATIONS:1} + batchSize: ${MIME_TYPE_UPDATE_BATCH_SIZE:300} # run cleanup every 1 hour (since a single execution should not run longer than 1 hour) historicExecutionsRetentionMilliseconds: ${HISTORIC_EXECUTIONS_RETENTION_MILLISECONDS:3600000} documentMetaDataUpdateMilliseconds: ${DOCUMENT_METADATA_UPDATE_MILLISECONDS:10000} @@ -113,7 +116,6 @@ logging: org.springframework.jdbc.core: ${LOG_LEVEL_SPRING_JDBC:DEBUG} org.hibernate.SQL: ${LOG_LEVEL_HIBERNATE:DEBUG} org.hibernate.type.descriptor.sql.BasicBinder: ${LOG_LEVEL_HIBERNATE:DEBUG} - azure: app_insights_key: ${APPINSIGHTS_INSTRUMENTATIONKEY:true} application-insights: @@ -232,4 +234,3 @@ dbMigration: # When true, the app will run DB migration on startup. # Otherwise, it will just check if all migrations have been applied (and fail to start if not). runOnStartup: ${RUN_DB_MIGRATION_ON_STARTUP:true} - diff --git a/src/test/java/uk/gov/hmcts/dm/service/DocumentContentVersionServiceTests.java b/src/test/java/uk/gov/hmcts/dm/service/DocumentContentVersionServiceTests.java index 32ecf2cc0..fce938ab7 100644 --- a/src/test/java/uk/gov/hmcts/dm/service/DocumentContentVersionServiceTests.java +++ b/src/test/java/uk/gov/hmcts/dm/service/DocumentContentVersionServiceTests.java @@ -4,29 +4,38 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.InjectMocks; import org.mockito.Mock; -import org.springframework.test.context.junit.jupiter.SpringExtension; +import org.mockito.junit.jupiter.MockitoExtension; import uk.gov.hmcts.dm.componenttests.TestUtil; import uk.gov.hmcts.dm.domain.DocumentContentVersion; import uk.gov.hmcts.dm.repository.DocumentContentVersionRepository; import uk.gov.hmcts.dm.repository.StoredDocumentRepository; +import java.util.List; import java.util.Optional; +import java.util.UUID; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -@ExtendWith(SpringExtension.class) +@ExtendWith(MockitoExtension.class) class DocumentContentVersionServiceTests { @Mock - DocumentContentVersionRepository documentContentVersionRepository; + private DocumentContentVersionRepository documentContentVersionRepository; @Mock - StoredDocumentRepository storedDocumentRepository; + private StoredDocumentRepository storedDocumentRepository; + + @Mock + private MimeTypeDetectionService mimeTypeDetectionService; @InjectMocks - DocumentContentVersionService documentContentVersionService; + private DocumentContentVersionService documentContentVersionService; @Test void testFindOne() { @@ -51,4 +60,39 @@ void testMostRecentFileContentVersionByStoredFileIdOnNullStoredFile() { documentContentVersionService.findMostRecentDocumentContentVersionByStoredDocumentId(TestUtil.RANDOM_UUID)); } + @Test + void updateMimeType_shouldUpdateMimeTypeWhenDetectedSuccessfully() { + List documentVersionIdList = List.of(UUID.randomUUID()); + String detectedMimeType = "application/pdf"; + + when(mimeTypeDetectionService.detectMimeType(any())).thenReturn(detectedMimeType); + + documentContentVersionService.updateMimeType(documentVersionIdList); + + verify(documentContentVersionRepository).updateMimeType(any(), eq(detectedMimeType)); + verify(documentContentVersionRepository, never()).markMimeTypeUpdated(any()); + } + + @Test + void updateMimeType_shouldMarkAsUpdatedWhenDetectionFails() { + List documentVersionIdList = List.of(UUID.randomUUID()); + + when(mimeTypeDetectionService.detectMimeType(any())).thenReturn(null); + + documentContentVersionService.updateMimeType(documentVersionIdList); + + verify(documentContentVersionRepository).markMimeTypeUpdated(any()); + verify(documentContentVersionRepository, never()).updateMimeType(any(), any()); + } + + @Test + void updateMimeType_shouldDoNothingIfDocumentNotFound() { + List documentVersionIdList = List.of(UUID.randomUUID()); + when(mimeTypeDetectionService.detectMimeType(any())).thenReturn(null); + + documentContentVersionService.updateMimeType(documentVersionIdList); + + verify(mimeTypeDetectionService).detectMimeType(any()); + verify(documentContentVersionRepository, never()).updateMimeType(any(), any()); + } } diff --git a/src/test/java/uk/gov/hmcts/dm/service/MimeTypeDetectionServiceTest.java b/src/test/java/uk/gov/hmcts/dm/service/MimeTypeDetectionServiceTest.java new file mode 100644 index 000000000..f9caac3fb --- /dev/null +++ b/src/test/java/uk/gov/hmcts/dm/service/MimeTypeDetectionServiceTest.java @@ -0,0 +1,95 @@ +package uk.gov.hmcts.dm.service; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.when; + +@ExtendWith(MockitoExtension.class) +class MimeTypeDetectionServiceTest { + + @Mock + private BlobStorageReadService blobStorageReadService; + + @InjectMocks + private MimeTypeDetectionService mimeTypeDetectionService; + + private UUID documentVersionId; + + @BeforeEach + void setUp() { + documentVersionId = UUID.randomUUID(); + } + + @Test + void testDetectMimeTypeSuccess() { + // Given + String content = "this is some plain text content"; + InputStream inputStream = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)); + when(blobStorageReadService.getInputStream(documentVersionId)).thenReturn(inputStream); + + // When + String detectedMimeType = mimeTypeDetectionService.detectMimeType(documentVersionId); + + // Then + assertEquals("text/plain", detectedMimeType); + } + + @Test + void testDetectMimeTypeHandlesGenericException() { + // Given + when(blobStorageReadService.getInputStream(documentVersionId)) + .thenThrow(new RuntimeException("Unexpected error")); + + // When + String detectedMimeType = mimeTypeDetectionService.detectMimeType(documentVersionId); + + // Then + assertNull(detectedMimeType); + } + + @Test + void testDetectMimeTypeHandlesIOException() { + InputStream throwingStream = new InputStream() { + @Override + public int read() throws IOException { + throw new IOException("IO error"); + } + }; + + when(blobStorageReadService.getInputStream(documentVersionId)).thenReturn(throwingStream); + + String detectedMimeType = mimeTypeDetectionService.detectMimeType(documentVersionId); + + assertNull(detectedMimeType); + } + + @Test + void testDetectMimeTypeForPdf() { + // Given + // A minimal PDF file header to simulate PDF content + byte[] pdfBytes = "%PDF-1.4".getBytes(StandardCharsets.UTF_8); + InputStream inputStream = new ByteArrayInputStream(pdfBytes); + when(blobStorageReadService.getInputStream(any(UUID.class))).thenReturn(inputStream); + + // When + String detectedMimeType = mimeTypeDetectionService.detectMimeType(UUID.randomUUID()); + + // Then + assertEquals("application/pdf", detectedMimeType); + } +} +