-
Couldn't load subscription status.
- Fork 32
EM-6870 add mimetype update task #2920
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
193df78
f98fcab
457e0ca
5657542
d78d65f
34125ea
bcaba3d
a248e93
27fc77a
5c38535
030cda5
3e6137c
99a0b55
e903e69
bb6dd21
711307c
6e7392e
b1174fe
4bf93ee
c2c01c1
3e63e7f
4c88bda
c56d1f6
3f88613
8fdb101
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,7 +2,7 @@ description: Helm chart for the HMCTS CDM Document Management APO | |
| apiVersion: v2 | ||
| name: dm-store | ||
| home: https://github.com/hmcts/document-management-store-app | ||
| version: 2.3.5 | ||
| version: 2.3.6 | ||
| maintainers: | ||
| - name: HMCTS Evidence Management Team | ||
| email: [email protected] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| package uk.gov.hmcts.dm.config.batch; | ||
|
|
||
| import org.apache.commons.collections4.CollectionUtils; | ||
| import org.apache.commons.lang3.time.StopWatch; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
| import org.springframework.beans.factory.annotation.Value; | ||
| import org.springframework.data.domain.PageRequest; | ||
| import org.springframework.data.domain.Pageable; | ||
| import org.springframework.stereotype.Service; | ||
| import uk.gov.hmcts.dm.repository.DocumentContentVersionRepository; | ||
| import uk.gov.hmcts.dm.service.DocumentContentVersionService; | ||
|
|
||
| import java.util.List; | ||
| import java.util.UUID; | ||
| import java.util.concurrent.ExecutorService; | ||
| import java.util.concurrent.Executors; | ||
| import java.util.concurrent.TimeUnit; | ||
|
|
||
| /** | ||
| * This task periodically checks for Document Content Versions where the mimeTypeUpdated flag is false. | ||
| * It will then read the blob from storage, detect the correct MIME type, and update the database record. | ||
| */ | ||
| @Service | ||
| public class MimeTypeUpdateTask implements Runnable { | ||
|
|
||
| private static final Logger log = LoggerFactory.getLogger(MimeTypeUpdateTask.class); | ||
|
|
||
| private final DocumentContentVersionService documentContentVersionService; | ||
| private final DocumentContentVersionRepository documentContentVersionRepository; | ||
|
|
||
| @Value("${spring.batch.mimeTypeUpdate.batchSize}") | ||
| private int batchSize; | ||
|
|
||
| @Value("${spring.batch.mimeTypeUpdate.noOfIterations}") | ||
| private int noOfIterations; | ||
|
|
||
| @Value("${spring.batch.mimeTypeUpdate.threadLimit}") | ||
| private int threadLimit; | ||
|
|
||
| public MimeTypeUpdateTask(DocumentContentVersionService documentContentVersionService, | ||
| DocumentContentVersionRepository documentContentVersionRepository) { | ||
| this.documentContentVersionService = documentContentVersionService; | ||
| this.documentContentVersionRepository = documentContentVersionRepository; | ||
| } | ||
|
|
||
| @Override | ||
| public void run() { | ||
| log.info("Started MIME Type Update job."); | ||
| StopWatch stopWatch = new StopWatch(); | ||
| stopWatch.start(); | ||
|
|
||
| try { | ||
| log.info("threadLimit: {}, noOfIterations: {}, batchSize: {}", threadLimit, noOfIterations, batchSize); | ||
|
|
||
| for (int i = 0; i < noOfIterations; i++) { | ||
| if (!getAndUpdateMimeTypes(i)) { | ||
| // Stop iterating if a run finds no records to process | ||
| log.info("No records found in iteration {}. Stopping job.", i); | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| } catch (Exception e) { | ||
| log.error("MIME Type Update job failed with Error message: {}", e.getMessage(), e); | ||
| } finally { | ||
| stopWatch.stop(); | ||
| log.info("MIME Type Update job finished and took {} ms", stopWatch.getDuration().toMillis()); | ||
| } | ||
| } | ||
|
|
||
| private boolean getAndUpdateMimeTypes(int iteration) { | ||
| StopWatch iterationStopWatch = new StopWatch(); | ||
| iterationStopWatch.start(); | ||
|
|
||
| Pageable pageable = PageRequest.of(0, batchSize); | ||
|
|
||
| List<UUID> documentIds = documentContentVersionRepository | ||
| .findDocumentContentVersionIdsForMimeTypeUpdate(pageable); | ||
|
|
||
| if (CollectionUtils.isEmpty(documentIds)) { | ||
| iterationStopWatch.stop(); | ||
| log.info("Iteration {}: No records found for MIME type update. Total time: {} ms", | ||
| iteration, iterationStopWatch.getDuration().toMillis()); | ||
| return false; // Indicates no records were found | ||
| } | ||
|
|
||
| log.info("Iteration {}: Found {} records to process for MIME type update.", iteration, documentIds.size()); | ||
|
|
||
| ExecutorService executorService = Executors.newFixedThreadPool(threadLimit); | ||
| try { | ||
| documentIds.forEach( | ||
| id -> executorService.submit(() -> documentContentVersionService.updateMimeType(id)) | ||
| ); | ||
| } finally { | ||
| executorService.shutdown(); | ||
| } | ||
|
|
||
| try { | ||
| // Wait for all tasks to complete | ||
| executorService.awaitTermination(1, TimeUnit.HOURS); | ||
| } catch (InterruptedException e) { | ||
| Thread.currentThread().interrupt(); | ||
| log.error("MIME type update job was interrupted while waiting for tasks to complete.", e); | ||
| } | ||
|
|
||
| iterationStopWatch.stop(); | ||
| log.info("Time taken to complete iteration number: {} was : {} ms", iteration, | ||
| iterationStopWatch.getDuration().toMillis()); | ||
| return true; // Indicates records were processed | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,10 @@ | ||
| package uk.gov.hmcts.dm.service; | ||
|
|
||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
| import org.springframework.beans.factory.annotation.Autowired; | ||
| import org.springframework.stereotype.Service; | ||
| import org.springframework.transaction.annotation.Propagation; | ||
| import org.springframework.transaction.annotation.Transactional; | ||
| import uk.gov.hmcts.dm.domain.DocumentContentVersion; | ||
| import uk.gov.hmcts.dm.domain.StoredDocument; | ||
|
|
@@ -11,29 +14,58 @@ | |
| import java.util.Optional; | ||
| import java.util.UUID; | ||
|
|
||
| @Transactional | ||
| @Service | ||
| public class DocumentContentVersionService { | ||
|
|
||
| private final DocumentContentVersionRepository documentContentVersionRepository; | ||
| private static final Logger log = LoggerFactory.getLogger(DocumentContentVersionService.class); | ||
|
|
||
| private final DocumentContentVersionRepository documentContentVersionRepository; | ||
| private final StoredDocumentRepository storedDocumentRepository; | ||
| private final MimeTypeDetectionService mimeTypeDetectionService; // New dependency | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This java comment can be removed |
||
|
|
||
| @Autowired | ||
| public DocumentContentVersionService(DocumentContentVersionRepository documentContentVersionRepository, | ||
| StoredDocumentRepository storedDocumentRepository) { | ||
| StoredDocumentRepository storedDocumentRepository, | ||
| MimeTypeDetectionService mimeTypeDetectionService) { // Injected here | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This java comment can be removed |
||
| this.documentContentVersionRepository = documentContentVersionRepository; | ||
| this.storedDocumentRepository = storedDocumentRepository; | ||
| this.mimeTypeDetectionService = mimeTypeDetectionService; | ||
| } | ||
|
|
||
| public Optional<DocumentContentVersion> findById(UUID id) { | ||
| return documentContentVersionRepository.findById(id); | ||
| } | ||
|
|
||
| @Transactional | ||
| public Optional<DocumentContentVersion> findMostRecentDocumentContentVersionByStoredDocumentId(UUID id) { | ||
| return storedDocumentRepository | ||
| .findByIdAndDeleted(id, false) | ||
| .map(StoredDocument::getMostRecentDocumentContentVersion); | ||
| .findByIdAndDeleted(id, false) | ||
| .map(StoredDocument::getMostRecentDocumentContentVersion); | ||
| } | ||
|
|
||
| @Transactional(propagation = Propagation.REQUIRES_NEW) | ||
| public void updateMimeType(UUID documentVersionId) { | ||
| log.info("Processing MIME type update for ID: {}", documentVersionId); | ||
|
|
||
| String detectedMimeType = mimeTypeDetectionService.detectMimeType(documentVersionId); | ||
|
|
||
| if (detectedMimeType == null) { | ||
| log.warn( | ||
| "Could not detect MIME type for {}. Marking as processed to prevent retries.", | ||
| documentVersionId | ||
| ); | ||
| documentContentVersionRepository.markMimeTypeUpdated(documentVersionId); | ||
| return; | ||
| } | ||
| log.info("Updating MIME type for document {}. New: [{}].", | ||
| documentVersionId, detectedMimeType); | ||
|
|
||
| documentContentVersionRepository.updateMimeType(documentVersionId, detectedMimeType); | ||
|
|
||
| log.info("Updated documentVersion id:{}, mimeType:{}", | ||
| documentVersionId, | ||
| detectedMimeType | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| package uk.gov.hmcts.dm.service; | ||
|
|
||
| import org.apache.commons.io.input.BoundedInputStream; | ||
| import org.apache.tika.Tika; | ||
| import org.apache.tika.metadata.Metadata; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
| import org.springframework.stereotype.Service; | ||
|
|
||
| import java.io.IOException; | ||
| import java.io.InputStream; | ||
| import java.util.UUID; | ||
|
|
||
| /** | ||
| * Service to detect the MIME type of a document stored in blob storage. | ||
| */ | ||
| @Service | ||
| public class MimeTypeDetectionService { | ||
|
|
||
| private static final Logger log = LoggerFactory.getLogger(MimeTypeDetectionService.class); | ||
| private static final int MAX_BYTES_TO_READ = 2 * 1024 * 1024; // 2 MB is sufficient for Tika to detect type | ||
|
|
||
| private final BlobStorageReadService blobStorageReadService; | ||
|
|
||
| public MimeTypeDetectionService(BlobStorageReadService blobStorageReadService) { | ||
| this.blobStorageReadService = blobStorageReadService; | ||
| } | ||
|
|
||
| /** | ||
| * Detects the MIME type of a document version by reading the first few bytes from its blob. | ||
| * | ||
| * @param documentVersionId The UUID of the document version. | ||
| * @return The detected MIME type as a String, or null if detection fails. | ||
| */ | ||
| public String detectMimeType(UUID documentVersionId) { | ||
| log.debug("Attempting to detect MIME type for document version ID: {}", documentVersionId); | ||
| try (InputStream inputStream = blobStorageReadService.getInputStream(documentVersionId); | ||
| BoundedInputStream limitedStream = BoundedInputStream.builder() | ||
| .setInputStream(inputStream) | ||
| .setMaxCount(MAX_BYTES_TO_READ) | ||
| .get()) { | ||
|
|
||
| Tika tika = new Tika(); | ||
| Metadata metadata = new Metadata(); | ||
| String mimeType = tika.detect(limitedStream, metadata); | ||
| log.info("Detected MIME type for {} as: {}", documentVersionId, mimeType); | ||
| return mimeType; | ||
|
|
||
| } catch (IOException e) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing a unit test for this catch block |
||
| log.error("Failed to read blob stream for MIME type detection on document version {}", | ||
| documentVersionId); | ||
| return null; | ||
| } catch (Exception e) { | ||
| log.error("An unexpected error occurred during MIME type detection for document version {}", | ||
| documentVersionId); | ||
| return null; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can update in batch as well, similar to
document-management-store-app/src/main/java/uk/gov/hmcts/dm/config/batch/CaseDocumentsDeletionTask.java
Line 100 in 6b9cd5d