Commit 7c476e2

Merge pull request #11822 from QualitativeDataRepository/QDR-perm-reindex-in-mult-transactions
QDR: Multiple transactions for permission reindexing
2 parents: 6c17c9d + ef5065c

2 files changed: +117 / -95 lines
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Permission reindexing, which occurs, e.g., after a user has been granted a role on a collection, has been made faster and less memory intensive in this release.
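
The heart of this change is the EJB self-injection pattern visible in the Java diff below: a @TransactionAttribute(REQUIRES_NEW) annotation only takes effect when the call goes through the container proxy, so the bean injects a reference to itself and invokes the batch methods through that reference rather than through `this`. A minimal, self-contained sketch of the pattern (hypothetical bean and method names, not the Dataverse classes):

import java.util.List;

import jakarta.ejb.EJB;
import jakarta.ejb.Stateless;
import jakarta.ejb.TransactionAttribute;
import jakarta.ejb.TransactionAttributeType;

@Stateless
public class BatchWorkerBean {

    @EJB
    private BatchWorkerBean self; // container proxy to this same bean

    public void processAll(List<Long> ids, int batchSize) {
        for (int i = 0; i < ids.size(); i += batchSize) {
            List<Long> batch = ids.subList(i, Math.min(i + batchSize, ids.size()));
            // Calling through `self` (the proxy) starts a fresh transaction per
            // batch; calling processBatchInNewTransaction(batch) directly would
            // silently run inside the caller's transaction.
            self.processBatchInNewTransaction(batch);
        }
    }

    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
    public void processBatchInNewTransaction(List<Long> batch) {
        // Hypothetical per-batch work: load entities, index them, and let
        // them go out of scope so the persistence context stays small.
    }
}

Because each batch commits independently, entities loaded for one batch can be released before the next batch begins, which is what keeps the memory footprint bounded for large collections.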

src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java

Lines changed: 116 additions & 95 deletions
@@ -28,6 +28,8 @@
 
 import jakarta.ejb.EJB;
 import jakarta.ejb.Stateless;
+import jakarta.ejb.TransactionAttribute;
+import jakarta.ejb.TransactionAttributeType;
 import jakarta.inject.Named;
 import jakarta.json.Json;
 import jakarta.json.JsonObjectBuilder;
@@ -43,6 +45,9 @@ public class SolrIndexServiceBean {
 
     private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName());
 
+    @EJB
+    private SolrIndexServiceBean self; // Self-injection to allow calling methods in new transactions (from other methods in this bean)
+
     @EJB
     DvObjectServiceBean dvObjectService;
     @EJB
@@ -317,142 +322,158 @@ private void persistToSolr(Collection<SolrInputDocument> docs) throws SolrServer
 
     /**
      * We use the database to determine direct children since there is no
-     * inheritance
+     * inheritance. This implementation uses smaller transactions to avoid memory issues.
      */
     public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) {
 
         if (definitionPoint == null) {
             logger.log(Level.WARNING, "Cannot perform indexPermissionsOnSelfAndChildren with a definitionPoint null");
             return null;
         }
-        int fileQueryMin= JvmSettings.MIN_FILES_TO_USE_PROXY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE);
-        List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>();
-        /**
-         * @todo Re-indexing the definition point itself seems to be necessary
-         * for revoke but not necessarily grant.
-         */
-
-        // We don't create a Solr "primary/content" doc for the root dataverse
-        // so don't create a Solr "permission" doc either.
-        final int[] counter = {0};
+        int fileQueryMin = JvmSettings.MIN_FILES_TO_USE_PROXY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE);
+        final int[] counter = { 0 };
         int numObjects = 0;
         long globalStartTime = System.currentTimeMillis();
-        if (definitionPoint.isInstanceofDataverse()) {
-            Dataverse selfDataverse = (Dataverse) definitionPoint;
-            if (!selfDataverse.equals(dataverseService.findRootDataverse())) {
+
+        // Handle the definition point itself in its own transaction
+        if (definitionPoint instanceof Dataverse dataverse) {
+            // We don't create a Solr "primary/content" doc for the root dataverse
+            // so don't create a Solr "permission" doc either.
+            if (!dataverse.equals(dataverseService.findRootDataverse())) {
                 indexPermissionsForOneDvObject(definitionPoint);
                 numObjects++;
             }
-            List<Dataset> directChildDatasetsOfDvDefPoint = datasetService.findByOwnerId(selfDataverse.getId());
-            for (Dataset dataset : directChildDatasetsOfDvDefPoint) {
+
+            // Process datasets in batches
+            List<Long> datasetIds = datasetService.findIdsByOwnerId(dataverse.getId());
+            int batchSize = 10; // Process 10 datasets per transaction
+
+            for (int i = 0; i < datasetIds.size(); i += batchSize) {
+                int endIndex = Math.min(i + batchSize, datasetIds.size());
+                List<Long> batchIds = datasetIds.subList(i, endIndex);
+
+                // Process this batch of datasets in a new transaction
+                self.indexDatasetBatchInNewTransaction(batchIds, counter, fileQueryMin);
+                numObjects += batchIds.size();
+
+                logger.fine("Permission reindexing: Processed batch " + (i / batchSize + 1) + " of " +
+                        (int) Math.ceil(datasetIds.size() / (double) batchSize) +
+                        " dataset batches for dataverse " + dataverse.getId());
+            }
+        } else if (definitionPoint instanceof Dataset dataset) {
+            // For a single dataset, process it in its own transaction
+            indexPermissionsForOneDvObject(definitionPoint);
+            numObjects++;
+
+            // Process the dataset's files in a new transaction
+            self.indexDatasetFilesInNewTransaction(dataset.getId(), counter, fileQueryMin);
+        } else {
+            // For other types (like files), just index the object itself
+            indexPermissionsForOneDvObject(definitionPoint);
+            numObjects++;
+        }
+
+        logger.fine("Reindexed permissions for " + counter[0] + " files and " + numObjects +
+                " datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms");
+
+        return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects);
+    }
+
+    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
+    public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[] fileCounter, int fileQueryMin) {
+        for (Long datasetId : datasetIds) {
+            Dataset dataset = datasetService.find(datasetId);
+            if (dataset != null) {
                 indexPermissionsForOneDvObject(dataset);
-                numObjects++;
 
+                // Process files for this dataset
                 Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
-                long startTime = System.currentTimeMillis();
+
                 for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
                     if (desiredCards.get(version.getVersionState())) {
-                        List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
-                        String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
-                        Long versionId = version.getId();
-                        for (FileMetadata fmd : version.getFileMetadatas()) {
-                            DataFileProxy fileProxy = new DataFileProxy(fmd);
-                            // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice
-                            filesToReindexAsBatch.add(fileProxy);
-                            counter[0]++;
-                            if (counter[0] % 100 == 0) {
-                                reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                                filesToReindexAsBatch.clear();
-                            }
-                            if (counter[0] % 1000 == 0) {
-                                logger.fine("Progress: " + counter[0] + " file permissions reindexed");
-                            }
-                        }
-
-                        // Re-index any remaining files in the dataset version (so that versionId, etc. remain constants for all files in the batch)
-                        reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                        logger.info("Progress: dataset " + dataset.getId() + " permissions reindexed in " + (System.currentTimeMillis() - startTime) + " ms");
+                        processDatasetVersionFiles(version, fileCounter, fileQueryMin);
                     }
                 }
             }
-        } else if (definitionPoint.isInstanceofDataset()) {
-            indexPermissionsForOneDvObject(definitionPoint);
-            numObjects++;
-            // index files
-            Dataset dataset = (Dataset) definitionPoint;
+        }
+    }
+
+    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
+    public void indexDatasetFilesInNewTransaction(Long datasetId, final int[] fileCounter, int fileQueryMin) {
+        Dataset dataset = datasetService.find(datasetId);
+        if (dataset != null) {
             Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
             for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
                 if (desiredCards.get(version.getVersionState())) {
-                    List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
-                    String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
-                    Long versionId = version.getId();
-                    if (version.getFileMetadatas().size() > fileQueryMin) {
-                        // For large datasets, use a more efficient SQL query instead of loading all file metadata objects
-                        getDataFileInfoForPermissionIndexing(version.getId()).forEach(fileInfo -> {
-                            filesToReindexAsBatch.add(fileInfo);
-                            counter[0]++;
-
-                            if (counter[0] % 100 == 0) {
-                                long startTime = System.currentTimeMillis();
-                                reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                                filesToReindexAsBatch.clear();
-                                logger.fine("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms");
-                            }
-                        });
-                    } else {
-                        version.getFileMetadatas().stream()
-                                .forEach(fmd -> {
-                                    DataFileProxy fileProxy = new DataFileProxy(fmd);
-                                    filesToReindexAsBatch.add(fileProxy);
-                                    counter[0]++;
-                                    if (counter[0] % 100 == 0) {
-                                        long startTime = System.currentTimeMillis();
-                                        reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                                        filesToReindexAsBatch.clear();
-                                        logger.fine("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms");
-                                    }
-                                });
-                    }
-                    // Re-index any remaining files in the dataset version (versionId, etc. remain constants for all files in the batch)
+                    processDatasetVersionFiles(version, fileCounter, fileQueryMin);
+                }
+            }
+        }
+    }
+
+    private void processDatasetVersionFiles(DatasetVersion version,
+            final int[] fileCounter, int fileQueryMin) {
+        List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
+        String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
+        Long versionId = version.getId();
+        List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>();
+
+        // Process files in batches of 100
+        int batchSize = 100;
+
+        if (version.getFileMetadatas().size() > fileQueryMin) {
+            // For large datasets, use a more efficient SQL query
+            Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId());
+
+            // Process files in batches to avoid memory issues
+            fileStream.forEach(fileInfo -> {
+                filesToReindexAsBatch.add(fileInfo);
+                fileCounter[0]++;
+
+                if (filesToReindexAsBatch.size() >= batchSize) {
                     reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
                     filesToReindexAsBatch.clear();
                 }
+            });
+        } else {
+            // For smaller datasets, process files directly
+            for (FileMetadata fmd : version.getFileMetadatas()) {
+                DataFileProxy fileProxy = new DataFileProxy(fmd);
+                filesToReindexAsBatch.add(fileProxy);
+                fileCounter[0]++;
 
+                if (filesToReindexAsBatch.size() >= batchSize) {
+                    reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
+                    filesToReindexAsBatch.clear();
+                }
             }
-        } else {
-            indexPermissionsForOneDvObject(definitionPoint);
-            numObjects++;
         }
 
-        /**
-         * @todo Error handling? What to do with response?
-         *
-         * @todo Should update timestamps, probably, even though these are files, see
-         * https://github.com/IQSS/dataverse/issues/2421
-         */
-        logger.fine("Reindexed permissions for " + counter[0] + " files and " + numObjects + " datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms");
-        return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint
-                + ": " + numObjects);
+        // Process any remaining files
+        if (!filesToReindexAsBatch.isEmpty()) {
+            reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
+        }
     }
 
-    private String reindexFilesInBatches(List<DataFileProxy> filesToReindexAsBatch, List<String> cachedPerms, Long versionId, String solrIdEnd) {
+    private void reindexFilesInBatches(List<DataFileProxy> filesToReindexAsBatch, List<String> cachedPerms, Long versionId, String solrIdEnd) {
         List<SolrInputDocument> docs = new ArrayList<>();
         try {
             // Assume all files have the same owner
             if (filesToReindexAsBatch.isEmpty()) {
-                return "No files to reindex";
+                logger.warning("reindexFilesInBatches called incorrectly with an empty file list");
             }
 
-                for (DataFileProxy file : filesToReindexAsBatch) {
+            for (DataFileProxy file : filesToReindexAsBatch) {
 
-                    DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, cachedPerms, versionId, solrIdEnd);
-                    SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
-                    docs.add(solrDoc);
-                }
+                DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, cachedPerms, versionId, solrIdEnd);
+                SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
+                docs.add(solrDoc);
+            }
             persistToSolr(docs);
-            return " " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents ";
+            logger.fine("Indexed " + filesToReindexAsBatch.size() + " files across " + docs.size() + " Solr documents");
         } catch (SolrServerException | IOException ex) {
-            return " tried to reindex " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents but caught exception: " + ex;
+            logger.log(Level.WARNING, "Failed to reindex " + filesToReindexAsBatch.size() +
+                    " files across " + docs.size() + " Solr documents", ex);
         }
     }
 
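
One detail worth noting in processDatasetVersionFiles above: the running file count is threaded through the helpers as a one-element int[] because a Java lambda (here, the Stream.forEach callback) can only capture effectively final locals, so the code mutates the array's contents rather than reassigning a variable, and it flushes the accumulated batch to Solr whenever it reaches 100 entries, with a final flush for the remainder. A standalone sketch of that idiom, using hypothetical names and a println standing in for the Solr call:

import java.util.ArrayList;
import java.util.List;
import java.util.stream.LongStream;

public class BatchFlushSketch {
    public static void main(String[] args) {
        final int[] counter = { 0 };   // mutable holder; the array reference itself stays final
        List<Long> batch = new ArrayList<>();
        int batchSize = 100;

        LongStream.rangeClosed(1, 250).forEach(id -> {
            batch.add(id);
            counter[0]++;              // allowed: we mutate contents, not the captured variable
            if (batch.size() >= batchSize) {
                System.out.println("flushing " + batch.size() + " items at count " + counter[0]);
                batch.clear();         // keeps memory bounded between flushes
            }
        });

        // Flush whatever is left over, as the real code does after the loop
        if (!batch.isEmpty()) {
            System.out.println("flushing final " + batch.size() + " items");
        }
        System.out.println("total processed: " + counter[0]);
    }
}

An AtomicInteger would work equally well as the counter; the array holder is simply the lighter-weight choice when no cross-thread visibility is needed.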