 
 import jakarta.ejb.EJB;
 import jakarta.ejb.Stateless;
+import jakarta.ejb.TransactionAttribute;
+import jakarta.ejb.TransactionAttributeType;
 import jakarta.inject.Named;
 import jakarta.json.Json;
 import jakarta.json.JsonObjectBuilder;
@@ -43,6 +45,9 @@ public class SolrIndexServiceBean {
 
     private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName());
 
+    @EJB
+    private SolrIndexServiceBean self; // Self-injection to allow calling methods in new transactions (from other methods in this bean)
+
     @EJB
     DvObjectServiceBean dvObjectService;
     @EJB
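
For context on the self-injection added above: Jakarta EE applies container-managed transaction attributes through the bean's proxy, so a direct this.method() call from inside the bean bypasses them. Calling through an injected reference to the same bean is what makes REQUIRES_NEW take effect, which is why the batch loops below call self.indexDatasetBatchInNewTransaction(...) rather than invoking the method directly. A minimal sketch of the pattern, using a hypothetical ExampleServiceBean that is not part of this change:

    import jakarta.ejb.EJB;
    import jakarta.ejb.Stateless;
    import jakarta.ejb.TransactionAttribute;
    import jakarta.ejb.TransactionAttributeType;
    import java.util.List;

    @Stateless
    public class ExampleServiceBean {

        @EJB
        private ExampleServiceBean self; // container proxy to this same bean

        public void processAll(List<Long> ids) {
            for (Long id : ids) {
                // this.processOne(id) would join the caller's transaction;
                // calling through the proxy starts a fresh transaction per call.
                self.processOne(id);
            }
        }

        @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
        public void processOne(Long id) {
            // work here commits or rolls back independently of the caller
        }
    }

Each call through the proxy suspends the caller's transaction and commits its own, which keeps any single transaction (and its persistence context) small when iterating over many objects.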
@@ -317,142 +322,158 @@ private void persistToSolr(Collection<SolrInputDocument> docs) throws SolrServer
 
     /**
      * We use the database to determine direct children since there is no
-     * inheritance
+     * inheritance. This implementation uses smaller transactions to avoid memory issues.
      */
     public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) {
 
         if (definitionPoint == null) {
             logger.log(Level.WARNING, "Cannot perform indexPermissionsOnSelfAndChildren with a definitionPoint null");
             return null;
         }
-        int fileQueryMin = JvmSettings.MIN_FILES_TO_USE_PROXY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE);
-        List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>();
-        /**
-         * @todo Re-indexing the definition point itself seems to be necessary
-         * for revoke but not necessarily grant.
-         */
-
-        // We don't create a Solr "primary/content" doc for the root dataverse
-        // so don't create a Solr "permission" doc either.
-        final int[] counter = {0};
+        int fileQueryMin = JvmSettings.MIN_FILES_TO_USE_PROXY.lookupOptional(Integer.class).orElse(Integer.MAX_VALUE);
+        final int[] counter = { 0 };
         int numObjects = 0;
         long globalStartTime = System.currentTimeMillis();
-        if (definitionPoint.isInstanceofDataverse()) {
-            Dataverse selfDataverse = (Dataverse) definitionPoint;
-            if (!selfDataverse.equals(dataverseService.findRootDataverse())) {
+
+        // Handle the definition point itself in its own transaction
+        if (definitionPoint instanceof Dataverse dataverse) {
+            // We don't create a Solr "primary/content" doc for the root dataverse
+            // so don't create a Solr "permission" doc either.
+            if (!dataverse.equals(dataverseService.findRootDataverse())) {
                 indexPermissionsForOneDvObject(definitionPoint);
                 numObjects++;
             }
-            List<Dataset> directChildDatasetsOfDvDefPoint = datasetService.findByOwnerId(selfDataverse.getId());
-            for (Dataset dataset : directChildDatasetsOfDvDefPoint) {
+
+            // Process datasets in batches
+            List<Long> datasetIds = datasetService.findIdsByOwnerId(dataverse.getId());
+            int batchSize = 10; // Process 10 datasets per transaction
+
+            for (int i = 0; i < datasetIds.size(); i += batchSize) {
+                int endIndex = Math.min(i + batchSize, datasetIds.size());
+                List<Long> batchIds = datasetIds.subList(i, endIndex);
+
+                // Process this batch of datasets in a new transaction
+                self.indexDatasetBatchInNewTransaction(batchIds, counter, fileQueryMin);
+                numObjects += batchIds.size();
+
+                logger.fine("Permission reindexing: Processed batch " + (i / batchSize + 1) + " of " +
+                        (int) Math.ceil(datasetIds.size() / (double) batchSize) +
+                        " dataset batches for dataverse " + dataverse.getId());
+            }
+        } else if (definitionPoint instanceof Dataset dataset) {
+            // For a single dataset, index it here in the current transaction
+            indexPermissionsForOneDvObject(definitionPoint);
+            numObjects++;
+
+            // Process the dataset's files in a new transaction
+            self.indexDatasetFilesInNewTransaction(dataset.getId(), counter, fileQueryMin);
+        } else {
+            // For other types (like files), just index the object itself
+            indexPermissionsForOneDvObject(definitionPoint);
+            numObjects++;
+        }
+
+        logger.fine("Reindexed permissions for " + counter[0] + " files and " + numObjects +
+                " datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms");
+
+        return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint + ": " + numObjects);
+    }
+
+    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
+    public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[] fileCounter, int fileQueryMin) {
+        for (Long datasetId : datasetIds) {
+            Dataset dataset = datasetService.find(datasetId);
+            if (dataset != null) {
                 indexPermissionsForOneDvObject(dataset);
-                numObjects++;
 
+                // Process files for this dataset
                 Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
-                long startTime = System.currentTimeMillis();
+
                 for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
                     if (desiredCards.get(version.getVersionState())) {
-                        List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
-                        String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
-                        Long versionId = version.getId();
-                        for (FileMetadata fmd : version.getFileMetadatas()) {
-                            DataFileProxy fileProxy = new DataFileProxy(fmd);
-                            // Since reindexFilesInBatches() re-indexes a file in all versions needed, we should not send a file already in the released version twice
-                            filesToReindexAsBatch.add(fileProxy);
-                            counter[0]++;
-                            if (counter[0] % 100 == 0) {
-                                reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                                filesToReindexAsBatch.clear();
-                            }
-                            if (counter[0] % 1000 == 0) {
-                                logger.fine("Progress: " + counter[0] + "files permissions reindexed");
-                            }
-                        }
-
-                        // Re-index any remaining files in the datasetversion (so that verionId, etc. remain constants for all files in the batch)
-                        reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                        logger.info("Progress : dataset " + dataset.getId() + " permissions reindexed in " + (System.currentTimeMillis() - startTime) + " ms");
+                        processDatasetVersionFiles(version, fileCounter, fileQueryMin);
                     }
                 }
             }
-        } else if (definitionPoint.isInstanceofDataset()) {
-            indexPermissionsForOneDvObject(definitionPoint);
-            numObjects++;
-            // index files
-            Dataset dataset = (Dataset) definitionPoint;
+        }
+    }
+
+    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
+    public void indexDatasetFilesInNewTransaction(Long datasetId, final int[] fileCounter, int fileQueryMin) {
+        Dataset dataset = datasetService.find(datasetId);
+        if (dataset != null) {
             Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
             for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
                 if (desiredCards.get(version.getVersionState())) {
-                    List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
-                    String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
-                    Long versionId = version.getId();
-                    if (version.getFileMetadatas().size() > fileQueryMin) {
-                        // For large datasets, use a more efficient SQL query instead of loading all file metadata objects
-                        getDataFileInfoForPermissionIndexing(version.getId()).forEach(fileInfo -> {
-                            filesToReindexAsBatch.add(fileInfo);
-                            counter[0]++;
-
-                            if (counter[0] % 100 == 0) {
-                                long startTime = System.currentTimeMillis();
-                                reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                                filesToReindexAsBatch.clear();
-                                logger.fine("Progress: 100 file permissions at " + counter[0] + " files reindexed in " + (System.currentTimeMillis() - startTime) + " ms");
-                            }
-                        });
-                    } else {
-                        version.getFileMetadatas().stream()
-                                .forEach(fmd -> {
-                                    DataFileProxy fileProxy = new DataFileProxy(fmd);
-                                    filesToReindexAsBatch.add(fileProxy);
-                                    counter[0]++;
-                                    if (counter[0] % 100 == 0) {
-                                        long startTime = System.currentTimeMillis();
-                                        reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                                        filesToReindexAsBatch.clear();
-                                        logger.fine("Progress: 100 file permissions at " + counter[0] + "files reindexed in " + (System.currentTimeMillis() - startTime) + " ms");
-                                    }
-                                });
-                    }
-                    // Re-index any remaining files in the dataset version (versionId, etc. remain constants for all files in the batch)
+                    processDatasetVersionFiles(version, fileCounter, fileQueryMin);
+                }
+            }
+        }
+    }
+
+    private void processDatasetVersionFiles(DatasetVersion version,
+            final int[] fileCounter, int fileQueryMin) {
+        List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
+        String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
+        Long versionId = version.getId();
+        List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>();
+
+        // Process files in batches of 100
+        int batchSize = 100;
+
+        if (version.getFileMetadatas().size() > fileQueryMin) {
+            // For large datasets, use a more efficient SQL query
+            Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId());
+
+            // Process files in batches to avoid memory issues
+            fileStream.forEach(fileInfo -> {
+                filesToReindexAsBatch.add(fileInfo);
+                fileCounter[0]++;
+
+                if (filesToReindexAsBatch.size() >= batchSize) {
                     reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
                     filesToReindexAsBatch.clear();
                 }
+            });
+        } else {
+            // For smaller datasets, process files directly
+            for (FileMetadata fmd : version.getFileMetadatas()) {
+                DataFileProxy fileProxy = new DataFileProxy(fmd);
+                filesToReindexAsBatch.add(fileProxy);
+                fileCounter[0]++;
 
+                if (filesToReindexAsBatch.size() >= batchSize) {
+                    reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
+                    filesToReindexAsBatch.clear();
+                }
             }
-        } else {
-            indexPermissionsForOneDvObject(definitionPoint);
-            numObjects++;
         }
 
-        /**
-         * @todo Error handling? What to do with response?
-         *
-         * @todo Should update timestamps, probably, even thought these are files, see
-         * https://github.com/IQSS/dataverse/issues/2421
-         */
-        logger.fine("Reindexed permissions for " + counter[0] + " files and " + numObjects + "datasets/collections in " + (System.currentTimeMillis() - globalStartTime) + " ms");
-        return new IndexResponse("Number of dvObject permissions indexed for " + definitionPoint
-                + ": " + numObjects);
+        // Process any remaining files
+        if (!filesToReindexAsBatch.isEmpty()) {
+            reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
+        }
     }
 
-    private String reindexFilesInBatches(List<DataFileProxy> filesToReindexAsBatch, List<String> cachedPerms, Long versionId, String solrIdEnd) {
+    private void reindexFilesInBatches(List<DataFileProxy> filesToReindexAsBatch, List<String> cachedPerms, Long versionId, String solrIdEnd) {
         List<SolrInputDocument> docs = new ArrayList<>();
         try {
             // Assume all files have the same owner
             if (filesToReindexAsBatch.isEmpty()) {
443- return "No files to reindex" ;
463+ logger . warning ( "reindexFilesInBatches called incorrectly with an empty file list" ) ;
             }
 
-            for (DataFileProxy file : filesToReindexAsBatch) {
+            for (DataFileProxy file : filesToReindexAsBatch) {
 
-                DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, cachedPerms, versionId, solrIdEnd);
-                SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
-                docs.add(solrDoc);
-            }
+                DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(file, cachedPerms, versionId, solrIdEnd);
+                SolrInputDocument solrDoc = SearchUtil.createSolrDoc(fileSolrDoc);
+                docs.add(solrDoc);
+            }
             persistToSolr(docs);
-            return " " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents ";
+            logger.fine("Indexed " + filesToReindexAsBatch.size() + " files across " + docs.size() + " Solr documents");
         } catch (SolrServerException | IOException ex) {
-            return " tried to reindex " + filesToReindexAsBatch.size() + " files indexed across " + docs.size() + " Solr documents but caught exception: " + ex;
+            logger.log(Level.WARNING, "Failed to reindex " + filesToReindexAsBatch.size() +
+                    " files across " + docs.size() + " Solr documents", ex);
         }
     }
 
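
A note on the final int[] counters threaded through the new methods: Java lambdas can only capture effectively final local variables, so a one-element array gives the forEach callbacks a mutable counter that persists across batches and transactions (java.util.concurrent.atomic.AtomicInteger would work equally well). A minimal sketch of the idiom, with hypothetical names:

    import java.util.List;

    public class CounterSketch {
        public static void main(String[] args) {
            final int[] counter = { 0 };
            List<String> files = List.of("a.txt", "b.txt", "c.txt");
            files.forEach(f -> {
                counter[0]++; // allowed: the array reference itself stays effectively final
                if (counter[0] % 100 == 0) {
                    System.out.println("Progress: " + counter[0] + " files processed");
                }
            });
            System.out.println("Total: " + counter[0]);
        }
    }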