Skip to content

Commit f91e75d

Browse files
authored
Merge pull request #11374 from GlobalDataverseCommunityConsortium/solr-index-improvements
Solr Index Improvements
2 parents bed5b30 + 359c153 commit f91e75d

File tree

19 files changed

+594
-515
lines changed

19 files changed

+594
-515
lines changed

conf/solr/solrconfig.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@
238238
have some sort of hard autoCommit to limit the log size.
239239
-->
240240
<autoCommit>
241-
<maxTime>${solr.autoCommit.maxTime:30000}</maxTime>
241+
<maxTime>${solr.autoCommit.maxTime:300000}</maxTime>
242242
<openSearcher>false</openSearcher>
243243
</autoCommit>
244244

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
### Solr Indexing speed improved
2+
3+
The performance of Solr indexing has been significantly improved, particularly for datasets with many files.
4+
5+
A new dataverse.solr.min-files-to-use-proxy MicroProfile setting can be used to further improve performance and lower memory requirements for datasets with many files (e.g. 500+). It defaults to Integer.MAX_VALUE, which disables use of the new functionality.

doc/sphinx-guides/source/installation/config.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2689,6 +2689,17 @@ when using it to configure your core name!
26892689

26902690
Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_PATH``.
26912691

2692+
dataverse.solr.min-files-to-use-proxy
2693+
+++++++++++++++++++++++++++++++++++++
2694+
2695+
Specifies when to use a smaller datafile proxy object for the purposes of dataset indexing. This can lower memory requirements
2696+
and improve performance when reindexing large datasets (e.g. those with hundreds or thousands of files). (Creating the proxy may slightly slow down the indexing of datasets with only a few files.)
2697+
2698+
This setting represents the minimum number of files in a dataset for which the datafile proxy should be used. By default, this is set to Integer.MAX_VALUE, which disables using the proxy.
2699+
A recommended value would be ~1000 but the optimal value may vary depending on details of your installation.
2700+
2701+
Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_MIN_FILES_TO_USE_PROXY``.
2702+
26922703
dataverse.solr.concurrency.max-async-indexes
26932704
++++++++++++++++++++++++++++++++++++++++++++
26942705

src/main/java/edu/harvard/iq/dataverse/DataFile.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker;
1414
import edu.harvard.iq.dataverse.ingest.IngestReport;
1515
import edu.harvard.iq.dataverse.ingest.IngestRequest;
16+
import edu.harvard.iq.dataverse.search.SolrIndexServiceBean;
1617
import edu.harvard.iq.dataverse.util.BundleUtil;
1718
import edu.harvard.iq.dataverse.util.FileUtil;
1819
import edu.harvard.iq.dataverse.util.ShapefileHandler;
@@ -23,6 +24,7 @@
2324
import java.util.Objects;
2425
import java.text.SimpleDateFormat;
2526
import java.util.Arrays;
27+
import java.util.Date;
2628
import java.util.HashMap;
2729
import java.util.Map;
2830
import java.util.Set;
@@ -50,6 +52,26 @@
5052
@NamedQuery(name="DataFile.findDataFileThatReplacedId",
5153
query="SELECT s.id FROM DataFile s WHERE s.previousDataFileId=:identifier")
5254
})
55+
@NamedNativeQuery(
56+
name = "DataFile.getDataFileInfoForPermissionIndexing",
57+
query = "SELECT fm.label, df.id, dvo.publicationDate " +
58+
"FROM filemetadata fm " +
59+
"JOIN datafile df ON fm.datafile_id = df.id " +
60+
"JOIN dvobject dvo ON df.id = dvo.id " +
61+
"WHERE fm.datasetversion_id = ?",
62+
resultSetMapping = "DataFileInfoMapping"
63+
)
64+
@SqlResultSetMapping(
65+
name = "DataFileInfoMapping",
66+
classes = @ConstructorResult(
67+
targetClass = SolrIndexServiceBean.DataFileProxy.class,
68+
columns = {
69+
@ColumnResult(name = "label", type = String.class),
70+
@ColumnResult(name = "id", type = Long.class),
71+
@ColumnResult(name = "publicationDate", type = Date.class)
72+
}
73+
)
74+
)
5375
@Entity
5476
@Table(indexes = {@Index(columnList="ingeststatus")
5577
, @Index(columnList="checksumvalue")

src/main/java/edu/harvard/iq/dataverse/Dataset.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,20 @@
2020
import java.util.Objects;
2121
import java.util.Set;
2222
import jakarta.persistence.CascadeType;
23+
import jakarta.persistence.ColumnResult;
2324
import jakarta.persistence.Entity;
2425
import jakarta.persistence.Index;
2526
import jakarta.persistence.JoinColumn;
2627
import jakarta.persistence.ManyToOne;
28+
import jakarta.persistence.NamedNativeQuery;
2729
import jakarta.persistence.NamedQueries;
2830
import jakarta.persistence.NamedQuery;
2931
import jakarta.persistence.NamedStoredProcedureQuery;
3032
import jakarta.persistence.OneToMany;
3133
import jakarta.persistence.OneToOne;
3234
import jakarta.persistence.OrderBy;
3335
import jakarta.persistence.ParameterMode;
36+
import jakarta.persistence.SqlResultSetMapping;
3437
import jakarta.persistence.StoredProcedureParameter;
3538
import jakarta.persistence.Table;
3639
import jakarta.persistence.Temporal;
@@ -71,6 +74,23 @@
7174
@NamedQuery(name = "Dataset.countAll",
7275
query = "SELECT COUNT(ds) FROM Dataset ds")
7376
})
77+
@NamedNativeQuery(
78+
name = "Dataset.findAllOrSubsetOrderByFilesOwned",
79+
query = "SELECT DISTINCT CAST(o.id AS BIGINT) as id, COUNT(f.id) as numFiles " +
80+
"FROM dvobject o " +
81+
"LEFT JOIN dvobject f ON f.owner_id = o.id " +
82+
"WHERE o.dtype = 'Dataset' " +
83+
"AND (? = false OR o.indexTime IS NULL) " +
84+
"GROUP BY o.id " +
85+
"ORDER BY numfiles ASC, id",
86+
resultSetMapping = "DatasetIdMapping"
87+
)
88+
@SqlResultSetMapping(
89+
name = "DatasetIdMapping",
90+
columns = {
91+
@ColumnResult(name = "id", type = Long.class)
92+
}
93+
)
7494

7595
/*
7696
Below is the database stored procedure for getting a string dataset id.

src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -279,32 +279,9 @@ public List<Long> findAllOrSubsetOrderByFilesOwned(boolean skipIndexed) {
279279
SEK - 11/09/2021
280280
*/
281281

282-
String skipClause = skipIndexed ? "AND o.indexTime is null " : "";
283-
Query query = em.createNativeQuery(" Select distinct(o.id), count(f.id) as numFiles FROM dvobject o " +
284-
"left join dvobject f on f.owner_id = o.id where o.dtype = 'Dataset' "
285-
+ skipClause
286-
+ " group by o.id "
287-
+ "ORDER BY count(f.id) asc, o.id");
288-
289-
List<Object[]> queryResults;
290-
queryResults = query.getResultList();
291-
292-
List<Long> retVal = new ArrayList();
293-
for (Object[] result : queryResults) {
294-
Long dsId;
295-
if (result[0] != null) {
296-
try {
297-
dsId = Long.parseLong(result[0].toString()) ;
298-
} catch (Exception ex) {
299-
dsId = null;
300-
}
301-
if (dsId == null) {
302-
continue;
303-
}
304-
retVal.add(dsId);
305-
}
306-
}
307-
return retVal;
282+
return em.createNamedQuery("Dataset.findAllOrSubsetOrderByFilesOwned", Long.class)
283+
.setParameter(1, skipIndexed)
284+
.getResultList();
308285
}
309286

310287
/**

src/main/java/edu/harvard/iq/dataverse/FileMetadata.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import jakarta.json.Json;
2626
import jakarta.json.JsonArrayBuilder;
2727
import jakarta.persistence.Column;
28+
import jakarta.persistence.ColumnResult;
2829
import jakarta.persistence.Entity;
2930
import jakarta.persistence.GeneratedValue;
3031
import jakarta.persistence.GenerationType;
@@ -35,8 +36,10 @@
3536
import jakarta.persistence.JoinTable;
3637
import jakarta.persistence.ManyToMany;
3738
import jakarta.persistence.ManyToOne;
39+
import jakarta.persistence.NamedNativeQuery;
3840
import jakarta.persistence.OneToMany;
3941
import jakarta.persistence.OrderBy;
42+
import jakarta.persistence.SqlResultSetMapping;
4043
import jakarta.persistence.Table;
4144
import jakarta.persistence.Transient;
4245
import jakarta.persistence.Version;
@@ -62,6 +65,39 @@
6265
* @author skraffmiller
6366
*/
6467
@Table(indexes = {@Index(columnList="datafile_id"), @Index(columnList="datasetversion_id")} )
68+
@NamedNativeQuery(
69+
name = "FileMetadata.compareFileMetadata",
70+
query = "WITH fm_categories AS (" +
71+
" SELECT fmd.filemetadatas_id, " +
72+
" STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " +
73+
" FROM FileMetadata_DataFileCategory fmd " +
74+
" JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " +
75+
" GROUP BY fmd.filemetadatas_id " +
76+
") " +
77+
"SELECT fm1.id " +
78+
"FROM FileMetadata fm1 " +
79+
"LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " +
80+
" AND fm2.datasetversion_id = ?1 " +
81+
"LEFT JOIN fm_categories fc1 ON fc1.filemetadatas_id = fm1.id " +
82+
"LEFT JOIN fm_categories fc2 ON fc2.filemetadatas_id = fm2.id " +
83+
"WHERE fm1.datasetversion_id = ?2 " +
84+
" AND (fm2.id IS NULL " +
85+
" OR (fm1.datafile_id = fm2.datafile_id " +
86+
" AND (fm2.description IS DISTINCT FROM fm1.description " +
87+
" OR fm2.directoryLabel IS DISTINCT FROM fm1.directoryLabel " +
88+
" OR fm2.label != fm1.label " +
89+
" OR fm2.restricted IS DISTINCT FROM fm1.restricted " +
90+
" OR fm2.prov_freeform IS DISTINCT FROM fm1.prov_freeform " +
91+
" OR fc1.categories IS DISTINCT FROM fc2.categories " +
92+
" ) " +
93+
" ) " +
94+
" )",
95+
resultSetMapping = "IdToLongMapping"
96+
)
97+
@SqlResultSetMapping(
98+
name = "IdToLongMapping",
99+
columns = @ColumnResult(name = "id", type = Long.class)
100+
)
65101
@Entity
66102
public class FileMetadata implements Serializable {
67103
private static final long serialVersionUID = 1L;

src/main/java/edu/harvard/iq/dataverse/FileVersionDifference.java

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -64,26 +64,41 @@ When there are changes (after v4.19)to the file metadata data model this method
6464

6565
if (newFileMetadata.getDataFile() == null && originalFileMetadata != null){
6666
//File Deleted
67-
updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0);
67+
if (details) {
68+
updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 1, 0);
69+
}
6870
return false;
6971
}
70-
71-
if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null ){
72+
73+
if (this.originalFileMetadata == null && this.newFileMetadata.getDataFile() != null){
7274
//File Added
73-
if (!details) return false;
74-
retVal = false;
75-
updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0);
76-
}
77-
78-
//Check to see if File replaced
79-
if (originalFileMetadata != null &&
80-
newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null &&!this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())){
81-
if (!details) return false;
82-
updateDifferenceSummary( "", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1);
75+
if (!details) {
76+
return false;
77+
}
8378
retVal = false;
79+
updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 1, 0, 0, 0);
8480
}
8581

86-
if ( originalFileMetadata != null) {
82+
if (originalFileMetadata != null) {
83+
// Check to see if File replaced
84+
if (newFileMetadata.getDataFile() != null && originalFileMetadata.getDataFile() != null && !this.originalFileMetadata.getDataFile().equals(this.newFileMetadata.getDataFile())) {
85+
if (!details)
86+
return false;
87+
updateDifferenceSummary("", BundleUtil.getStringFromBundle("file.versionDifferences.fileGroupTitle"), 0, 0, 0, 1);
88+
retVal = false;
89+
}
90+
91+
/*
92+
* Get Restriction Differences
93+
*/
94+
if (originalFileMetadata.isRestricted() != newFileMetadata.isRestricted()) {
95+
if (details) {
96+
String value2 = newFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted");
97+
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0);
98+
}
99+
retVal = false;
100+
}
101+
87102
if (!newFileMetadata.getLabel().equals(originalFileMetadata.getLabel())) {
88103
if (details) {
89104
differenceDetailItems.add(new FileDifferenceDetailItem(BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), originalFileMetadata.getLabel(), newFileMetadata.getLabel()));
@@ -94,10 +109,8 @@ When there are changes (after v4.19)to the file metadata data model this method
94109
BundleUtil.getStringFromBundle("file.versionDifferences.fileNameDetailTitle"), 0, 1, 0, 0);
95110
retVal = false;
96111
}
97-
}
98112

99-
//Description differences
100-
if ( originalFileMetadata != null) {
113+
//Description differences
101114
if (newFileMetadata.getDescription() != null
102115
&& originalFileMetadata.getDescription() != null
103116
&& !newFileMetadata.getDescription().equals(originalFileMetadata.getDescription())) {
@@ -134,9 +147,7 @@ When there are changes (after v4.19)to the file metadata data model this method
134147
BundleUtil.getStringFromBundle("file.versionDifferences.descriptionDetailTitle"), 0, 0, 1, 0);
135148
retVal = false;
136149
}
137-
}
138-
//Provenance Description differences
139-
if ( originalFileMetadata != null) {
150+
//Provenance Description differences
140151
if ((newFileMetadata.getProvFreeForm() != null && !newFileMetadata.getProvFreeForm().isEmpty())
141152
&& (originalFileMetadata.getProvFreeForm() != null && !originalFileMetadata.getProvFreeForm().isEmpty())
142153
&& !newFileMetadata.getProvFreeForm().equals(originalFileMetadata.getProvFreeForm())) {
@@ -173,8 +184,6 @@ When there are changes (after v4.19)to the file metadata data model this method
173184
BundleUtil.getStringFromBundle("file.versionDifferences.provenanceDetailTitle"), 0, 0, 1, 0);
174185
retVal = false;
175186
}
176-
}
177-
if (originalFileMetadata != null) {
178187
/*
179188
get Tags differences
180189
*/
@@ -188,7 +197,9 @@ When there are changes (after v4.19)to the file metadata data model this method
188197
}
189198

190199
if (!value1.equals(value2)) {
191-
if (!details) return false;
200+
if (!details) {
201+
return false;
202+
}
192203
int added = 0;
193204
int deleted = 0;
194205

@@ -223,16 +234,7 @@ When there are changes (after v4.19)to the file metadata data model this method
223234
}
224235
retVal = false;
225236
}
226-
227-
/*
228-
Get Restriction Differences
229-
*/
230-
value1 = originalFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted");
231-
value2 = newFileMetadata.isRestricted() ? BundleUtil.getStringFromBundle("file.versionDifferences.fileRestricted") : BundleUtil.getStringFromBundle("file.versionDifferences.fileUnrestricted");
232-
if (!value1.equals(value2)) {
233-
updateDifferenceSummary(BundleUtil.getStringFromBundle("file.versionDifferences.fileAccessTitle"), value2, 0, 0, 0, 0);
234-
retVal = false;
235-
}
237+
236238
}
237239
return retVal;
238240
}

src/main/java/edu/harvard/iq/dataverse/RoleAssigneeServiceBean.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean;
44
import edu.harvard.iq.dataverse.authorization.DataverseRole;
5+
import edu.harvard.iq.dataverse.authorization.Permission;
56
import edu.harvard.iq.dataverse.authorization.RoleAssignee;
67
import edu.harvard.iq.dataverse.authorization.groups.Group;
78
import edu.harvard.iq.dataverse.authorization.groups.GroupServiceBean;
@@ -27,6 +28,7 @@
2728
import jakarta.ejb.Stateless;
2829
import jakarta.inject.Named;
2930
import jakarta.persistence.EntityManager;
31+
import jakarta.persistence.NamedNativeQuery;
3032
import jakarta.persistence.PersistenceContext;
3133
import org.apache.commons.lang3.StringUtils;
3234

@@ -395,6 +397,15 @@ public List<RoleAssignee> filterRoleAssignees(String query, DvObject dvObject, L
395397

396398
return roleAssigneeList;
397399
}
400+
401+
402+
public List<String> findAssigneesWithPermissionOnDvObject(Long objectId, Permission permission) {
403+
int bitpos = 63 - permission.ordinal();
404+
return em.createNamedQuery("RoleAssignment.findAssigneesWithPermissionOnDvObject", String.class)
405+
.setParameter(1, bitpos)
406+
.setParameter(2, objectId)
407+
.getResultList();
408+
}
398409

399410
private void msg(String s) {
400411
//System.out.println(s);

0 commit comments

Comments
 (0)