Skip to content

Commit 3433877

Browse files
authored
Merge pull request #11555 from vera/feat/dataset-count
feat: datasetCount for dataverses
2 parents 1c5334f + 88cff95 commit 3433877

File tree

16 files changed

+498
-40
lines changed

16 files changed

+498
-40
lines changed

conf/solr/schema.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@
242242

243243
<field name="license" type="string" stored="true" indexed="true" multiValued="false"/>
244244
<field name="fileCount" type="plong" stored="true" indexed="true" multiValued="false"/>
245+
<field name="datasetCount" type="plong" stored="true" indexed="true" multiValued="false"/>
245246

246247
<!--
247248
METADATA SCHEMA FIELDS
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The search index now includes a `datasetCount` field for each collection: the number of published datasets in the collection and its subcollections, including linked and harvested datasets.
2+
Collections can be filtered by `datasetCount` (e.g., `datasetCount:[1000 TO *]`), and the value is returned for collection (dataverse) results in the Search API.

src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -165,21 +165,31 @@ private List<Dataset> findByOwnerId(Long ownerId, boolean onlyPublished) {
165165
}
166166

167167
public List<Long> findIdsByOwnerId(Long ownerId) {
168-
return findIdsByOwnerId(ownerId, false);
168+
return findIdsByOwnerId(ownerId, false, false);
169169
}
170170

171-
private List<Long> findIdsByOwnerId(Long ownerId, boolean onlyPublished) {
171+
public List<Long> findIdsByOwnerId(Long ownerId, boolean onlyPublished, boolean includeHarvested) {
172172
List<Long> retList = new ArrayList<>();
173-
if (!onlyPublished) {
173+
if (!onlyPublished && includeHarvested) {
174174
return em.createNamedQuery("Dataset.findIdByOwnerId")
175175
.setParameter("ownerId", ownerId)
176176
.getResultList();
177177
} else {
178178
List<Dataset> results = em.createNamedQuery("Dataset.findByOwnerId")
179179
.setParameter("ownerId", ownerId).getResultList();
180180
for (Dataset ds : results) {
181-
if (ds.isReleased() && !ds.isDeaccessioned()) {
182-
retList.add(ds.getId());
181+
// For harvested datasets, only add them if includeHarvested is true
182+
if (ds.isHarvested()) {
183+
if (includeHarvested) {
184+
retList.add(ds.getId());
185+
}
186+
// For non-harvested datasets, either
187+
// - add them all (if onlyPublished is false) OR
188+
// - only add them if they are released and not deaccessioned (if onlyPublished is true)
189+
} else {
190+
if (!onlyPublished || (ds.isReleased() && !ds.isDeaccessioned())) {
191+
retList.add(ds.getId());
192+
}
183193
}
184194
}
185195
return retList;

src/main/java/edu/harvard/iq/dataverse/Dataverse.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,13 @@
5656
@NamedQuery(name = "Dataverse.filterByAlias", query="SELECT dv FROM Dataverse dv WHERE LOWER(dv.alias) LIKE :alias order by dv.alias"),
5757
@NamedQuery(name = "Dataverse.filterByAliasNameAffiliation", query="SELECT dv FROM Dataverse dv WHERE (LOWER(dv.alias) LIKE :alias) OR (LOWER(dv.name) LIKE :name) OR (LOWER(dv.affiliation) LIKE :affiliation) order by dv.alias"),
5858
@NamedQuery(name = "Dataverse.filterByName", query="SELECT dv FROM Dataverse dv WHERE LOWER(dv.name) LIKE :name order by dv.alias"),
59-
@NamedQuery(name = "Dataverse.countAll", query = "SELECT COUNT(dv) FROM Dataverse dv")
59+
@NamedQuery(name = "Dataverse.countAll", query = "SELECT COUNT(dv) FROM Dataverse dv"),
60+
@NamedQuery(name = "Dataverse.getDatasetCount",
61+
query = "SELECT " +
62+
"(SELECT COUNT(DISTINCT d) FROM Dataset d JOIN d.versions v WHERE d.owner.id IN :ids AND v.versionState = :datasetState) + " +
63+
"(SELECT COUNT(DISTINCT l.dataset) FROM DatasetLinkingDataverse l JOIN l.dataset.versions v WHERE l.linkingDataverse.id IN :ids AND v.versionState = :datasetState) " +
64+
// The outer FROM/WHERE clause is a hacky way of ensuring the count is returned in a single result row: it selects exactly one Dataverse (the one with the smallest id)
65+
"FROM Dataverse d WHERE d.id = (SELECT MIN(d2.id) FROM Dataverse d2)")
6066
})
6167
@Entity
6268
@Table(indexes = {@Index(columnList="defaultcontributorrole_id")

src/main/java/edu/harvard/iq/dataverse/DataverseServiceBean.java

Lines changed: 60 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import edu.harvard.iq.dataverse.authorization.users.User;
1515
import edu.harvard.iq.dataverse.batch.util.LoggingUtil;
1616
import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
17+
import edu.harvard.iq.dataverse.DatasetVersion.VersionState;
1718
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
1819
import edu.harvard.iq.dataverse.search.IndexServiceBean;
1920
import edu.harvard.iq.dataverse.search.SolrIndexServiceBean;
@@ -26,13 +27,8 @@
2627
import java.io.File;
2728
import java.io.IOException;
2829
import java.sql.Timestamp;
29-
import java.util.ArrayList;
30-
import java.util.Date;
31-
import java.util.HashMap;
32-
import java.util.List;
33-
import java.util.Map;
30+
import java.util.*;
3431
import java.util.logging.Logger;
35-
import java.util.Properties;
3632

3733
import edu.harvard.iq.dataverse.validation.JSONDataValidation;
3834
import jakarta.ejb.EJB;
@@ -468,8 +464,26 @@ public List<Dataverse> findDataversesThatLinkToThisDvId(long dataverseId) {
468464
return dataverseLinkingService.findLinkingDataverses(dataverseId);
469465
}
470466

467+
public List<Dataset> findDatasetsThisIdHasLinkedTo(long dataverseId, boolean onlyPublished) {
468+
List<Dataset> linkedDatasets = datasetLinkingService.findLinkedDatasets(dataverseId);
469+
470+
if (!onlyPublished) {
471+
return linkedDatasets;
472+
}
473+
474+
List<Dataset> retList = new ArrayList();
475+
476+
for (Dataset ds : linkedDatasets) {
477+
if (ds.isReleased() && !ds.isDeaccessioned()) {
478+
retList.add(ds);
479+
}
480+
}
481+
482+
return retList;
483+
}
484+
471485
public List<Dataset> findDatasetsThisIdHasLinkedTo(long dataverseId) {
472-
return datasetLinkingService.findLinkedDatasets(dataverseId);
486+
return this.findDatasetsThisIdHasLinkedTo(dataverseId, false);
473487
}
474488

475489
public List<Dataverse> findDataversesThatLinkToThisDatasetId(long datasetId) {
@@ -754,21 +768,25 @@ public List<Long> findAllDataverseDataverseChildren(Long dvId) {
754768

755769
// function to recursively find ids of all children of a dataverse that are
756770
// of type dataset
757-
public List<Long> findAllDataverseDatasetChildren(Long dvId) {
771+
public List<Long> findAllDataverseDatasetChildren(Long dvId, boolean onlyPublished, boolean includeHarvested) {
758772
// get list of Dataverse children
759773
List<Long> dataverseChildren = findIdsByOwnerId(dvId);
760774
// get list of Dataset children
761-
List<Long> datasetChildren = datasetService.findIdsByOwnerId(dvId);
775+
List<Long> datasetChildren = datasetService.findIdsByOwnerId(dvId, onlyPublished, includeHarvested);
762776

763777
if (dataverseChildren == null) {
764778
return datasetChildren;
765779
} else {
766780
for (Long childDvId : dataverseChildren) {
767-
datasetChildren.addAll(findAllDataverseDatasetChildren(childDvId));
781+
datasetChildren.addAll(findAllDataverseDatasetChildren(childDvId, onlyPublished, includeHarvested));
768782
}
769783
return datasetChildren;
770784
}
771785
}
786+
787+
public List<Long> findAllDataverseDatasetChildren(Long dvId) {
788+
return findAllDataverseDatasetChildren(dvId, false, false);
789+
}
772790

773791
public String addRoleAssignmentsToChildren(Dataverse owner, ArrayList<String> rolesToInherit,
774792
boolean inheritAllRoles) {
@@ -1257,4 +1275,36 @@ public void disableStorageQuota(StorageQuota storageQuota) {
12571275
public long getDataverseCount() {
12581276
return em.createNamedQuery("Dataverse.countAll", Long.class).getSingleResult();
12591277
}
1278+
1279+
/**
1280+
* Returns the total number of published datasets within a Dataverse collection. The number includes harvested and
1281+
* linked datasets. Datasets in subcollections are also counted.
1282+
* @param dvId ID of a Dataverse collection
1283+
* @return the total number of published datasets within that Dataverse collection
1284+
*/
1285+
public long getDatasetCount(Long dvId) {
1286+
Set<Long> dvIds = new HashSet<>();
1287+
Deque<Long> stack = new ArrayDeque<>();
1288+
dvIds.add(dvId);
1289+
stack.push(dvId);
1290+
1291+
// Collect IDs of all subdataverses
1292+
while (!stack.isEmpty()) {
1293+
Long currentId = stack.pop();
1294+
List<Long> children = em.createQuery("SELECT d.id FROM Dataverse d WHERE d.owner.id = :parentId", Long.class)
1295+
.setParameter("parentId", currentId)
1296+
.getResultList();
1297+
1298+
for (Long childId : children) {
1299+
if (dvIds.add(childId)) {
1300+
stack.push(childId);
1301+
}
1302+
}
1303+
}
1304+
1305+
return em.createNamedQuery("Dataverse.getDatasetCount", Long.class)
1306+
.setParameter("ids", dvIds)
1307+
.setParameter("datasetState", VersionState.RELEASED)
1308+
.getSingleResult();
1309+
}
12601310
}

src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DeleteDatasetLinkingDataverseCommand.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import edu.harvard.iq.dataverse.Dataset;
99
import edu.harvard.iq.dataverse.DatasetLinkingDataverse;
10+
import edu.harvard.iq.dataverse.Dataverse;
1011
import edu.harvard.iq.dataverse.authorization.Permission;
1112
import edu.harvard.iq.dataverse.batch.util.LoggingUtil;
1213
import edu.harvard.iq.dataverse.engine.command.AbstractCommand;
@@ -15,6 +16,8 @@
1516
import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
1617
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
1718
import java.io.IOException;
19+
import java.util.ArrayList;
20+
import java.util.List;
1821

1922
import org.apache.solr.client.solrj.SolrServerException;
2023

@@ -42,12 +45,17 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
4245
DatasetLinkingDataverse doomedAndMerged = ctxt.em().merge(doomed);
4346
ctxt.em().remove(doomedAndMerged);
4447

45-
try {
46-
ctxt.index().indexDataverse(doomed.getLinkingDataverse());
47-
} catch (IOException | SolrServerException e) {
48-
String failureLogText = "Post delete linking dataverse indexing failed for Dataverse. ";
49-
failureLogText += "\r\n" + e.getLocalizedMessage();
50-
LoggingUtil.writeOnSuccessFailureLog(this, failureLogText, doomed.getLinkingDataverse());
48+
List<Dataverse> toReindex = new ArrayList<>();
49+
toReindex.add(doomed.getLinkingDataverse());
50+
toReindex.addAll(doomed.getLinkingDataverse().getOwners());
51+
for (Dataverse dv : toReindex) {
52+
try {
53+
ctxt.index().indexDataverse(dv);
54+
} catch (IOException | SolrServerException e) {
55+
String failureLogText = "Post delete linking dataverse indexing failed for Dataverse. ";
56+
failureLogText += "\r\n" + e.getLocalizedMessage();
57+
LoggingUtil.writeOnSuccessFailureLog(this, failureLogText, dv);
58+
}
5159
}
5260

5361
return merged;

src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DestroyDatasetCommand.java

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,13 @@ public class DestroyDatasetCommand extends AbstractVoidCommand {
4949

5050
private List<String> datasetAndFileSolrIdsToDelete;
5151

52-
private Dataverse toReIndex;
52+
private List<Dataverse> toReIndex;
5353

5454
public DestroyDatasetCommand(Dataset doomed, DataverseRequest aRequest) {
5555
super(aRequest, doomed);
5656
this.doomed = doomed;
5757
datasetAndFileSolrIdsToDelete = new ArrayList<>();
58+
toReIndex = new ArrayList<>();
5859
}
5960

6061
@Override
@@ -116,7 +117,12 @@ protected void executeImpl(CommandContext ctxt) throws CommandException {
116117
}
117118
}
118119

119-
toReIndex = managedDoomed.getOwner();
120+
toReIndex.add(managedDoomed.getOwner());
121+
toReIndex.addAll(managedDoomed.getOwner().getOwners());
122+
managedDoomed.getDatasetLinkingDataverses().forEach(dld -> {
123+
toReIndex.add(dld.getLinkingDataverse());
124+
toReIndex.addAll(dld.getLinkingDataverse().getOwners());
125+
});
120126

121127
// add potential Solr IDs of datasets to list for deletion
122128
String solrIdOfPublishedDatasetVersion = IndexServiceBean.solrDocIdentifierDataset + managedDoomed.getId();
@@ -145,13 +151,15 @@ public boolean onSuccess(CommandContext ctxt, Object r) {
145151
logger.log(Level.FINE, "Result of attempt to delete dataset and file IDs from the search index: {0}", resultOfSolrDeletionAttempt.getMessage());
146152

147153
// reindex
148-
try {
149-
ctxt.index().indexDataverse(toReIndex);
150-
} catch (IOException | SolrServerException e) {
151-
String failureLogText = "Post-destroy dataset indexing of the owning dataverse failed. You can kickoff a re-index of this dataverse with: \r\n curl http://localhost:8080/api/admin/index/dataverses/" + toReIndex.getId().toString();
152-
failureLogText += "\r\n" + e.getLocalizedMessage();
153-
LoggingUtil.writeOnSuccessFailureLog(this, failureLogText, toReIndex);
154-
retVal = false;
154+
for (Dataverse dv : toReIndex) {
155+
try {
156+
ctxt.index().indexDataverse(dv);
157+
} catch (IOException | SolrServerException e) {
158+
String failureLogText = "Post-destroy dataset indexing of an owning or linking dataverse failed. You can kickoff a re-index of this dataverse with: \r\n curl http://localhost:8080/api/admin/index/dataverses/" + dv.getId().toString();
159+
failureLogText += "\r\n" + e.getLocalizedMessage();
160+
LoggingUtil.writeOnSuccessFailureLog(this, failureLogText, dv);
161+
retVal = false;
162+
}
155163
}
156164

157165
return retVal;

src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,15 @@
3333
import java.awt.datatransfer.StringSelection;
3434
import java.io.IOException;
3535
import java.sql.Timestamp;
36-
import java.util.Date;
37-
import java.util.List;
36+
import java.util.*;
3837
import java.util.logging.Level;
3938
import java.util.logging.Logger;
4039

4140
import edu.harvard.iq.dataverse.batch.util.LoggingUtil;
4241
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
4342
import edu.harvard.iq.dataverse.engine.command.Command;
4443
import edu.harvard.iq.dataverse.util.FileUtil;
45-
import java.util.ArrayList;
44+
4645
import java.util.concurrent.Future;
4746

4847
import org.apache.logging.log4j.util.Strings;
@@ -67,7 +66,7 @@ public class FinalizeDatasetPublicationCommand extends AbstractPublishDatasetCom
6766
*/
6867
final boolean datasetExternallyReleased;
6968

70-
List<Dataverse> dataversesToIndex = new ArrayList<>();
69+
Set<Dataverse> dataversesToIndex = new HashSet<>();
7170

7271
public static final String FILE_VALIDATION_ERROR = "FILE VALIDATION ERROR";
7372

@@ -209,6 +208,15 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
209208

210209
}
211210

211+
// The owning dataverse and all dataverses linking to this dataset, plus the owners of each, must be re-indexed to update their
212+
// datasetCount
213+
dataversesToIndex.add(getDataset().getOwner());
214+
dataversesToIndex.addAll(getDataset().getOwner().getOwners());
215+
getDataset().getDatasetLinkingDataverses().forEach(dld -> {
216+
dataversesToIndex.add(dld.getLinkingDataverse());
217+
dataversesToIndex.addAll(dld.getLinkingDataverse().getOwners());
218+
});
219+
212220
List<Command> previouslyCalled = ctxt.getCommandsCalled();
213221

214222
PrivateUrl privateUrl = ctxt.engine().submit(new GetPrivateUrlCommand(getRequest(), theDataset));

src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LinkDatasetCommand.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
import edu.harvard.iq.dataverse.util.BundleUtil;
2020
import java.io.IOException;
2121
import java.sql.Timestamp;
22+
import java.util.ArrayList;
2223
import java.util.Date;
24+
import java.util.List;
25+
2326
import org.apache.solr.client.solrj.SolrServerException;
2427

2528
/**
@@ -68,6 +71,20 @@ public boolean onSuccess(CommandContext ctxt, Object r) {
6871

6972
ctxt.index().asyncIndexDataset(dld.getDataset(), true);
7073

74+
List<Dataverse> toReindex = new ArrayList<>();
75+
toReindex.add(dld.getLinkingDataverse());
76+
toReindex.addAll(dld.getLinkingDataverse().getOwners());
77+
for (Dataverse dv : toReindex) {
78+
try {
79+
ctxt.index().indexDataverse(dv);
80+
} catch (IOException | SolrServerException e) {
81+
String failureLogText = "Indexing of linking dataverse failed. You can kickoff a re-index of this dataverse with: \r\n curl http://localhost:8080/api/admin/index/dataverses/" + dv.getId().toString();
82+
failureLogText += "\r\n" + e.getLocalizedMessage();
83+
LoggingUtil.writeOnSuccessFailureLog(null, failureLogText, dv);
84+
return false;
85+
}
86+
}
87+
7188
return retVal;
7289
}
7390
}

src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import javax.xml.parsers.ParserConfigurationException;
3232
import javax.xml.transform.TransformerException;
3333
import org.apache.commons.lang3.mutable.MutableBoolean;
34+
import org.apache.solr.client.solrj.SolrServerException;
3435
import org.xml.sax.SAXException;
3536

3637
import io.gdcc.xoai.model.oaipmh.results.Record;
@@ -191,6 +192,17 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId
191192

192193
hdLogger.log(Level.INFO, String.format("Datasets created/updated: %s, datasets deleted: %s, datasets failed: %s", harvestedDatasetIds.size(), deletedIdentifiers.size(), failedIdentifiers.size()));
193194

195+
// Reindex the harvesting client's dataverse and its owners to update their datasetCount
196+
List<Dataverse> toReindex = new ArrayList<>();
197+
toReindex.add(harvestingClientConfig.getDataverse());
198+
toReindex.addAll(harvestingClientConfig.getDataverse().getOwners());
199+
for (Dataverse dv : toReindex) {
200+
try {
201+
indexService.indexDataverse(dv);
202+
} catch (IOException | SolrServerException e) {
203+
hdLogger.log(Level.SEVERE, "Dataverse indexing failed. You can kickoff a re-index of this dataverse with: \r\n curl http://localhost:8080/api/admin/index/dataverses/" + dv.getId().toString());
204+
}
205+
}
194206
}
195207
} catch (StopHarvestException she) {
196208
hdLogger.log(Level.INFO, "HARVEST INTERRUPTED BY EXTERNAL REQUEST");

0 commit comments

Comments
 (0)