@@ -0,0 +1,5 @@
### Files attached to a Dataset can now be limited by count

Added the ability to set a limit on the number of files that can be uploaded to a Dataset. Limits can be set globally through a JVM setting or set per Collection or Dataset.

See also [the guides](https://dataverse-guide--11359.org.readthedocs.build/en/11359/api/native-api.html#imposing-a-limit-to-the-number-of-files-allowed-to-be-uploaded-to-a-dataset), #11275, and #11359.
47 changes: 46 additions & 1 deletion doc/sphinx-guides/source/api/native-api.rst
@@ -2474,7 +2474,7 @@ When adding a file to a dataset, you can optionally specify the following:
- Whether or not the file is restricted.
- Whether or not the file skips :doc:`tabular ingest </user/tabulardataingest/index>`. If the ``tabIngest`` parameter is not specified, it defaults to ``true``.

Note that when a Dataverse installation is configured to use S3 storage with direct upload enabled, there is API support to send a file directly to S3. This is more complex and is described in the :doc:`/developers/s3-direct-upload-api` guide.
Note that when a Dataverse installation is configured to use S3 storage with direct upload enabled, there is API support to send a file directly to S3. This is more complex and is described in the :doc:`/developers/s3-direct-upload-api` guide. Also see :ref:`set-dataset-file-limit-api` for limits on the number of files allowed per Dataset.

In the curl example below, all of the above are specified but they are optional.

@@ -2699,6 +2699,51 @@ In some circumstances, it may be useful to move or copy files into Dataverse's s
Two API calls are available for this use case to add files to a dataset or to replace files that were already in the dataset.
These calls were developed as part of Dataverse's direct upload mechanism and are detailed in :doc:`/developers/s3-direct-upload-api`.

Imposing a limit to the number of files allowed to be uploaded to a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Having thousands of files in a Dataset can cause issues; most users would be better off repackaging the data into fewer, larger bundles. To help curtail these issues, a limit can be set to keep the number of uploaded files from getting out of hand.

The limit can be set installation-wide via the JVM setting :ref:`dataverse.files.default-dataset-file-count-limit`, or set on each Collection or Dataset.

For an installation-wide limit, set the JVM option on the app server:

.. code-block:: bash

  ./asadmin $ASADMIN_OPTS create-jvm-options "-Ddataverse.files.default-dataset-file-count-limit=<limit>"

For Collections, the attribute can be set by calling the Create or Update Dataverse API and adding ``datasetFileCountLimit=500`` to the JSON body.
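
For example, a minimal sketch of setting the limit through the Update Dataverse Collection API (the alias ``mycollection``, name, and contact below are placeholder values; the rest of the JSON body follows the usual collection payload):

.. code-block:: bash

  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  export SERVER_URL=https://demo.dataverse.org

  # The datasetFileCountLimit attribute rides along with the normal collection JSON
  curl -H "X-Dataverse-key:$API_TOKEN" -X PUT "$SERVER_URL/api/dataverses/mycollection" \
       -H "Content-Type: application/json" \
       -d '{"alias": "mycollection", "name": "My Collection", "dataverseContacts": [{"contactEmail": "pi@example.edu"}], "datasetFileCountLimit": 500}'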

For Datasets, the attribute can be set using the `Update Dataset Files Limit <#setting-the-files-count-limit-on-a-dataset>`_ API and passing the query parameter ``fileCountLimit=500``.

Setting a value less than 1 clears the limit at that level. The effective limit is resolved hierarchically: the value defined on the Dataset is checked first; if it is not set (value < 1), the parent Collections are checked, and finally the JVM setting.

Once the effective limit has been reached, attempts to upload additional files will return a 400 error response stating that the limit has been reached and including the effective limit.

Please note that superusers are exempt from this rule.

.. _set-dataset-file-limit-api:

Setting the files count limit on a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To update the number of files allowed for a Dataset without causing a draft version of the Dataset to be created, the following API can be used:

.. note:: To clear the limit, simply set it to 0 or -1.

.. code-block:: bash

export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
export SERVER_URL=https://demo.dataverse.org
export ID=24
export LIMIT=500

curl -H "X-Dataverse-key:$API_TOKEN" -X POST "$SERVER_URL/api/datasets/$ID/files/limits?fileCountLimit=$LIMIT"

The fully expanded example above (without environment variables) looks like this:

.. code-block:: bash

curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/datasets/24/files/limits?fileCountLimit=500"


Report the data (file) size of a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

14 changes: 14 additions & 0 deletions doc/sphinx-guides/source/installation/config.rst
@@ -2534,6 +2534,20 @@ Notes:
- During startup, this directory will be checked for existence and write access. It will be created for you
if missing. If it cannot be created or does not have proper write access, application deployment will fail.

.. _dataverse.files.default-dataset-file-count-limit:

dataverse.files.default-dataset-file-count-limit
++++++++++++++++++++++++++++++++++++++++++++++++

Configures a default limit on the number of files (DataFiles) that can be uploaded to a Dataset.

Notes:

- This is a default that can be overridden on any Collection or Dataset.
- A value less than 1 is treated as no limit.
- Changing this value will not delete any existing files; it only prevents new files from being uploaded.
- Superusers are exempt from this rule.
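
For example, a sketch of setting this option with the app server's ``asadmin`` tool (replace ``500`` with the desired default):

.. code-block:: bash

  ./asadmin create-jvm-options "-Ddataverse.files.default-dataset-file-count-limit=500"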

.. _dataverse.files.uploads:

dataverse.files.uploads
82 changes: 82 additions & 0 deletions scripts/search/tests/data/dataset-finch1-fileLimit.json
@@ -0,0 +1,82 @@
{
"datasetVersion": {
"license": {
"name": "CC0 1.0",
"uri": "http://creativecommons.org/publicdomain/zero/1.0"
},
"metadataBlocks": {
"citation": {
"fields": [
{
"value": "Darwin's Finches",
"typeClass": "primitive",
"multiple": false,
"typeName": "title"
},
{
"value": [
{
"authorName": {
"value": "Finch, Fiona",
"typeClass": "primitive",
"multiple": false,
"typeName": "authorName"
},
"authorAffiliation": {
"value": "Birds Inc.",
"typeClass": "primitive",
"multiple": false,
"typeName": "authorAffiliation"
}
}
],
"typeClass": "compound",
"multiple": true,
"typeName": "author"
},
{
"value": [
{ "datasetContactEmail" : {
"typeClass": "primitive",
"multiple": false,
"typeName": "datasetContactEmail",
"value" : "finch@mailinator.com"
},
"datasetContactName" : {
"typeClass": "primitive",
"multiple": false,
"typeName": "datasetContactName",
"value": "Finch, Fiona"
}
}],
"typeClass": "compound",
"multiple": true,
"typeName": "datasetContact"
},
{
"value": [ {
"dsDescriptionValue":{
"value": "Darwin's finches (also known as the Galápagos finches) are a group of about fifteen species of passerine birds.",
"multiple":false,
"typeClass": "primitive",
"typeName": "dsDescriptionValue"
}}],
"typeClass": "compound",
"multiple": true,
"typeName": "dsDescription"
},
{
"value": [
"Medicine, Health and Life Sciences"
],
"typeClass": "controlledVocabulary",
"multiple": true,
"typeName": "subject"
}
],
"displayName": "Citation Metadata"
}
}
},
"datasetFileCountLimit": 100
}
4 changes: 3 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/Dataset.java
@@ -72,7 +72,9 @@
@NamedQuery(name = "Dataset.findByReleaseUserId",
query = "SELECT o FROM Dataset o WHERE o.releaseUser.id=:releaseUserId"),
@NamedQuery(name = "Dataset.countAll",
query = "SELECT COUNT(ds) FROM Dataset ds")
query = "SELECT COUNT(ds) FROM Dataset ds"),
@NamedQuery(name = "Dataset.countFilesByOwnerId",
query = "SELECT COUNT(dvo) FROM DvObject dvo WHERE dvo.owner.id=:ownerId AND dvo.dtype='DataFile'")
})
@NamedNativeQuery(
name = "Dataset.findAllOrSubsetOrderByFilesOwned",
@@ -1077,4 +1077,13 @@ public long getDatasetCount() {
return em.createNamedQuery("Dataset.countAll", Long.class).getSingleResult();
}

/**
 * @param id the Dataset (owner) id
 * @return the total number of DataFiles owned by this Dataset
 */
public long getDataFileCountByOwner(long id) {
return em.createNamedQuery("Dataset.countFilesByOwnerId", Long.class).setParameter("ownerId", id).getSingleResult();
}

}
11 changes: 11 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DvObject.java
@@ -140,6 +140,9 @@ public String visit(DataFile df) {

@Column(insertable = false, updatable = false) private String dtype;

@Column( nullable = true )
private Integer datasetFileCountLimit;
Contributor:
It is super important to have this config setting implemented for both collections AND datasets. In real life, I'm guessing it's going to be a somewhat common case where a specific dataset will need to be given a higher limit, because of some respectable reason. It appears to be working consistently when defined on either level.

However - and this may be penny-pinching, admittedly - I'm wondering if we want this column to be in the DvObject table; seeing how most DvObjects are files. Please at least consider making it a DvObjectContainer-only element. (see dvObjectContainer.storageDriver for an example; it ends up being an extra column in the Dataverse and Dataset tables each).

Contributor (author):
I'm confused about the penny-pinching comment. Columns in the database that are null take up no space and therefore add no pennies to all the DvObject Datafiles.

Member:
It does have to get loaded into the objects though. Further - it's just odd that a file has a datasetFileCountLimit, which wouldn't be the case if it's on DvObjectContainer.

Contributor (author):
Ok. I'll move it

Contributor:
Yeah, phrasing it in terms of objects makes more sense. I just wanted to emphasize that it was possible, even though there is no dedicated table in the db for DvObjectContainer.

Contributor (author):
I moved it


@OneToMany(mappedBy="dvobject",fetch = FetchType.LAZY,cascade={CascadeType.REMOVE, CascadeType.MERGE, CascadeType.PERSIST, CascadeType.REFRESH})
private List<DataverseFeaturedItem> dataverseFeaturedItems;

@@ -507,6 +510,14 @@ public void setStorageQuota(StorageQuota storageQuota) {
this.storageQuota = storageQuota;
}

public Integer getDatasetFileCountLimit() {
return datasetFileCountLimit;
}
public void setDatasetFileCountLimit(Integer datasetFileCountLimit) {
// Normalize non-positive values to -1 (no limit); a null value is kept as-is (unset)
this.datasetFileCountLimit = datasetFileCountLimit != null && datasetFileCountLimit <= 0 ? Integer.valueOf(-1) : datasetFileCountLimit;
}

/**
*
* @param other
12 changes: 12 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DvObjectContainer.java
@@ -261,4 +261,16 @@ public PidProvider getEffectivePidGenerator() {
return pidGenerator;
}

public Integer getEffectiveDatasetFileCountLimit() {
if (isDatasetFileCountLimitNotSet(getDatasetFileCountLimit()) && getOwner() != null) {
return getOwner().getEffectiveDatasetFileCountLimit();
} else if (isDatasetFileCountLimitNotSet(getDatasetFileCountLimit())) {
Optional<Integer> opt = JvmSettings.DEFAULT_DATASET_FILE_COUNT_LIMIT.lookupOptional(Integer.class);
return opt.orElse(null);
}
return getDatasetFileCountLimit();
}
public boolean isDatasetFileCountLimitNotSet(Integer datasetFileCountLimit) {
return datasetFileCountLimit == null || datasetFileCountLimit <= 0;
}
}
56 changes: 49 additions & 7 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -26,8 +26,7 @@
import edu.harvard.iq.dataverse.datasetutility.OptionalFileParams;
import edu.harvard.iq.dataverse.engine.command.Command;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.exception.UnforcedCommandException;
import edu.harvard.iq.dataverse.engine.command.exception.*;
import edu.harvard.iq.dataverse.engine.command.impl.*;
import edu.harvard.iq.dataverse.export.DDIExportServiceBean;
import edu.harvard.iq.dataverse.export.ExportService;
@@ -97,9 +96,7 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static edu.harvard.iq.dataverse.api.ApiConstants.*;
import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException;

import edu.harvard.iq.dataverse.engine.command.exception.PermissionException;
import edu.harvard.iq.dataverse.dataset.DatasetType;
import edu.harvard.iq.dataverse.dataset.DatasetTypeServiceBean;
import edu.harvard.iq.dataverse.license.License;
@@ -1971,6 +1968,40 @@ public Response removeFileRetention(@Context ContainerRequestContext crc, @PathP
}
}

@POST
@AuthRequired
@Path("{id}/files/limits")
public Response updateDatasetFilesLimits(@Context ContainerRequestContext crc,
@PathParam("id") String id, String jsonBody,
@QueryParam("fileCountLimit") Integer datasetFileCountLimit) {

// user is authenticated
AuthenticatedUser authenticatedUser = null;
try {
authenticatedUser = getRequestAuthenticatedUserOrDie(crc);
} catch (WrappedResponse ex) {
return error(Status.UNAUTHORIZED, "Authentication is required.");
}

Dataset dataset;
try {
dataset = findDatasetOrDie(id);
} catch (WrappedResponse ex) {
return ex.getResponse();
}

if (authenticatedUser.isSuperuser() || permissionService.hasPermissionsFor(authenticatedUser, dataset,
EnumSet.of(Permission.EditDataset))) {

dataset.setDatasetFileCountLimit(datasetFileCountLimit);
datasetService.merge(dataset);

return ok("ok");
} else {
return error(Status.FORBIDDEN, "User is not a superuser or user does not have EditDataset permissions");
}
}

@PUT
@AuthRequired
@Path("{linkedDatasetId}/link/{linkingDataverseAlias}")
@@ -2553,7 +2584,6 @@ public Response deleteCurationStatus(@Context ContainerRequestContext crc, @Path
return Response.fromResponse(wr.getResponse()).status(Response.Status.BAD_REQUEST).build();
}
}

@GET
@AuthRequired
@Path("{id}/uploadurls")
@@ -2562,7 +2592,8 @@ public Response getMPUploadUrls(@Context ContainerRequestContext crc, @PathParam
Dataset dataset = findDatasetOrDie(idSupplied);

boolean canUpdateDataset = false;
canUpdateDataset = permissionSvc.requestOn(createDataverseRequest(getRequestUser(crc)), dataset)
User user = getRequestUser(crc);
canUpdateDataset = permissionSvc.requestOn(createDataverseRequest(user), dataset)
.canIssue(UpdateDatasetVersionCommand.class);
if (!canUpdateDataset) {
return error(Response.Status.FORBIDDEN, "You are not permitted to upload files to this dataset.");
@@ -2572,6 +2603,17 @@ public Response getMPUploadUrls(@Context ContainerRequestContext crc, @PathParam
return error(Response.Status.NOT_FOUND,
"Direct upload not supported for files in this dataset: " + dataset.getId());
}
if (!user.isSuperuser()) {
Integer effectiveDatasetFileCountLimit = dataset.getEffectiveDatasetFileCountLimit();
boolean hasFileCountLimit = !dataset.isDatasetFileCountLimitNotSet(effectiveDatasetFileCountLimit);
if (hasFileCountLimit) {
long uploadedFileCount = datasetService.getDataFileCountByOwner(dataset.getId());
if (uploadedFileCount >= effectiveDatasetFileCountLimit) {
return error(Response.Status.BAD_REQUEST,
BundleUtil.getStringFromBundle("file.add.count_exceeds_limit", Arrays.asList(String.valueOf(effectiveDatasetFileCountLimit))));
}
}
}
Long maxSize = systemConfig.getMaxFileUploadSizeForStore(dataset.getEffectiveStorageDriverId());
if (maxSize != null) {
if(fileSize > maxSize) {
@@ -2585,7 +2627,7 @@ public Response getMPUploadUrls(@Context ContainerRequestContext crc, @PathParam
if(fileSize > limit.getRemainingQuotaInBytes()) {
return error(Response.Status.BAD_REQUEST,
"The file you are trying to upload is too large to be uploaded to this dataset. " +
"The remaing file size quota is " + limit.getRemainingQuotaInBytes() + " bytes.");
"The remaining file size quota is " + limit.getRemainingQuotaInBytes() + " bytes.");
}
}
JsonObjectBuilder response = null;
@@ -19,6 +19,7 @@ public class DatasetDTO implements java.io.Serializable {
private String metadataLanguage;
private DatasetVersionDTO datasetVersion;
private List<DataFileDTO> dataFiles;
private Integer datasetFileCountLimit;

public String getId() {
return id;
@@ -114,4 +115,11 @@ public String getMetadataLanguage() {
return metadataLanguage;
}

public Integer getDatasetFileCountLimit() {
return datasetFileCountLimit;
}

public void setDatasetFileCountLimit(Integer datasetFileCountLimit) {
this.datasetFileCountLimit = datasetFileCountLimit;
}
}
@@ -12,6 +12,7 @@ public class DataverseDTO {
private String affiliation;
private List<DataverseContact> dataverseContacts;
private Dataverse.DataverseType dataverseType;
private Integer datasetFileCountLimit;

public String getAlias() {
return alias;
@@ -45,6 +46,14 @@ public void setAffiliation(String affiliation) {
this.affiliation = affiliation;
}

public Integer getDatasetFileCountLimit() {
return datasetFileCountLimit;
}

public void setDatasetFileCountLimit(Integer datasetFileCountLimit) {
this.datasetFileCountLimit = datasetFileCountLimit;
}

public List<DataverseContact> getDataverseContacts() {
return dataverseContacts;
}
@@ -1217,7 +1217,8 @@ private boolean step_030_createNewFilesViaIngest(OptionalFileParams optionalFile
if (systemConfig.isStorageQuotasEnforced()) {
quota = fileService.getUploadSessionQuotaLimit(dataset);
}
Command<CreateDataFileResult> cmd = new CreateNewDataFilesCommand(dvRequest, workingVersion, newFileInputStream, newFileName, newFileContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, suppliedFileSize);
Command<CreateDataFileResult> cmd = new CreateNewDataFilesCommand(dvRequest, workingVersion, newFileInputStream, newFileName, newFileContentType, newStorageIdentifier,
quota, newCheckSum, newCheckSumType, suppliedFileSize, isFileReplaceOperation());
CreateDataFileResult createDataFilesResult = commandEngine.submit(cmd);
initialFileList = createDataFilesResult.getDataFiles();
