
Commit 988a7bb

Merge branch 'develop' into 11391-displayOnCreate-with-template
2 parents: 4aad960 + 1d7ea40

25 files changed: +644, -531 lines

conf/solr/solrconfig.xml

Lines changed: 1 addition & 1 deletion
@@ -238,7 +238,7 @@
      have some sort of hard autoCommit to limit the log size.
   -->
   <autoCommit>
-    <maxTime>${solr.autoCommit.maxTime:30000}</maxTime>
+    <maxTime>${solr.autoCommit.maxTime:300000}</maxTime>
     <openSearcher>false</openSearcher>
   </autoCommit>

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+### Solr Indexing speed improved
+
+The performance of Solr indexing has been significantly improved, particularly for datasets with many files.
+
+A new dataverse.solr.min-files-to-use-proxy MicroProfile setting can be used to further improve performance and lower memory requirements for datasets with many files (e.g., 500+). It defaults to Integer.MAX_VALUE, which disables the new functionality.
Lines changed: 13 additions & 0 deletions

doc/sphinx-guides/source/api/native-api.rst

Lines changed: 1 addition & 1 deletion
@@ -1167,7 +1167,7 @@ To set or change the storage allocation quota for a collection:

 .. code-block::

-  curl -X PUT -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/storage/quota/$SIZE_IN_BYTES"
+  curl -X POST -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/storage/quota/$SIZE_IN_BYTES"

 This API is superuser-only.

doc/sphinx-guides/source/installation/config.rst

Lines changed: 11 additions & 0 deletions
@@ -2689,6 +2689,17 @@ when using it to configure your core name!

 Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_PATH``.

+dataverse.solr.min-files-to-use-proxy
++++++++++++++++++++++++++++++++++++++
+
+Specifies when to use a smaller datafile proxy object for the purposes of dataset indexing. This can lower memory requirements
+and improve performance when reindexing large datasets (e.g. those with hundreds or thousands of files). (Creating the proxy may slightly slow indexing of datasets with only a few files.)
+
+This setting is the file-count threshold at which the datafile proxy is used. By default, it is set to Integer.MAX_VALUE, which disables use of the proxy.
+A recommended starting value is ~1000, but the optimal value may vary depending on the details of your installation.
+
+Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_MIN_FILES_TO_USE_PROXY``.
+
 dataverse.solr.concurrency.max-async-indexes
 ++++++++++++++++++++++++++++++++++++++++++++
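
As an illustration only (not code from this commit), the sketch below shows how a setting like this can be read through the standard MicroProfile Config API, falling back to the disabling default. The class and method names are invented for the example, and the exact threshold comparison used by Dataverse's indexing code is assumed here.

    import org.eclipse.microprofile.config.ConfigProvider;

    public class SolrProxyThresholdExample {

        // Minimal sketch: read dataverse.solr.min-files-to-use-proxy via the
        // MicroProfile Config API, falling back to Integer.MAX_VALUE, i.e. the
        // proxy stays disabled unless the setting is configured.
        static int minFilesToUseProxy() {
            return ConfigProvider.getConfig()
                    .getOptionalValue("dataverse.solr.min-files-to-use-proxy", Integer.class)
                    .orElse(Integer.MAX_VALUE);
        }

        // Illustrative check: a dataset would take the lighter proxy path only
        // when its file count reaches the configured threshold.
        static boolean shouldUseProxy(int fileCount) {
            return fileCount >= minFilesToUseProxy();
        }
    }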

doc/sphinx-guides/source/style/foundations.rst

Lines changed: 2 additions & 0 deletions
@@ -353,3 +353,5 @@ Create both print and web version of the Dataverse collection logo by downloadin

 .. |image1| image:: ./img/dataverse-icon.jpg
    :class: img-responsive
+
+Here is another vector-based SVG file with three rings: :download:`Dataverse_3ring-brand_icon_EqualSpace.svg <../_static/Dataverse_3ring-brand_icon_EqualSpace.svg>`

doc/sphinx-guides/source/user/tabulardataingest/stata.rst

Lines changed: 33 additions & 0 deletions
@@ -5,3 +5,36 @@ Stata
    :local:

 Of all the third party statistical software providers, Stata does the best job at documenting the internal format of their files, by far. And at making that documentation freely and easily available to developers (yes, we are looking at you, SPSS). Because of that, Stata is the best supported format for tabular data ingest.
+
+Supported Format Versions
+=========================
+
+Of the **"New Stata dta"** formats (variations of the format in use since Stata 13), our ingest supports the following:
+
+=================== ================= =================
+Stata format name   Introduced in     Used by
+=================== ================= =================
+dta_117             Stata 13          Stata 13
+dta_118             Stata 14          Stata 14 - 19
+dta_119             Stata 15          Stata 15 - 19
+=================== ================= =================
+
+This means that, in theory, every dta file produced by Stata v.13 - 17 should be ingestible. (Please see below for more information on Stata 18 and 19.) In practice, we cannot *guarantee* that our code will in fact be able to parse any such file. There is always a possibility that we missed a certain way to compose the data that the ingest will fail to understand. So, if you encounter such an error, where Dataverse **tries but fails** to ingest a Stata file in one of these 3 formats, please open a GitHub issue and we will try to address it. Please note that this is a different scenario from when Dataverse skips even trying to ingest a file (with no ingest errors shown in the UI), as that will in most cases be the result of the file exceeding the size limit set by the Dataverse instance administrators, or of a client uploading the file with the wrong content type attached, so that Dataverse fails to recognize it as Stata.
+
+Please note that there was an issue in older versions of Dataverse where Stata 13-17 files were not ingested when deposited via direct upload to S3. The issue was accompanied by the confusing error message ``The file is not in a STATA format that we can read or support`` shown in the UI. Fortunately, a case like this can be addressed by running the reIngest API on the affected file.
+
+The following two formats were introduced in 2024 and are **not yet supported**:
+
+=================== ================ =================
+Stata format name   Introduced in    Used by
+=================== ================ =================
+dta_120             Stata 18         Stata 18 - 19
+dta_121             Stata 18         Stata 18 - 19
+=================== ================ =================
+
+Please note, however, that this does not mean that no files produced by Stata 18 or 19 are ingestible! In reality, in most cases these versions of Stata still save files in the ``dta_118`` (i.e., Stata 14) format, with the later formats only used when necessary: for example, when the number of variables in the datafile exceeds what ``dta_118`` can handle, or when the file contains the "alias variables" introduced in Stata 18. Case in point: in the year since the introduction of these two newest formats, it appears that not a single file in either of them has been uploaded to the production Dataverse instance at IQSS. We are planning to eventually add support for these formats, but it is not yet considered a priority. However, please feel free to open a GitHub issue if this is an important use case for you.
+
+**"Old Stata"**, a distinctly different format used by Stata versions prior to 13, is supported.
+However, this functionality is considered legacy code that we no longer actively maintain. If any problems or bugs are found in it, we cannot promise that the core development team will be able to prioritize looking into them. We will of course gladly accept a properly submitted pull request from the user community.
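
As a purely illustrative aside (derived only from the tables above, not from Dataverse's actual detection code), the mapping between the "New Stata dta" release numbers and their ingest support status can be summarized as follows:

    import java.util.Map;

    public class StataDtaSupportExample {

        // Release numbers and support status taken from the tables above.
        private static final Map<Integer, String> DTA_FORMATS = Map.of(
                117, "dta_117 (Stata 13) - supported",
                118, "dta_118 (Stata 14) - supported",
                119, "dta_119 (Stata 15) - supported",
                120, "dta_120 (Stata 18) - not yet supported",
                121, "dta_121 (Stata 18) - not yet supported");

        public static String describe(int release) {
            return DTA_FORMATS.getOrDefault(release, "unknown, or an Old Stata format");
        }

        public static void main(String[] args) {
            System.out.println(describe(118)); // dta_118 (Stata 14) - supported
            System.out.println(describe(120)); // dta_120 (Stata 18) - not yet supported
        }
    }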

doc/sphinx-guides/source/user/tabulardataingest/supportedformats.rst

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ Tabular Data ingest supports the following file formats:
 File format                      Versions supported
 ================================ ==================================
 SPSS (POR and SAV formats)       7 to 22
-STATA                            4 to 15
+STATA                            4 to 17 (see the Stata subsection)
 R                                up to 3
 Excel                            XLSX only (XLS is NOT supported)
 CSV (comma-separated values)     (limited support)

src/main/java/edu/harvard/iq/dataverse/DataFile.java

Lines changed: 22 additions & 0 deletions
@@ -13,6 +13,7 @@
 import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker;
 import edu.harvard.iq.dataverse.ingest.IngestReport;
 import edu.harvard.iq.dataverse.ingest.IngestRequest;
+import edu.harvard.iq.dataverse.search.SolrIndexServiceBean;
 import edu.harvard.iq.dataverse.util.BundleUtil;
 import edu.harvard.iq.dataverse.util.FileUtil;
 import edu.harvard.iq.dataverse.util.ShapefileHandler;
@@ -23,6 +24,7 @@
 import java.util.Objects;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
@@ -50,6 +52,26 @@
     @NamedQuery(name="DataFile.findDataFileThatReplacedId",
                 query="SELECT s.id FROM DataFile s WHERE s.previousDataFileId=:identifier")
 })
+@NamedNativeQuery(
+        name = "DataFile.getDataFileInfoForPermissionIndexing",
+        query = "SELECT fm.label, df.id, dvo.publicationDate " +
+                "FROM filemetadata fm " +
+                "JOIN datafile df ON fm.datafile_id = df.id " +
+                "JOIN dvobject dvo ON df.id = dvo.id " +
+                "WHERE fm.datasetversion_id = ?",
+        resultSetMapping = "DataFileInfoMapping"
+)
+@SqlResultSetMapping(
+        name = "DataFileInfoMapping",
+        classes = @ConstructorResult(
+                targetClass = SolrIndexServiceBean.DataFileProxy.class,
+                columns = {
+                        @ColumnResult(name = "label", type = String.class),
+                        @ColumnResult(name = "id", type = Long.class),
+                        @ColumnResult(name = "publicationDate", type = Date.class)
+                }
+        )
+)
 @Entity
 @Table(indexes = {@Index(columnList="ingeststatus")
     , @Index(columnList="checksumvalue")
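
For context, here is a sketch of how this named native query might be invoked. The caller is not part of the diff above; the injected EntityManager, the helper class, and the assumption that the single positional parameter is a dataset version id are illustrative.

    import edu.harvard.iq.dataverse.search.SolrIndexServiceBean;
    import jakarta.persistence.EntityManager;
    import java.util.List;

    class DataFileProxyQueryExample {

        // Illustrative only: run the new named native query for one dataset version.
        // The @SqlResultSetMapping turns each row into a lightweight
        // SolrIndexServiceBean.DataFileProxy (label, id, publicationDate) instead of
        // loading full DataFile entities.
        @SuppressWarnings("unchecked")
        static List<SolrIndexServiceBean.DataFileProxy> fileProxiesFor(EntityManager em, Long datasetVersionId) {
            return em.createNamedQuery("DataFile.getDataFileInfoForPermissionIndexing")
                     .setParameter(1, datasetVersionId)
                     .getResultList();
        }
    }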

src/main/java/edu/harvard/iq/dataverse/Dataset.java

Lines changed: 20 additions & 0 deletions
@@ -20,17 +20,20 @@
 import java.util.Objects;
 import java.util.Set;
 import jakarta.persistence.CascadeType;
+import jakarta.persistence.ColumnResult;
 import jakarta.persistence.Entity;
 import jakarta.persistence.Index;
 import jakarta.persistence.JoinColumn;
 import jakarta.persistence.ManyToOne;
+import jakarta.persistence.NamedNativeQuery;
 import jakarta.persistence.NamedQueries;
 import jakarta.persistence.NamedQuery;
 import jakarta.persistence.NamedStoredProcedureQuery;
 import jakarta.persistence.OneToMany;
 import jakarta.persistence.OneToOne;
 import jakarta.persistence.OrderBy;
 import jakarta.persistence.ParameterMode;
+import jakarta.persistence.SqlResultSetMapping;
 import jakarta.persistence.StoredProcedureParameter;
 import jakarta.persistence.Table;
 import jakarta.persistence.Temporal;
@@ -71,6 +74,23 @@
     @NamedQuery(name = "Dataset.countAll",
                 query = "SELECT COUNT(ds) FROM Dataset ds")
 })
+@NamedNativeQuery(
+        name = "Dataset.findAllOrSubsetOrderByFilesOwned",
+        query = "SELECT DISTINCT CAST(o.id AS BIGINT) as id, COUNT(f.id) as numFiles " +
+                "FROM dvobject o " +
+                "LEFT JOIN dvobject f ON f.owner_id = o.id " +
+                "WHERE o.dtype = 'Dataset' " +
+                "AND (? = false OR o.indexTime IS NULL) " +
+                "GROUP BY o.id " +
+                "ORDER BY numfiles ASC, id",
+        resultSetMapping = "DatasetIdMapping"
+)
+@SqlResultSetMapping(
+        name = "DatasetIdMapping",
+        columns = {
+                @ColumnResult(name = "id", type = Long.class)
+        }
+)

 /*
     Below is the database stored procedure for getting a string dataset id.
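
Again as an illustration rather than code from this diff, the query above could be called along the following lines. The EntityManager handle and the method name are invented; the reading of the boolean parameter follows from the SQL shown, where passing true keeps only datasets whose indexTime is NULL.

    import jakarta.persistence.EntityManager;
    import java.util.List;

    class DatasetIndexingOrderExample {

        // Illustrative only: fetch dataset ids ordered by how many files they own
        // (fewest first). Passing true restricts the result to datasets that have
        // not been indexed yet (indexTime IS NULL), per the SQL above.
        @SuppressWarnings("unchecked")
        static List<Long> datasetIdsToIndex(EntityManager em, boolean onlyUnindexed) {
            return em.createNamedQuery("Dataset.findAllOrSubsetOrderByFilesOwned")
                     .setParameter(1, onlyUnindexed)
                     .getResultList();
        }
    }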
