
Commit fdeb4c3

Distribute changes to the src/ files as comments
1 parent 1b5b1eb commit fdeb4c3

File tree

15 files changed: +95 −105 lines changed


man/openmp-utils.Rd

Lines changed: 5 additions & 104 deletions
@@ -38,121 +38,22 @@
 
 \itemize{
 \item\file{between.c} - \code{\link{between}()}
-
-OpenMP is used here to parallelize:
-\itemize{
-\item The loops that check if each element of the vector provided is between the specified \code{lower} and \code{upper} bounds, for integer (\code{INTSXP}) and real (\code{REALSXP}) types
-\item The checking and handling of undefined values (such as NaNs)
-}
-
-Since this function is used to find rows where a column's value falls within a specific range, it benefits more from parallelization when the input data consists of a large number of rows.
-
 \item\file{cj.c} - \code{\link{CJ}()}
-
-OpenMP is used here to parallelize:
-
-\itemize{
-\item The element assignment in vectors
-\item The memory copying operations (blockwise replication of data using \code{memcpy})
-\item The creation of all combinations of the input vectors over the cross-product space
-}
-
-Given that the number of combinations increases exponentially as more columns are added, better speedup can be expected when dealing with a large number of columns.
-
 \item\file{coalesce.c} - \code{\link{fcoalesce}()}
-
-OpenMP is used here to parallelize:
-\itemize{
-\item The operation that iterates over the rows to coalesce the data (which can be of type integer, real, or complex)
-\item The replacement of NAs with non-NA values from subsequent vectors
-\item The conditional checks within parallelized loops
-}
-
-Significant speedup can be expected for a larger number of columns, given that this function operates efficiently across multiple columns to find non-NA values.
-
 \item\file{fifelse.c} - \code{\link{fifelse}()}
-
-For logical, integer, and real types, OpenMP is used here to parallelize loops that perform conditional checks along with assignment operations over the elements of the supplied logical vector, based on the condition (\code{test}) and the values provided for the remaining arguments (\code{yes}, \code{no}, and \code{na}).
-
-Better speedup can be expected for a larger number of columns here as well, given that this function operates column-wise with independent vector operations.
-
-\item\file{fread.c} - \code{\link{fread}()}
-
-OpenMP is used here to:
-
-\itemize{
-\item Parallelize the reading of data in chunks
-\item Avoid race conditions or concurrent writes to the output \code{data.table} by having atomic operations on the string data
-\item Manage synchronized updates to the progress bar and serialize the output to the console
-}
-
-This function is highly optimized for reading and processing data with large numbers of both rows and columns, but the efficiency is more pronounced across rows.
-
+\item\file{fread.c} - \code{\link{fread}()}. Parallelized across row-based chunks of the file.
 \item\file{forder.c}, \file{fsort.c}, and \file{reorder.c} - \code{\link{forder}()} and related
-
-OpenMP is used here to parallelize multiple operations that come together to sort a \code{data.table} using the radix algorithm. These include:
-
-\itemize{
-\item The counting of unique values and recursively sorting subsets of data across different threads (specific to \file{forder.c})
-\item The process of finding the range and distribution of data for efficient grouping and sorting (applies to both \file{forder.c} and \file{fsort.c})
-\item Creation of histograms which are used to sort data based on significant bits (each thread processes a separate batch of the data, computes the MSB of each element, and then increments the corresponding bins), with the distribution and merging of buckets (specific to \file{fsort.c})
-\item The process of reordering a vector or each column in a list of vectors (such as in a \code{data.table}) based on a given vector that dictates the new ordering of elements (specific to \file{reorder.c})
-}
-
-Better speedups can be expected when the input data contains a large number of rows, as the sorting complexity increases with more rows.
-
 \item\file{froll.c}, \file{frolladaptive.c}, and \file{frollR.c} - \code{\link{froll}()} and family
-
-OpenMP is used here to parallelize the loops that compute the rolling means (\code{frollmean}) and sums (\code{frollsum}) over a sliding window for each position in the input vector.
-
-These functions benefit more in terms of speedup when the data has a large number of columns, primarily due to the cache-friendly memory access patterns used when processing the data for each column sequentially in memory to compute the rolling statistic.
-
-\item\file{fwrite.c} - \code{\link{fwrite}()}
-
-OpenMP is used here primarily to parallelize the process of writing rows to the output file, but error handling and compression (if enabled) are also managed within the parallel region. Special attention is paid to thread safety and synchronization, especially in the ordered sections where output to the file and handling of errors is serialized to maintain the correct sequence of rows.
-
-Similar to \code{\link{fread}()}, this function is highly efficient at processing data in parallel with large numbers of both rows and columns, but it shows more notable speedups with an increased number of rows.
-
-\item\file{gsumm.c} - GForce in various places, see \link{GForce}
-
-Functions with GForce optimization are internally parallelized to speed up grouped summaries over a large \code{data.table}. OpenMP is used here to parallelize operations involved in calculating group-wise statistics like sum, mean, and median (implying faster computation of \code{sd}, \code{var}, and \code{prod} as well).
-
-These optimized grouping operations benefit more in terms of speedup if the input data contains a large number of groups, since they leverage parallelization more efficiently by eliminating the overhead of individual group evaluations.
-
+\item\file{fwrite.c} - \code{\link{fwrite}()}. Parallelized across rows.
+\item\file{gsumm.c} - GForce in various places, see \link{GForce}. Parallelized across groups.
 \item\file{nafill.c} - \code{\link{nafill}()}
-
-OpenMP is used here to parallelize the loop that fills missing values over columns of the input data. This includes handling different data types (double, integer, and integer64) and applying the designated filling method (constant, last observation carried forward, or next observation carried backward) to each column in parallel.
-
-Given its optimization for column-wise operations, better speedups can be expected when the input data consists of a large number of columns.
-
 \item\file{subset.c} - Used in \code{\link[=data.table]{[.data.table}} subsetting
-
-OpenMP is used here to parallelize the loops that perform the subsetting of vectors, with conditional checks and filtering of data.
-
-Since subset operations tend to be row-dependent, better speedups can be expected when dealing with a large number of rows. However, it also depends on whether the computations are focused on rows or columns (as dictated by the subsetting criteria).
-
 \item\file{types.c} - Internal testing usage
-
-This caters to internal tests (not impacting any user-facing operations or functions). OpenMP is used here to test a message-printing function inside a nested loop that has been collapsed into a single loop over the combined iteration space using \code{collapse(2)}, with dynamic scheduling specified to balance the workload among the threads.
 }
 
-In general, and as applicable to all the aforementioned use cases, better speedup can be expected when dealing with large datasets.
-
-Having such data when using \code{\link{fread}()} or \code{\link{fwrite}()} (the functions with significant speedups for larger file sizes) also means that while one part of the data is being read from or written to disk (I/O operations), another part can be simultaneously processed using multiple cores (parallel computations). This overlap reduces the total time taken for the read or write operation, as the system can perform computations during otherwise idle I/O time.
-
-Apart from increasing the size of the input data, certain function-specific parameters can also benefit more from parallelization or lead to an increase in speedup. For instance:
-
-\itemize{
-\item Having a large number of groups when using \code{\link{forder}()}, or a multitude of combinations when using \code{\link{CJ}()}
-\item Having several missing values in your data when using \code{\link{fcoalesce}()} or \code{\link{nafill}()}
-\item Using larger window sizes and/or time series data when using \code{\link{froll}()}
-\item Having more and/or complex conditional logic when using \code{\link{fifelse}()} or \code{\link{subset}()}
-}
-
-Note: The information above is based on implementation-specific details as of March 2024.
-
+We endeavor to keep this list up to date, but note that the canonical reference here is the source code itself.
 }
 
 \examples{
 getDTthreads(verbose=TRUE)
-}
+}
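The \file{nafill.c} description above notes that filling is applied to each column in parallel, because last-observation-carried-forward is inherently sequential within a column. A minimal sketch of LOCF over one column (illustrative only; `nafill_locf` is a hypothetical name, and NA is modeled here as NaN rather than R's `NA_REAL`):

```c
#include <math.h>

/* Illustrative LOCF fill for one double column: each NaN takes the
   most recent non-NaN value before it. This loop carries a dependency
   from i-1 to i, so parallelism (as in nafill.c) is across columns,
   one column per thread, not within a column. */
void nafill_locf(double *x, int n)
{
  for (int i = 1; i < n; i++)
    if (isnan(x[i])) x[i] = x[i - 1];   /* carry last observation forward */
}
```

Leading NaNs (before any observation) are left as-is, matching the LOCF semantics described above.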

src/between.c

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 #include "data.table.h"
 
+/*
+  OpenMP is used here to parallelize:
+  - The loops that check if each element of the vector provided is between
+    the specified lower and upper bounds, for INTSXP and REALSXP types
+  - The checking and handling of undefined values (such as NaNs)
+*/
 SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAboundsArg, SEXP checkArg) {
   int nprotect = 0;
   R_len_t nx = length(x), nl = length(lower), nu = length(upper);
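The bounds-checking loop the comment describes can be sketched as a simple per-element OpenMP loop. This is an illustrative sketch, not data.table's actual code (`between_d` is a hypothetical name); without `-fopenmp` the pragma is ignored and the loop runs serially:

```c
#include <assert.h>

/* Mark which elements of x fall within [lower, upper].
   Each iteration is independent, so the loop parallelizes trivially.
   NaN comparisons are false, so undefined values yield 0. */
void between_d(const double *x, int n, double lower, double upper, int *out)
{
  #pragma omp parallel for
  for (int i = 0; i < n; i++)
    out[i] = (x[i] >= lower && x[i] <= upper);
}
```

Because each output element depends only on one input element, there is no shared mutable state and no synchronization is needed.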

src/cj.c

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 #include "data.table.h"
 
+/*
+  OpenMP is used here to parallelize:
+  - The element assignment in vectors
+  - The memory copying operations (blockwise replication of data using memcpy)
+  - The creation of all combinations of the input vectors over the cross-product space
+*/
 SEXP cj(SEXP base_list) {
   int ncol = LENGTH(base_list);
   SEXP out = PROTECT(allocVector(VECSXP, ncol));
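The blockwise replication the comment mentions can be sketched for the two-vector case. This is a hypothetical helper (`cj2`), not data.table's implementation: for inputs `a` (length `na`) and `b` (length `nb`), the output has `na*nb` rows, column A repeating each `a[i]` `nb` times and column B tiling all of `b` via `memcpy`:

```c
#include <string.h>

/* Cross-product expansion of two int vectors into outA/outB,
   each of length na*nb. Iterations over i touch disjoint output
   blocks, so the loop is safe to parallelize with OpenMP. */
void cj2(const int *a, int na, const int *b, int nb, int *outA, int *outB)
{
  #pragma omp parallel for
  for (int i = 0; i < na; i++) {
    for (int j = 0; j < nb; j++)
      outA[i*nb + j] = a[i];                    /* element assignment */
    memcpy(outB + i*nb, b, nb * sizeof(int));   /* blockwise replication */
  }
}
```

Replicating whole blocks with `memcpy` instead of an inner assignment loop is what makes the tiled column cheap.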

src/coalesce.c

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 #include "data.table.h"
 
+/*
+  OpenMP is used here to parallelize:
+  - The operation that iterates over the rows to coalesce the data
+  - The replacement of NAs with non-NA values from subsequent vectors
+  - The conditional checks within parallelized loops
+*/
 SEXP coalesce(SEXP x, SEXP inplaceArg) {
   if (TYPEOF(x)!=VECSXP) internal_error(__func__, "input is list(...) at R level"); // # nocov
   if (!IS_TRUE_OR_FALSE(inplaceArg)) internal_error(__func__, "argument 'inplaceArg' must be TRUE or FALSE"); // # nocov
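The row-wise NA replacement can be sketched for a single fallback vector. A minimal illustration (hypothetical `coalesce2`; NA modeled as NaN, whereas data.table uses R's `NA_REAL` and handles many vectors and types):

```c
#include <math.h>

/* Replace NaN entries of x with the corresponding entries of y,
   in place. Each row is independent, so the loop parallelizes
   with a straightforward "#pragma omp parallel for". */
void coalesce2(double *x, const double *y, int n)
{
  #pragma omp parallel for
  for (int i = 0; i < n; i++)
    if (isnan(x[i])) x[i] = y[i];   /* take first non-NA candidate */
}
```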

src/fifelse.c

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 #include "data.table.h"
 
+/*
+  OpenMP is used here to parallelize loops that perform conditional
+  checks along with assignment operations over the elements of the
+  supplied logical vector, based on the condition (test) and the values
+  provided for the remaining arguments (yes, no, and na).
+*/
 SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) {
   if (!isLogical(l)) {
     error(_("Argument 'test' must be logical."));
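The conditional-check-plus-assignment loop reads, in sketch form, like a vectorized ternary. Illustrative only (`fifelse2` is a hypothetical name; R's logical NA is `INT_MIN`, modeled here as "any value other than 0 or 1"):

```c
/* For each element of test: 1 selects yes, 0 selects no, anything
   else (standing in for NA) selects na. Per-element work is
   independent, hence the OpenMP pragma. */
void fifelse2(const int *test, int n, double yes, double no, double na, double *out)
{
  #pragma omp parallel for
  for (int i = 0; i < n; i++)
    out[i] = (test[i] == 1) ? yes : (test[i] == 0) ? no : na;
}
```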

src/forder.c

Lines changed: 13 additions & 0 deletions
@@ -433,6 +433,19 @@ uint64_t dtwiddle(double x) //const void *p, int i)
 
 void radix_r(const int from, const int to, const int radix);
 
+/*
+  OpenMP is used here to parallelize multiple operations that come together to
+  sort a data.table using the Radix algorithm. These include:
+
+  - The counting of unique values and recursively sorting subsets of data
+    across different threads
+  - The process of finding the range and distribution of data for efficient
+    grouping and sorting
+  - Creation of histograms which are used to sort data based on significant
+    bits (each thread processes a separate batch of the data, computes the
+    MSB of each element, and then increments the corresponding bins), with
+    the distribution and merging of buckets
+*/
 SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg)
 // sortGroups TRUE from setkey and regular forder, FALSE from by= for efficiency so strings don't have to be sorted and can be left in appearance order
 // when sortGroups is TRUE, ascArg contains +1/-1 for ascending/descending of each by column; when FALSE ascArg is ignored
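The histogram step the comment describes (per-batch MSB counts, then a merge) can be sketched as follows. This is a much-simplified illustration, not forder's actual code; `msb_hist` is a hypothetical name, and the batch loop is shown serially to mirror the one-batch-per-thread structure:

```c
#include <stdint.h>
#include <string.h>

/* Count, per batch, how many elements fall into each of 256 buckets
   keyed by the most significant byte, then merge the per-batch counts.
   With OpenMP the batch loop would be "#pragma omp parallel for":
   each batch writes only its own counts row, so no synchronization
   is needed until the merge. */
void msb_hist(const uint64_t *x, int n, int nbatch, int counts[][256], int total[256])
{
  memset(total, 0, 256 * sizeof(int));
  int per = (n + nbatch - 1) / nbatch;          /* ceil(n / nbatch) */
  for (int b = 0; b < nbatch; b++) {
    memset(counts[b], 0, 256 * sizeof(int));
    int lo = b * per, hi = (lo + per < n) ? lo + per : n;
    for (int i = lo; i < hi; i++)
      counts[b][x[i] >> 56]++;                  /* bucket by MSB */
  }
  for (int b = 0; b < nbatch; b++)              /* merge histograms */
    for (int k = 0; k < 256; k++)
      total[k] += counts[b][k];
}
```

The merged totals give the bucket boundaries used to distribute elements for the recursive sort on the next significant byte.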

src/fread.c

Lines changed: 8 additions & 0 deletions
@@ -1268,6 +1268,14 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
 //
 // Returns 1 if it finishes successfully, and 0 otherwise.
 //
+// OpenMP is used here to:
+// - Parallelize the reading of data in chunks
+// - Avoid race conditions or concurrent writes to the output data.table by having atomic
+//   operations on the string data
+// - Manage synchronized updates to the progress bar and serialize the output to the console
+// This function is highly optimized for reading and processing data with both large numbers of
+// rows and columns, but the efficiency is more pronounced across rows.
+//
 //=================================================================================================
 int freadMain(freadMainArgs _args) {
   args = _args; // assign to global for use by DTPRINT() in other functions
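"Reading the data in chunks" means splitting the byte buffer so each thread parses whole rows. A sketch of that splitting under stated assumptions (hypothetical `chunk_rows`; fread's real logic also jumps mid-field, re-syncs, and guarantees full coverage):

```c
#include <stddef.h>

/* Split buf[0..len) into up to nchunk row-aligned chunks: each
   tentative boundary is pushed forward to just past the next '\n'
   so every chunk contains only complete rows. Returns the number
   of chunks written to starts/ends. */
int chunk_rows(const char *buf, size_t len, int nchunk, size_t *starts, size_t *ends)
{
  size_t approx = len / nchunk + 1;   /* rough target chunk size */
  size_t pos = 0;
  int k = 0;
  while (pos < len && k < nchunk) {
    starts[k] = pos;
    size_t end = pos + approx;
    if (end >= len) {
      end = len;
    } else {
      while (end < len && buf[end] != '\n') end++;  /* align to row end */
      if (end < len) end++;                          /* include the '\n' */
    }
    ends[k] = end;
    pos = end;
    k++;
  }
  return k;
}
```

Once chunk boundaries are row-aligned, each thread can parse its chunk into thread-local buffers with no locking; only the commit into the shared output needs coordination.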

src/froll.c

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,14 @@
 #include "data.table.h"
 
+/*
+  OpenMP is used here to parallelize the loops in frollmean and frollsum.
+
+  These functions benefit more in terms of speedup when the data has a large
+  number of columns, primarily due to the efficient memory access patterns
+  (cache-friendly) used when processing the data for each column
+  sequentially in memory to compute the rolling statistic.
+*/
+
 /* fast rolling mean - router
  * early stopping for window bigger than input
  * also handles 'align' in single place
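A rolling mean where each position recomputes its own window is trivially parallel, which is the kind of loop the comment refers to. Illustrative sketch (hypothetical `rollmean_exact`; froll also has a faster on-line variant and NA handling not shown here):

```c
#include <math.h>

/* Rolling mean over a window of w elements, one independent sum per
   output position; positions before the window fills get NaN.
   Independence across i is what makes the OpenMP pragma safe. */
void rollmean_exact(const double *x, int n, int w, double *out)
{
  #pragma omp parallel for
  for (int i = 0; i < n; i++) {
    if (i < w - 1) { out[i] = NAN; continue; }
    double s = 0.0;
    for (int j = i - w + 1; j <= i; j++) s += x[j];
    out[i] = s / w;
  }
}
```

The cheaper running-sum formulation carries a dependency between positions, which is why the parallel-friendly variant recomputes each window.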

src/fsort.c

Lines changed: 4 additions & 0 deletions
@@ -98,6 +98,10 @@ int qsort_cmp(const void *a, const void *b) {
   return (x<y)-(x>y); // largest first in a safe branchless way casting long to int
 }
 
+/*
+  OpenMP is used here to find the range and distribution of data for efficient
+  grouping and sorting.
+*/
 SEXP fsort(SEXP x, SEXP verboseArg) {
   double t[10];
   t[0] = wallclock();
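Finding the range of the data is a min/max reduction, which OpenMP expresses directly. A minimal sketch (hypothetical `range_d`; `reduction(min:)`/`reduction(max:)` require OpenMP 3.1+, and the pragma is simply ignored in a serial build):

```c
/* Compute the min and max of x[0..n): each thread reduces over its
   share of the data into private copies of mn/mx, which OpenMP then
   combines. The range determines how values are binned for sorting. */
void range_d(const double *x, int n, double *minp, double *maxp)
{
  double mn = x[0], mx = x[0];
  #pragma omp parallel for reduction(min:mn) reduction(max:mx)
  for (int i = 1; i < n; i++) {
    if (x[i] < mn) mn = x[i];
    if (x[i] > mx) mx = x[i];
  }
  *minp = mn;
  *maxp = mx;
}
```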

src/fwrite.c

Lines changed: 8 additions & 0 deletions
@@ -587,6 +587,14 @@ int compressbuff(z_stream *stream, void* dest, size_t *destLen, const void* sour
 }
 #endif
 
+/*
+  OpenMP is used here primarily to parallelize the process of writing rows
+  to the output file, but error handling and compression (if enabled) are
+  also managed within the parallel region. Special attention is paid to
+  thread safety and synchronization, especially in the ordered sections
+  where output to the file and handling of errors is serialized to maintain
+  the correct sequence of rows.
+*/
 void fwriteMain(fwriteMainArgs args)
 {
   double startTime = wallclock();
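The "ordered sections" pattern the comment describes — format in parallel, emit in order — maps onto OpenMP's `ordered` construct. A toy sketch writing to a string instead of a file (hypothetical `write_rows`, not fwrite's code; serially, the pragmas are ignored and output order is trivially correct):

```c
#include <stdio.h>
#include <string.h>

/* Format each "row" into a private buffer (the parallel work), then
   append it to out inside an ordered section so rows keep their
   original sequence even when iterations finish out of order. */
void write_rows(const int *rows, int n, char *out, size_t outcap)
{
  out[0] = '\0';
  #pragma omp parallel for ordered
  for (int i = 0; i < n; i++) {
    char buf[32];
    snprintf(buf, sizeof buf, "%d\n", rows[i]);       /* private formatting */
    #pragma omp ordered
    {
      strncat(out, buf, outcap - strlen(out) - 1);    /* serialized, in order */
    }
  }
}
```

Keeping only the append inside the ordered section is the point of the design: the expensive formatting overlaps across threads while the output stream stays sequential.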
