@@ -304,24 +304,39 @@ def multiprocess_extract(
304304) -> List [FilesRow ]:
305305 """
306306 Extract the files from the matches in parallel.
307+
308+ A single unit of work is a tar and all of
309+ the files in it to extract.
307310 """
311+ # A dict of tar -> size of files in it.
312+ # This is because we're trying to balance the load between
313+ # the processes.
308314 tar_to_size_unsorted : DefaultDict [str , float ] = collections .defaultdict (float )
315+ db_row : FilesRow
309316 for db_row in matches :
310317 tar_to_size_unsorted [db_row .tar ] += db_row .size
311318
312319 tar_to_size : collections .OrderedDict [str , float ] = collections .OrderedDict (
313320 sorted (tar_to_size_unsorted .items (), key = lambda x : x [1 ])
314321 )
315322
323+ # We don't want to instantiate more processes than we need to.
324+ # So, if the number of tars is less than the number of workers,
325+ # set the number of workers to the number of tars.
316326 num_workers = min (num_workers , len (tar_to_size ))
317327
328+ # For worker i, workers_to_tars[i] is a set of tars
329+ # that worker i will work on.
318330 # Round-robin assignment for predictable ordering
319331 workers_to_tars : List [set ] = [set () for _ in range (num_workers )]
332+ tar : str
320333 for idx , tar in enumerate (sorted (tar_to_size .keys ())):
321334 workers_to_tars [idx % num_workers ].add (tar )
322335
323336 workers_to_matches : List [List [FilesRow ]] = [[] for _ in range (num_workers )]
337+ workers_idx : int
324338 for db_row in matches :
339+ tar = db_row .tar
325340 for workers_idx in range (len (workers_to_tars )):
326341 if db_row .tar in workers_to_tars [workers_idx ]:
327342 workers_to_matches [workers_idx ].append (db_row )
@@ -336,6 +351,7 @@ def multiprocess_extract(
336351 tar_ordering , manager = manager
337352 )
338353
354+ # The return value for extractFiles will be added here.
339355 failure_queue : multiprocessing .Queue [FilesRow ] = multiprocessing .Queue ()
340356 processes : List [multiprocessing .Process ] = []
341357
@@ -352,6 +368,10 @@ def multiprocess_extract(
352368 process .start ()
353369 processes .append (process )
354370
371+ # While the processes are running, we need to empty the queue.
372+ # Otherwise, it causes hanging.
373+ # No need to join() each of the processes when doing this,
374+ # because we'll be in this loop until completion.
355375 failures : List [FilesRow ] = []
356376 while any (p .is_alive () for p in processes ):
357377 while not failure_queue .empty ():
@@ -361,6 +381,7 @@ def multiprocess_extract(
361381 while not failure_queue .empty ():
362382 failures .append (failure_queue .get ())
363383
384+ # Sort the failures, since they can come in at any order.
364385 failures .sort (key = lambda t : (t .name , t .tar , t .offset ))
365386 return failures
366387
@@ -371,46 +392,61 @@ def check_sizes_match(cur, tfname, error_on_duplicate_tar):
371392 actual_size : int = os .path .getsize (tfname )
372393 name_only : str = os .path .split (tfname )[1 ]
373394
395+ # Get ALL entries for this tar name
374396 cur .execute (
375397 "SELECT size FROM tars WHERE name = ? ORDER by id DESC" , (name_only ,)
376398 )
377399 results = cur .fetchall ()
378400
379401 if not results :
402+ # Cannot access size information; assume the sizes match.
380403 logger .error (f"No database entries found for { name_only } " )
381404 return True
382405
406+ # Check for multiple entries
383407 if len (results ) > 1 :
408+ # Extract just the size values
384409 sizes : List [int ] = [row [0 ] for row in results ]
385410 error_str : str = (
386411 f"Database corruption detected! Found { len (results )} database entries for { name_only } , with sizes { sizes } "
387412 )
388413
389414 if error_on_duplicate_tar :
415+ # Tested by database_corruption.bash Case 5
390416 logger .error (error_str )
391417 raise RuntimeError (error_str )
392418 logger .warning (error_str )
393419
420+ # We ordered the results by id DESC,
421+ # so the first entry is the most recent.
394422 most_recent_size : int = sizes [0 ]
395423 if actual_size == most_recent_size :
424+ # Tested by database_corruption.bash Case 7
425+ # If the actual size matches the most recent size,
426+ # then we can assume that the tar is valid.
396427 logger .info (
397428 f"{ name_only } : The most recent database entry has the same size as the actual file size: { actual_size } ."
398429 )
399430 return True
400431 unique_sizes : Set [int ] = set (sizes )
401432 if actual_size in unique_sizes :
433+ # Tested by database_corruption.bash Case 8
402434 logger .info (
403435 f"{ name_only } : A database entry matches the actual file size, { actual_size } , but it is not the most recent entry."
404436 )
405437 else :
438+ # Tested by database_corruption.bash Case 6
406439 logger .info (
407440 f"{ name_only } : No database entry matches the actual file size: { actual_size } ."
408441 )
409442 return False
410443 else :
444+ # Tested by database_corruption.bash Cases 1,2,4
445+ # Single entry - normal case
411446 logger .info (f"{ name_only } : Found a single database entry." )
412447 expected_size = results [0 ][0 ]
413448
449+ # Now check if actual size matches expected size
414450 if expected_size != actual_size :
415451 error_msg = (
416452 f"{ name_only } : Size mismatch! "
@@ -420,13 +456,16 @@ def check_sizes_match(cur, tfname, error_on_duplicate_tar):
420456 logger .error (error_msg )
421457 return False
422458 else :
459+ # Sizes match
423460 logger .info (f"{ name_only } : Size check passed ({ actual_size } bytes)" )
424461 return True
425462 else :
463+ # Cannot access size information; assume the sizes match.
426464 logger .debug ("Cannot access tar size information; assuming sizes match" )
427465 return True
428466
429467
468+ # FIXME: C901 'extractFiles' is too complex (33)
430469def extractFiles ( # noqa: C901
431470 files : List [FilesRow ],
432471 keep_files : bool ,
@@ -437,7 +476,20 @@ def extractFiles( # noqa: C901
437476 cur : Optional [sqlite3 .Cursor ] = None ,
438477) -> List [FilesRow ]:
439478 """
440- Given a list of database rows, extract the files from the tar archives.
479+ Given a list of database rows, extract the files from the
480+ tar archives to the current location on disk.
481+
482+ If keep_files is False, the files are not extracted.
483+ This is used for when checking if the files in an HPSS
484+ repository are valid.
485+
486+ If keep_tars is True, the tar archives that are downloaded are kept,
487+ even after the program has terminated. Otherwise, they are deleted.
488+
489+ If running in parallel, then multiprocess_worker is the Worker
490+ that called this function.
491+ We need a reference to it so we can signal it to print
492+ the contents of what's in its print queue.
441493
442494 If cur is None (when running in parallel), a new database connection
443495 will be opened for this worker process.
@@ -459,19 +511,26 @@ def extractFiles( # noqa: C901
459511
460512 # Set up logging redirection for multiprocessing
461513 if multiprocess_worker :
514+ # All messages to the logger will now be sent to
515+ # this queue, instead of sys.stdout.
462516 sh = logging .StreamHandler (multiprocess_worker .print_queue )
463517 sh .setLevel (logging .DEBUG )
464518 formatter : logging .Formatter = logging .Formatter ("%(levelname)s: %(message)s" )
465519 sh .setFormatter (formatter )
466520 logger .addHandler (sh )
521+ # Don't have the logger print to the console as the messages come in.
467522 logger .propagate = False
468523
469524 for i in range (nfiles ):
470525 files_row : FilesRow = files [i ]
471526
527+ # Open new tar archive
472528 if newtar :
473529 newtar = False
474530 tfname = os .path .join (cache , files_row .tar )
531+ # Every time we're extracting a new tar, if running in parallel,
532+ # let the process know.
533+ # This is to synchronize the print statements.
475534
476535 # Wait for turn before processing this tar
477536 if multiprocess_worker :
@@ -487,6 +546,8 @@ def extractFiles( # noqa: C901
487546 raise TypeError ("Invalid args.hpss={}" .format (args .hpss ))
488547
489548 tries : int = args .retries + 1
549+ # Set to True to test the `--retries` option with a forced failure.
550+ # Then run `python -m unittest tests.test_extract.TestExtract.testExtractRetries`
490551 test_retry : bool = False
491552
492553 while tries > 0 :
@@ -512,10 +573,12 @@ def extractFiles( # noqa: C901
512573 raise RuntimeError (
513574 f"{ tfname } size does not match expected size."
514575 )
576+ # `hpss_get` successful or not needed: no more tries needed
515577 break
516578 except RuntimeError as e :
517579 if tries > 0 :
518580 logger .info (f"Retrying HPSS get: { tries } tries remaining." )
581+ # Run the try-except block again
519582 continue
520583 else :
521584 raise e
@@ -526,25 +589,34 @@ def extractFiles( # noqa: C901
526589 # Extract file
527590 cmd : str = "Extracting" if keep_files else "Checking"
528591 logger .info (cmd + " %s" % (files_row .name ))
592+ # if multiprocess_worker:
593+ # print('{} is {} {} from {}'.format(multiprocess_worker, cmd, file[1], file[5]))
529594
530595 if keep_files and not should_extract_file (files_row ):
596+ # If we were going to extract, but aren't
597+ # because a matching file is on disk
531598 msg : str = "Not extracting {}, because it"
532599 msg += " already exists on disk with the same"
533600 msg += " size and modification date."
534601 logger .info (msg .format (files_row .name ))
535602
603+ # True if we should actually extract the file from the tar
536604 extract_this_file : bool = keep_files and should_extract_file (files_row )
537605
538606 try :
607+ # Seek file position
539608 if tar .fileobj is not None :
540609 fileobj = tar .fileobj
541610 else :
542611 raise TypeError ("Invalid tar.fileobj={}" .format (tar .fileobj ))
543612 fileobj .seek (files_row .offset )
544613
614+ # Get next member
545615 tarinfo : tarfile .TarInfo = tar .tarinfo .fromtarfile (tar )
546616
547617 if tarinfo .isfile ():
618+ # fileobj to extract
619+ # error: Name 'tarfile.ExFileObject' is not defined
548620 extracted_file : Optional [tarfile .ExFileObject ] = tar .extractfile (tarinfo ) # type: ignore
549621 if extracted_file :
550622 fin : tarfile .ExFileObject = extracted_file
@@ -557,8 +629,11 @@ def extractFiles( # noqa: C901
557629 path , name = os .path .split (fname )
558630 if path != "" and extract_this_file :
559631 if not os .path .isdir (path ):
632+ # The path doesn't exist, so create it.
560633 os .makedirs (path )
561634 if extract_this_file :
635+ # If we're keeping the files,
636+ # then have an output file
562637 fout : _io .BufferedWriter = open (fname , "wb" )
563638
564639 hash_md5 : _hashlib .HASH = hashlib .md5 ()
@@ -577,12 +652,17 @@ def extractFiles( # noqa: C901
577652
578653 md5 : str = hash_md5 .hexdigest ()
579654 if extract_this_file :
655+ # numeric_owner is a required arg in Python 3.
656+ # If True, "only the numbers for user/group names
657+ # are used and not the names".
580658 tar .chown (tarinfo , fname , numeric_owner = False )
581659 tar .chmod (tarinfo , fname )
582660 tar .utime (tarinfo , fname )
661+ # Verify size
583662 if os .path .getsize (fname ) != files_row .size :
584663 logger .error ("size mismatch for: {}" .format (fname ))
585664
665+ # Verify md5 checksum
586666 files_row_md5 : Optional [str ] = files_row .md5
587667 if md5 != files_row_md5 :
588668 logger .error ("md5 mismatch for: {}" .format (fname ))
@@ -597,28 +677,38 @@ def extractFiles( # noqa: C901
597677 tar .extract (tarinfo , filter = "tar" )
598678 else :
599679 tar .extract (tarinfo )
680+ # Note: tar.extract() will not restore time stamps of symbolic
681+ # links. Could not find a Python-way to restore it either, so
682+ # relying here on 'touch'. This is not the prettiest solution.
683+ # Maybe a better one can be implemented later.
600684 if tarinfo .issym ():
601685 tmp1 = tarinfo .mtime
602686 tmp2 : datetime = datetime .fromtimestamp (tmp1 )
603687 tmp3 : str = tmp2 .strftime ("%Y%m%d%H%M.%S" )
604688 os .system ("touch -h -t %s %s" % (tmp3 , tarinfo .name ))
605689
606690 except Exception :
691+ # Catch all exceptions here.
607692 traceback .print_exc ()
608693 logger .error ("Retrieving {}" .format (files_row .name ))
609694 failures .append (files_row )
610695
611696 # Close current archive?
612697 if i == nfiles - 1 or files [i ].tar != files [i + 1 ].tar :
698+ # We're either on the last file or the tar is distinct from the tar of the next file.
699+
700+ # Close current archive file
613701 logger .debug ("Closing tar archive {}" .format (tfname ))
614702 tar .close ()
615703
616704 if multiprocess_worker :
617705 multiprocess_worker .done_enqueuing_output_for_tar (files_row .tar )
618706 multiprocess_worker .print_all_contents ()
619707
708+ # Open new archive next time
620709 newtar = True
621710
711+ # Delete this tar if the corresponding command-line arg was used.
622712 if not keep_tars :
623713 if tfname is not None :
624714 os .remove (tfname )
@@ -630,6 +720,9 @@ def extractFiles( # noqa: C901
630720 cur .close ()
631721 con .close ()
632722
723+ # Add the failures to the queue.
724+ # When running with multiprocessing, the function multiprocess_extract()
725+ # that calls this extractFiles() function will return the failures as a list.
633726 if multiprocess_worker :
634727 for f in failures :
635728 multiprocess_worker .failure_queue .put (f )
def should_extract_file(db_row: "FilesRow") -> bool:
    """
    Decide whether the file described by ``db_row`` should be extracted
    from its tar archive onto the local disk.

    Returns True when no local copy exists, or when the local copy
    differs from the database record in size or in modification time
    (by TIME_TOL seconds or more).
    """
    file_name, size_db, mod_time_db = db_row.name, db_row.size, db_row.mtime

    # A file that is absent locally must always be fetched.
    if not os.path.exists(file_name):
        return True

    size_disk: int = os.path.getsize(file_name)
    mod_time_disk: datetime = datetime.utcfromtimestamp(os.path.getmtime(file_name))

    # Extract only when size or mtime disagree. TIME_TOL absorbs the
    # loss of sub-second precision in the on-disk modification time.
    # NOTE(review): datetime.utcfromtimestamp is deprecated since Python
    # 3.12; kept here to preserve the naive-UTC comparison semantics.
    same_size: bool = size_disk == size_db
    same_time: bool = abs(mod_time_disk - mod_time_db).total_seconds() < TIME_TOL
    return not (same_size and same_time)
0 commit comments