Remove starting_idx parameter (#47)

EmersonFras · web-flow · commit 2b8706439d93 · 2025-12-19T15:13:20.000-05:00
diff --git a/README.md b/README.md
@@ -29,7 +29,7 @@ After downloading the images, cautious-robot calls [`sum-buddy`](https://github.
 ### Command Line Usage
 ```
 usage: cautious-robot [-h] -i [INPUT_FILE] -o [OUTPUT_DIR] [-s [SUBDIR_COL]] [-n [IMG_NAME_COL]] [-u [URL_COL]] [-w WAIT_TIME]
-                      [-r MAX_RETRIES] [-l SIDE_LENGTH] [-x STARTING_IDX] [-a CHECKSUM_ALGORITHM] [-v [VERIFIER_COL]]
+                      [-r MAX_RETRIES] [-l SIDE_LENGTH] [-a CHECKSUM_ALGORITHM] [-v [VERIFIER_COL]]
 
 options:
   -h, --help            show this help message and exit
@@ -53,8 +53,6 @@ optional arguments:
                         max times to retry download on a single image (default: 5)
   -l SIDE_LENGTH, --side-length SIDE_LENGTH
                         number of pixels per side for resized square images (default: no resized images created)
-  -x STARTING_IDX, --starting-idx STARTING_IDX
-                        index of CSV at which to start download (default: 0)
   -a CHECKSUM_ALGORITHM, --checksum-algorithm CHECKSUM_ALGORITHM
                         checksum algorithm to use on images (default: md5, available: sha256, sha384, md5-sha1, blake2b, sha512,
                         sha1, sm3, sha3_256, sha512_256, sha224, sha3_224, ripemd160, sha3_384, shake_128, blake2s, md5, sha3_512,
diff --git a/src/cautiousrobot/__main__.py b/src/cautiousrobot/__main__.py
@@ -37,7 +37,6 @@ def parse_args():
     opt_args.add_argument("-l", "--side-length", required = False,
                         help = "number of pixels per side for resized square images (default: no resized images created)",
                         type = int)
-    opt_args.add_argument("-x", "--starting-idx", default = 0, help = "index of CSV at which to start download (default: 0)", type = int)
     opt_args.add_argument("-a", "--checksum-algorithm", default = 'md5', #choices = available_algorithms,
                         help = f"checksum algorithm to use on images (default: md5, available: {available_algorithms})"
                         )
@@ -161,7 +160,7 @@ def main():
 
     # Validate and handle existing output directory
     img_dir = args.output_dir
-    source_df, filtered_df = check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders, args.starting_idx)
+    source_df, filtered_df = check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders)
 
     # Set up log paths
     log_filepath, error_log_filepath, metadata_path = setup_log_paths(csv_path)
@@ -180,8 +179,7 @@ def main():
                        downsample=args.side_length,
                        file_url=url_col,
                        wait=args.wait_time,
-                       retry=args.max_retries,
-                       starting_index=args.starting_idx)
+                       retry=args.max_retries)
         print(f"Images downloaded from {csv_path} to {img_dir}, with downsampled images in {downsample_dest_path}.")
     else:
         # Download images from urls without downsample copy
@@ -193,8 +191,7 @@ def main():
                        subfolders=subfolders,
                        file_url=url_col,
                        wait=args.wait_time,
-                       retry=args.max_retries,
-                       starting_index=args.starting_idx)
+                       retry=args.max_retries)
         print(f"Images downloaded from {csv_path} to {img_dir}.")
     
     print(f"Download logs are in {log_filepath} and {error_log_filepath}.")
diff --git a/src/cautiousrobot/download.py b/src/cautiousrobot/download.py
@@ -383,7 +383,7 @@ def process_downsampling(data, i, image_name, image_dir_path, downsample_path,
 
 def download_images(data, img_dir, log_filepath, error_log_filepath, filename="filename",
                    subfolders=None, downsample_path=None, downsample=None,
-                   file_url="file_url", wait=3, retry=5, starting_index=0):
+                   file_url="file_url", wait=3, retry=5):
     """
     Download images to img_dir and downsampled images to a chosen downsized image path.
 
@@ -399,7 +399,6 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename="f
     - file_url (str): Name of column to use for image urls (default: 'file_url')
     - wait (int): Seconds to wait between retries for an image (default: 3)
     - retry (int): Max number of times to retry downloading an image (default: 5)
-    - starting_index (int): Index at which to start the download (default: 0)
     
     Returns:
     - None
@@ -409,8 +408,6 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename="f
 
     # Process each row in the DataFrame
     for i in tqdm(data.index):
-        if i < starting_index:
-            continue
 
         # Get URL and handle missing URLs first (before any path operations)
         url = data[file_url][i]
diff --git a/src/cautiousrobot/utils.py b/src/cautiousrobot/utils.py
@@ -84,7 +84,7 @@ def downsample_and_save_image(image_dir_path, image_name, downsample_dir_path, d
         )
         update_log(log=log_errors, index=image_index, filepath=error_log_filepath)
         
-def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders = None, starting_idx = 0):
+def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders = None):
     """
     Checks which files from the CSV already exist in the image directory.
 
@@ -100,7 +100,6 @@ def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders
         source_df (pd.DataFrame): DataFrame loaded from the CSV, containing image metadata.
         filename_col (str): Name of the column in source_df that contains image filenames.
         subfolders (str): Name of the column in source_df that contains subfolder names. (optional)
-        starting_idx (int): Index to start checking from. (optional)
 
     Returns:
         updated_df (pd.DataFrame): DataFrame with new column 'in_img_dir' indicating presence in img_dir.
@@ -113,9 +112,6 @@ def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders
         # Directory doesn't exist, so nothing to check
         df["in_img_dir"] = False
         
-        # If we have a starting index, we still need to mark the skipped ones as True
-        if starting_idx > 0:
-             df.iloc[:starting_idx, df.columns.get_loc("in_img_dir")] = True
         # Return the updated df and the filtered dataframe of items that still need downloading
         filtered_df = df[~df["in_img_dir"]].copy()
         return df, filtered_df
@@ -143,9 +139,6 @@ def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders
     expected_present = df["expected_path"].isin(existing_full_paths)
     df["in_img_dir"] = expected_present.copy()
     
-    if starting_idx > 0:
-        df.iloc[:starting_idx, df.columns.get_loc("in_img_dir")] = True
-    
     # Clean up the temporary column before returning.
     df = df.drop(columns=["expected_path"])
     
diff --git a/tests/test_download_images.py b/tests/test_download_images.py
@@ -328,7 +328,6 @@ def test_main_successful_execution(self, mock_input, mock_exists, mock_BuddyChec
         mock_args.side_length = None
         mock_args.wait_time = 0
         mock_args.max_retries = 3
-        mock_args.starting_idx = 0
         mock_args.checksum_algorithm = 'md5'
         mock_args.verifier_col = None