Skip to content

Commit 2b87064

Browse files
authored
Remove starting_idx parameter (#47)
1 parent fe031cb commit 2b87064

File tree

5 files changed: 6 additions and 22 deletions.

README.md

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ After downloading the images, cautious-robot calls [`sum-buddy`](https://github.
2929
### Command Line Usage
3030
```
3131
usage: cautious-robot [-h] -i [INPUT_FILE] -o [OUTPUT_DIR] [-s [SUBDIR_COL]] [-n [IMG_NAME_COL]] [-u [URL_COL]] [-w WAIT_TIME]
32-
[-r MAX_RETRIES] [-l SIDE_LENGTH] [-x STARTING_IDX] [-a CHECKSUM_ALGORITHM] [-v [VERIFIER_COL]]
32+
[-r MAX_RETRIES] [-l SIDE_LENGTH] [-a CHECKSUM_ALGORITHM] [-v [VERIFIER_COL]]
3333
3434
options:
3535
-h, --help show this help message and exit
@@ -53,8 +53,6 @@ optional arguments:
5353
max times to retry download on a single image (default: 5)
5454
-l SIDE_LENGTH, --side-length SIDE_LENGTH
5555
number of pixels per side for resized square images (default: no resized images created)
56-
-x STARTING_IDX, --starting-idx STARTING_IDX
57-
index of CSV at which to start download (default: 0)
5856
-a CHECKSUM_ALGORITHM, --checksum-algorithm CHECKSUM_ALGORITHM
5957
checksum algorithm to use on images (default: md5, available: sha256, sha384, md5-sha1, blake2b, sha512,
6058
sha1, sm3, sha3_256, sha512_256, sha224, sha3_224, ripemd160, sha3_384, shake_128, blake2s, md5, sha3_512,

src/cautiousrobot/__main__.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,6 @@ def parse_args():
3737
opt_args.add_argument("-l", "--side-length", required = False,
3838
help = "number of pixels per side for resized square images (default: no resized images created)",
3939
type = int)
40-
opt_args.add_argument("-x", "--starting-idx", default = 0, help = "index of CSV at which to start download (default: 0)", type = int)
4140
opt_args.add_argument("-a", "--checksum-algorithm", default = 'md5', #choices = available_algorithms,
4241
help = f"checksum algorithm to use on images (default: md5, available: {available_algorithms})"
4342
)
@@ -161,7 +160,7 @@ def main():
161160

162161
# Validate and handle existing output directory
163162
img_dir = args.output_dir
164-
source_df, filtered_df = check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders, args.starting_idx)
163+
source_df, filtered_df = check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders)
165164

166165
# Set up log paths
167166
log_filepath, error_log_filepath, metadata_path = setup_log_paths(csv_path)
@@ -180,8 +179,7 @@ def main():
180179
downsample=args.side_length,
181180
file_url=url_col,
182181
wait=args.wait_time,
183-
retry=args.max_retries,
184-
starting_index=args.starting_idx)
182+
retry=args.max_retries)
185183
print(f"Images downloaded from {csv_path} to {img_dir}, with downsampled images in {downsample_dest_path}.")
186184
else:
187185
# Download images from urls without downsample copy
@@ -193,8 +191,7 @@ def main():
193191
subfolders=subfolders,
194192
file_url=url_col,
195193
wait=args.wait_time,
196-
retry=args.max_retries,
197-
starting_index=args.starting_idx)
194+
retry=args.max_retries)
198195
print(f"Images downloaded from {csv_path} to {img_dir}.")
199196

200197
print(f"Download logs are in {log_filepath} and {error_log_filepath}.")

src/cautiousrobot/download.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -383,7 +383,7 @@ def process_downsampling(data, i, image_name, image_dir_path, downsample_path,
383383

384384
def download_images(data, img_dir, log_filepath, error_log_filepath, filename="filename",
385385
subfolders=None, downsample_path=None, downsample=None,
386-
file_url="file_url", wait=3, retry=5, starting_index=0):
386+
file_url="file_url", wait=3, retry=5):
387387
"""
388388
Download images to img_dir and downsampled images to a chosen downsized image path.
389389
@@ -399,7 +399,6 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename="f
399399
- file_url (str): Name of column to use for image urls (default: 'file_url')
400400
- wait (int): Seconds to wait between retries for an image (default: 3)
401401
- retry (int): Max number of times to retry downloading an image (default: 5)
402-
- starting_index (int): Index at which to start the download (default: 0)
403402
404403
Returns:
405404
- None
@@ -409,8 +408,6 @@ def download_images(data, img_dir, log_filepath, error_log_filepath, filename="f
409408

410409
# Process each row in the DataFrame
411410
for i in tqdm(data.index):
412-
if i < starting_index:
413-
continue
414411

415412
# Get URL and handle missing URLs first (before any path operations)
416413
url = data[file_url][i]

src/cautiousrobot/utils.py

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,7 @@ def downsample_and_save_image(image_dir_path, image_name, downsample_dir_path, d
8484
)
8585
update_log(log=log_errors, index=image_index, filepath=error_log_filepath)
8686

87-
def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders = None, starting_idx = 0):
87+
def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders = None):
8888
"""
8989
Checks which files from the CSV already exist in the image directory.
9090
@@ -100,7 +100,6 @@ def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders
100100
source_df (pd.DataFrame): DataFrame loaded from the CSV, containing image metadata.
101101
filename_col (str): Name of the column in source_df that contains image filenames.
102102
subfolders (str): Name of the column in source_df that contains subfolder names. (optional)
103-
starting_idx (int): Index to start checking from. (optional)
104103
105104
Returns:
106105
updated_df (pd.DataFrame): DataFrame with new column 'in_img_dir' indicating presence in img_dir.
@@ -113,9 +112,6 @@ def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders
113112
# Directory doesn't exist, so nothing to check
114113
df["in_img_dir"] = False
115114

116-
# If we have a starting index, we still need to mark the skipped ones as True
117-
if starting_idx > 0:
118-
df.iloc[:starting_idx, df.columns.get_loc("in_img_dir")] = True
119115
# Return the updated df and the filtered dataframe of items that still need downloading
120116
filtered_df = df[~df["in_img_dir"]].copy()
121117
return df, filtered_df
@@ -143,9 +139,6 @@ def check_existing_images(csv_path, img_dir, source_df, filename_col, subfolders
143139
expected_present = df["expected_path"].isin(existing_full_paths)
144140
df["in_img_dir"] = expected_present.copy()
145141

146-
if starting_idx > 0:
147-
df.iloc[:starting_idx, df.columns.get_loc("in_img_dir")] = True
148-
149142
# Clean up the temporary column before returning.
150143
df = df.drop(columns=["expected_path"])
151144

tests/test_download_images.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -328,7 +328,6 @@ def test_main_successful_execution(self, mock_input, mock_exists, mock_BuddyChec
328328
mock_args.side_length = None
329329
mock_args.wait_time = 0
330330
mock_args.max_retries = 3
331-
mock_args.starting_idx = 0
332331
mock_args.checksum_algorithm = 'md5'
333332
mock_args.verifier_col = None
334333

0 commit comments

Comments
 (0)