Skip to content

Commit 14239dd

Browse files
committed
fix: enhance cache management and validation logic
- Updated cache saving logic in `DownloadCache` to trigger more frequently, reducing the risk of data loss. - Improved GitHub token validation in `GHFolderDownloadConfig` to support various valid formats and provide clearer error messages. - Refined input validation patterns in `InputValidator` for better accuracy in GitHub username/org checks. - Enhanced regex in `_parse_size_string` for stricter matching of size strings, ensuring valid input. - Improved error handling in `RetryHandler` for better matching of temporary error indicators.
1 parent 0391eb5 commit 14239dd

File tree

6 files changed

+90
-24
lines changed

6 files changed

+90
-24
lines changed

gh_folder_download/cache.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,9 @@ def add_file_to_cache(
208208
self.cache_data[cache_key] = cache_entry
209209
self.logger.debug(f"Added file to cache: {file_path}")
210210

211-
# Save cache periodically (every 10 new entries) or when cache is small
212-
if len(self.cache_data) % 10 == 0 or len(self.cache_data) <= 5:
211+
# Save cache more frequently to prevent data loss
212+
# (every 5 new entries) or when cache is small
213+
if len(self.cache_data) % 5 == 0 or len(self.cache_data) <= 3:
213214
self._save_cache()
214215

215216
def get_cached_checksums(
@@ -283,4 +284,7 @@ def clear_cache(self) -> None:
283284

284285
def finalize(self) -> None:
285286
"""Finalize cache operations (save to disk)."""
286-
self._save_cache()
287+
# Always save to ensure no data loss
288+
if self.cache_data:
289+
self._save_cache()
290+
self.logger.debug("Cache finalized and saved")

gh_folder_download/config.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
import re
67
import sys
78
from pathlib import Path
89
from typing import Any
@@ -146,10 +147,36 @@ class GHFolderDownloadConfig(BaseModel):
146147
@validator("github_token", pre=True)
147148
def validate_github_token(cls, v):
148149
"""Validate GitHub token format."""
149-
if v and not isinstance(v, str):
150+
if not v:
151+
return v
152+
153+
if not isinstance(v, str):
150154
raise ValueError("GitHub token must be a string")
151-
if v and len(v) < 10:
152-
raise ValueError("GitHub token appears to be too short")
155+
156+
v = v.strip()
157+
if not v:
158+
return None
159+
160+
# Check for valid GitHub token formats
161+
162+
is_valid_format = False
163+
164+
if v.startswith("ghp_") and len(v) == 40:
165+
# Classic personal access token
166+
is_valid_format = True
167+
elif v.startswith("github_pat_") and len(v) >= 50:
168+
# Fine-grained personal access token
169+
is_valid_format = True
170+
elif len(v) == 40:
171+
# Legacy format - must be hexadecimal
172+
is_valid_format = bool(re.match(r"^[a-fA-F0-9]{40}$", v))
173+
174+
if not is_valid_format:
175+
raise ValueError(
176+
"Invalid GitHub token format. Expected: classic (ghp_...), "
177+
"fine-grained (github_pat_...), or legacy (40-char hex) token."
178+
)
179+
153180
return v
154181

155182

@@ -277,6 +304,9 @@ def _set_nested_value(self, data: dict[str, Any], path: str, value: Any) -> None
277304

278305
def _convert_env_value(self, value: str) -> Any:
279306
"""Convert environment variable string to appropriate type."""
307+
if not value:
308+
return value
309+
280310
# Boolean values
281311
if value.lower() in ("true", "1", "yes", "on"):
282312
return True
@@ -285,14 +315,22 @@ def _convert_env_value(self, value: str) -> Any:
285315

286316
# Numeric values
287317
try:
318+
# Check for float first (contains decimal point)
288319
if "." in value:
289-
return float(value)
320+
parsed_float = float(value)
321+
# Ensure it's a valid number (not NaN or infinity)
322+
if not (
323+
parsed_float != parsed_float or abs(parsed_float) == float("inf")
324+
):
325+
return parsed_float
290326
else:
291-
return int(value)
292-
except ValueError:
327+
parsed_int = int(value)
328+
return parsed_int
329+
except (ValueError, OverflowError):
330+
# If numeric parsing fails, continue to return as string
293331
pass
294332

295-
# String value
333+
# String value (default)
296334
return value
297335

298336
def save_config(self, file_path: Path | None = None) -> bool:
@@ -375,15 +413,15 @@ def create_sample_config(self, file_path: Path | None = None) -> bool:
375413
# Extension filters (include/exclude specific file types)
376414
# include_extensions: [".py", ".js", ".md"]
377415
# exclude_extensions: [".log", ".tmp"]
378-
416+
379417
# Size filters
380418
# min_size_bytes: 1024 # Minimum file size in bytes
381419
# max_size_bytes: 10485760 # Maximum file size in bytes (10MB)
382-
420+
383421
# Pattern filters (glob patterns)
384422
# include_patterns: ["src/**", "docs/**"]
385423
# exclude_patterns: ["**/test/**", "**/*.pyc"]
386-
424+
387425
# Special filters
388426
exclude_binary: false # Exclude binary files
389427
exclude_large_files: false # Exclude files larger than 10MB

gh_folder_download/main.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -960,14 +960,18 @@ def _parse_size_string(size_str: str) -> int | None:
960960
# Try to extract number and unit
961961
import re
962962

963-
match = re.match(r"^(\d*\.?\d+)\s*([KMGT]?B?)$", size_str)
963+
# Fixed regex: requires at least one digit, properly handles decimal numbers
964+
match = re.match(r"^(\d+(?:\.\d+)?)\s*([KMGT]?B?)$", size_str)
964965
if not match:
965966
return None
966967

967968
number_str, unit = match.groups()
968969

969970
try:
970971
number = float(number_str)
972+
# Ensure positive number
973+
if number < 0:
974+
return None
971975
except ValueError:
972976
return None
973977

gh_folder_download/rate_limiter.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,9 @@ def _update_rate_limit_info(self) -> None:
113113
def _should_update_rate_limit(self) -> bool:
114114
"""Check if rate limit info should be updated."""
115115
# Update every 30 seconds or if we don't have info
116-
return self._core_rate_limit is None or time.time() - self._last_update > 30
116+
# Protected access to _last_update to prevent race conditions
117+
with self._lock:
118+
return self._core_rate_limit is None or time.time() - self._last_update > 30
117119

118120
def _calculate_adaptive_delay(self, rate_limit: RateLimitInfo) -> float:
119121
"""Calculate adaptive delay based on current rate limit status."""

gh_folder_download/retry.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Retry mechanism with exponential backoff for gh-folder-download.
33
"""
44

5+
import re
56
import time
67
from functools import wraps
78
from typing import Any, Callable, cast
@@ -154,7 +155,12 @@ def _is_retryable(self, exception: Exception) -> bool:
154155
"rate limit",
155156
]
156157

157-
return any(indicator in error_msg for indicator in temporary_indicators)
158+
# Use word boundaries for more accurate matching
159+
160+
return any(
161+
re.search(rf"\b{re.escape(indicator)}\b", error_msg)
162+
for indicator in temporary_indicators
163+
)
158164

159165
def _calculate_delay(self, attempt: int, config: RetryConfig) -> float:
160166
"""Calculate delay for the next retry attempt."""

gh_folder_download/validation.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ class InputValidator:
2525
r"^https://github\.com/([a-zA-Z0-9._-]+)/([a-zA-Z0-9._-]+)(?:/tree/([a-zA-Z0-9._/-]+))?(?:/(.+))?$"
2626
)
2727

28-
# Valid GitHub username/org pattern
29-
GITHUB_NAME_PATTERN = re.compile(r"^[a-zA-Z0-9._-]+$")
28+
# Valid GitHub username/org pattern - must start and end with alphanumeric
29+
GITHUB_NAME_PATTERN = re.compile(r"^[a-zA-Z0-9]([a-zA-Z0-9._-]*[a-zA-Z0-9])?$")
3030

3131
def __init__(self):
3232
self.logger = get_logger()
@@ -194,12 +194,24 @@ def validate_github_token(self, token: str | None) -> str | None:
194194
# GitHub token patterns
195195
# Classic personal access tokens: ghp_xxxx (40 chars total)
196196
# Fine-grained personal access tokens: github_pat_xxxx
197-
if not (
198-
(token.startswith("ghp_") and len(token) == 40)
199-
or (token.startswith("github_pat_") and len(token) >= 50)
200-
or (len(token) == 40 and token.isalnum()) # Legacy format
201-
):
202-
self.logger.warning("Token format doesn't match expected GitHub patterns")
197+
# Legacy tokens: 40-char hex strings
198+
is_valid_format = False
199+
200+
if token.startswith("ghp_") and len(token) == 40:
201+
# Classic personal access token
202+
is_valid_format = True
203+
elif token.startswith("github_pat_") and len(token) >= 50:
204+
# Fine-grained personal access token
205+
is_valid_format = True
206+
elif len(token) == 40:
207+
# Legacy format - must be hexadecimal
208+
is_valid_format = bool(re.match(r"^[a-fA-F0-9]{40}$", token))
209+
210+
if not is_valid_format:
211+
raise ValidationError(
212+
"Invalid GitHub token format. Expected: classic (ghp_...), "
213+
"fine-grained (github_pat_...), or legacy (40-char hex) token."
214+
)
203215

204216
# Test token validity
205217
try:

0 commit comments

Comments
 (0)