diff --git a/scripts/1-fetch/wikicommons_fetch.py b/scripts/1-fetch/wikicommons_fetch.py new file mode 100644 index 00000000..808a642d --- /dev/null +++ b/scripts/1-fetch/wikicommons_fetch.py @@ -0,0 +1,1905 @@ +#!/usr/bin/env python +""" +Fetch CC Legal Tool usage data from WikiCommons API. +""" + +# Standard library +import argparse +import csv +import os +import re +import sys +import time +import textwrap +import traceback +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Third-party +import requests +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +__version__ = "1.0.0" +BASE_URL = "https://commons.wikimedia.org/w/api.php" +CC_LICENSE_CATEGORIES = [ + "CC BY 4.0", + "CC BY-SA 4.0", + "CC BY-NC 4.0", + "CC BY-NC-SA 4.0", + "CC BY-NC-ND 4.0", + "CC BY-ND 4.0", + "CC BY 3.0", + "CC BY-SA 3.0", + "CC BY-NC 3.0", + "CC BY-NC-SA 3.0", + "CC BY-NC-ND 3.0", + "CC BY-ND 3.0", + "CC BY 2.5", + "CC BY-SA 2.5", + "CC BY-NC 2.5", + "CC BY-NC-SA 2.5", + "CC BY-NC-ND 2.5", + "CC BY-ND 2.5", + "CC BY 2.0", + "CC BY-SA 2.0", + "CC BY-NC 2.0", + "CC BY-NC-SA 2.0", + "CC BY-NC-ND 2.0", + "CC BY-ND 2.0", + "CC BY 1.0", + "CC BY-SA 1.0", + "CC BY-NC 1.0", + "CC BY-NC-SA 1.0", + "CC BY-NC-ND 1.0", + "CC BY-ND 1.0", + "CC0 1.0", + "Public Domain Mark 1.0", +] + + +def license_to_wikicommons_category(license_name): + """ + Convert human-readable license name to WikiCommons category format. + + WikiCommons uses hyphenated category names (e.g., "CC-BY-4.0") while + we store human-readable names (e.g., "CC BY 4.0"). This function + converts between the two formats. + + Args: + license_name: Human-readable license name (e.g., "CC BY 4.0") + + Returns: + str: WikiCommons category name (e.g., "CC-BY-4.0") + """ + # Handle special case: "Public Domain Mark 1.0" -> "PDM-1.0" + # Note: LICENSE_NORMALIZATION has "PDM-1.0": "PDM 1.0", but we use + # "Public Domain Mark 1.0" in CC_LICENSE_CATEGORIES + if license_name == "Public Domain Mark 1.0": + return "PDM-1.0" + + # Create reverse mapping from LICENSE_NORMALIZATION + # LICENSE_NORMALIZATION maps: "CC-BY-4.0" -> "CC BY 4.0" + # So reverse_map maps: "CC BY 4.0" -> "CC-BY-4.0" + reverse_map = {v: k for k, v in shared.LICENSE_NORMALIZATION.items()} + + # Use reverse mapping if available + if license_name in reverse_map: + return reverse_map[license_name] + + # If not found in mapping, return as-is (shouldn't happen) + msg = ( + f"License '{license_name}' not found in normalization map, " + "using as-is" + ) + LOGGER.warning(msg) + return license_name + + +FILE1_COUNT = shared.path_join(PATHS["data_phase"], "wikicommons_1_count.csv") +HEADER1_COUNT = ["LICENSE", "FILE_COUNT", "PAGE_COUNT"] +QUARTER = os.path.basename(PATHS["data_quarter"]) + + +def test_category_conversion(): + """ + Test the license_to_wikicommons_category() function to ensure + conversions are working correctly. + + This function validates that all license names in CC_LICENSE_CATEGORIES + convert to the expected WikiCommons category format. 
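+
+    Returns:
+        bool: True when every conversion matches its expected value.
+
+    Raises:
+        shared.QuantifyingException: if any conversion fails to match the
+            expected WikiCommons category name.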
+ """ + LOGGER.info("Testing category conversion function...") + + # Expected mappings based on LICENSE_NORMALIZATION + # Format: (input, expected_output, is_special_case) + expected_mappings = [ + ("CC BY 4.0", "CC-BY-4.0", False), + ("CC BY-SA 4.0", "CC-BY-SA-4.0", False), + ("CC BY-NC 4.0", "CC-BY-NC-4.0", False), + ("CC BY-NC-SA 4.0", "CC-BY-NC-SA-4.0", False), + ("CC BY-NC-ND 4.0", "CC-BY-NC-ND-4.0", False), + ("CC BY-ND 4.0", "CC-BY-ND-4.0", False), + ("CC BY 3.0", "CC-BY-3.0", False), + ("CC BY-SA 3.0", "CC-BY-SA-3.0", False), + ("CC BY-NC 3.0", "CC-BY-NC-3.0", False), + ("CC BY-NC-SA 3.0", "CC-BY-NC-SA-3.0", False), + ("CC BY-NC-ND 3.0", "CC-BY-NC-ND-3.0", False), + ("CC BY-ND 3.0", "CC-BY-ND-3.0", False), + ("CC BY 2.5", "CC-BY-2.5", False), + ("CC BY-SA 2.5", "CC-BY-SA-2.5", False), + ("CC BY-NC 2.5", "CC-BY-NC-2.5", False), + ("CC BY-NC-SA 2.5", "CC-BY-NC-SA-2.5", False), + ("CC BY-NC-ND 2.5", "CC-BY-NC-ND-2.5", False), + ("CC BY-ND 2.5", "CC-BY-ND-2.5", False), + ("CC BY 2.0", "CC-BY-2.0", False), + ("CC BY-SA 2.0", "CC-BY-SA-2.0", False), + ("CC BY-NC 2.0", "CC-BY-NC-2.0", False), + ("CC BY-NC-SA 2.0", "CC-BY-NC-SA-2.0", False), + ("CC BY-NC-ND 2.0", "CC-BY-NC-ND-2.0", False), + ("CC BY-ND 2.0", "CC-BY-ND-2.0", False), + ("CC BY 1.0", "CC-BY-1.0", False), + ("CC BY-SA 1.0", "CC-BY-SA-1.0", False), + ("CC BY-NC 1.0", "CC-BY-NC-1.0", False), + ("CC BY-NC-SA 1.0", "CC-BY-NC-SA-1.0", False), + ("CC BY-NC-ND 1.0", "CC-BY-NC-ND-1.0", False), + ("CC BY-ND 1.0", "CC-BY-ND-1.0", False), + ("CC0 1.0", "CC0-1.0", True), # Special case: CC0 + ("Public Domain Mark 1.0", "PDM-1.0", True), # Special case: PDM + ] + + # Test all licenses in CC_LICENSE_CATEGORIES + all_passed = True + special_cases_found = [] + failures = [] + + for input_license, expected_output, is_special in expected_mappings: + actual_output = license_to_wikicommons_category(input_license) + + if actual_output == expected_output: + status = "✓" + if is_special: + special_cases_found.append( + (input_license, expected_output) + ) + else: + status = "✗" + all_passed = False + failures.append( + (input_license, expected_output, actual_output) + ) + + LOGGER.debug( + f"{status} {input_license:25} -> {actual_output:20} " + f"(expected: {expected_output})" + ) + + # Log summary + if special_cases_found: + LOGGER.info( + f"Found {len(special_cases_found)} special cases that need " + "explicit handling:" + ) + for input_license, output in special_cases_found: + LOGGER.info(f" '{input_license}' -> '{output}'") + + if failures: + LOGGER.error(f"Conversion test failed for {len(failures)} licenses:") + for input_license, expected, actual in failures: + LOGGER.error( + f" '{input_license}' -> expected '{expected}', " + f"got '{actual}'" + ) + raise shared.QuantifyingException( + "Category conversion test failed. Please fix the conversion " + "function before proceeding.", + 1 + ) + + if all_passed: + LOGGER.info( + f"✓ All {len(expected_mappings)} category conversions passed!" 
+ ) + + # Test some edge cases + LOGGER.info("Testing edge cases...") + edge_cases = [ + ("Unknown License", "Unknown License"), # Should return as-is + ] + + for input_license, expected_behavior in edge_cases: + actual_output = license_to_wikicommons_category(input_license) + if actual_output == expected_behavior: + LOGGER.debug( + f"✓ Edge case handled: '{input_license}' -> " + f"'{actual_output}' (as expected)" + ) + else: + LOGGER.warning( + f"Edge case '{input_license}' returned '{actual_output}' " + f"(expected '{expected_behavior}')" + ) + + return all_passed + + +def parse_arguments(): + """ + Parse command-line options, returns parsed argument namespace. + """ + LOGGER.info("Parsing command-line options") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions (fetch, merge, add, commit, and push)", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable detailed debug logging including API URLs and raw responses", + ) + parser.add_argument( + "--limit", + type=int, + metavar="N", + help="Process only first N licenses (for quick testing)", + ) + parser.add_argument( + "--skip-validation", + action="store_true", + help="Skip category validation (for faster re-runs)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Test conversions and validation without running full queries", + ) + parser.add_argument( + "--timeout", + type=int, + default=60, + metavar="SECONDS", + help="Set custom timeout for API calls (default: 60)", + ) + parser.add_argument( + "--max-depth", + type=int, + default=4, + metavar="N", + help="Set maximum recursion depth (default: 4)", + ) + parser.add_argument( + "--parallel", + type=int, + default=4, + metavar="N", + help="Enable parallel processing with N workers (default: 4)", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.limit is not None and args.limit < 1: + parser.error("--limit must be at least 1") + if args.timeout < 1: + parser.error("--timeout must be at least 1 second") + if args.max_depth < 1: + parser.error("--max-depth must be at least 1") + if args.parallel < 1: + parser.error("--parallel must be at least 1") + return args + + +def check_for_completion(): + """Check if data fetch is already completed for this quarter.""" + try: + with open(FILE1_COUNT, "r", encoding="utf-8") as file_obj: + reader = csv.DictReader(file_obj, dialect="unix") + if len(list(reader)) >= len(CC_LICENSE_CATEGORIES): + raise shared.QuantifyingException( + f"Data fetch completed for {QUARTER}", 0 + ) + except FileNotFoundError: + pass # File may not be found without --enable-save, etc. 
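+
+# Illustrative sketch (comments only, not executed): the per-quarter CSV
+# read by check_for_completion() above and written by write_data() below
+# uses HEADER1_COUNT, so a completed file looks roughly like:
+#
+#   "LICENSE","FILE_COUNT","PAGE_COUNT"
+#   "CC BY 4.0","1234567","89012"
+#   "CC0 1.0","2345678","90123"
+#
+# (the counts shown are made-up placeholder values)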
+ + +# Constants for timeout and retry configuration (defaults, can be overridden) +CATEGORY_COUNT_TIMEOUT = 300 # 5 minutes timeout for entire category count +INITIAL_BACKOFF = 2 # Initial backoff in seconds (exponential) +MAX_RETRIES = 3 # Maximum retries for failed requests + +# Global cache for category counts (shared across all license queries) +# Format: {category_name: {"files": int, "pages": int, "timestamp": float}} +_category_cache = {} +_cache_lock = threading.Lock() + +# Performance tracking +_cache_hits = 0 +_cache_misses = 0 +_perf_lock = threading.Lock() + + +def get_requests_session(request_timeout=60, debug=False): + """ + Create a requests session with retry logic and timeout handling. + + Args: + request_timeout: Timeout for API requests in seconds + debug: Enable debug logging for API calls + + Returns: + requests.Session: Configured session with retry and timeout + """ + max_retries = Retry( + total=MAX_RETRIES, + backoff_factor=10, + # Add rate limit (429) to retry status list + status_forcelist=shared.RETRY_STATUS_FORCELIST + [429], + allowed_methods=["GET"], # Only retry GET requests + ) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=max_retries)) + headers = { + "User-Agent": shared.USER_AGENT + } + session.headers.update(headers) + + # Store timeout and debug flag in session for later use + session._request_timeout = request_timeout + session._debug = debug + + return session + + +def check_category_exists(session, category_name): + """ + Check if a category exists on WikiCommons. + + This is a lightweight check that only verifies existence, + not the full category info. + + Args: + session: requests.Session object + category_name: Name of the category to check + + Returns: + bool: True if category exists, False otherwise + """ + params = { + "action": "query", + "titles": f"Category:{category_name}", + "format": "json" + } + + timeout = getattr(session, '_request_timeout', 60) + debug = getattr(session, '_debug', False) + + url = BASE_URL + if debug: + LOGGER.debug(f"Checking category existence: {url}") + LOGGER.debug(f" Params: {params}") + + try: + with session.get(url, params=params, timeout=timeout) as response: + if debug: + LOGGER.debug( + f" Response status: {response.status_code}" + ) + LOGGER.debug(f" Response headers: {dict(response.headers)}") + response.raise_for_status() + data = response.json() + if debug: + LOGGER.debug(f" Response data: {data}") + + pages = data.get("query", {}).get("pages", {}) + if not pages: + return False + + # Check if page ID is -1 (missing page) + page_id = list(pages.keys())[0] + return page_id != "-1" + + except (requests.HTTPError, requests.RequestException, KeyError) as e: + if debug: + LOGGER.debug(f" Error: {e}") + return False + + +def suggest_alternative_category_name(category_name): + """ + Suggest alternative category names if the original doesn't exist. 
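+
+    For example, "CC-BY-4.0" yields the variations "CC BY 4.0" and
+    "cc-by-4.0" (hyphen/space and case variants, minus the original).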
+ + Args: + category_name: The category name that wasn't found + + Returns: + list: List of suggested alternative names + """ + suggestions = [] + + # Common variations to try + variations = [ + category_name.replace("-", " "), # Hyphens to spaces + category_name.replace(" ", "-"), # Spaces to hyphens + category_name.upper(), # All caps + category_name.lower(), # All lowercase + ] + + # Remove duplicates and the original + for variation in variations: + if variation != category_name and variation not in suggestions: + suggestions.append(variation) + + return suggestions + + +def validate_categories_exist(session): + """ + Validate that all converted WikiCommons categories actually exist. + + This lightweight check prevents expensive recursive queries on + non-existent categories. + + Args: + session: requests.Session object + + Returns: + list: List of tuples (license_name, category_name) for valid + categories that exist on WikiCommons + """ + LOGGER.info("Validating category existence on WikiCommons...") + + valid_categories = [] + missing_categories = [] + + for license_name in CC_LICENSE_CATEGORIES: + # Convert to WikiCommons format + category_name = license_to_wikicommons_category(license_name) + + # Check if category exists + exists = check_category_exists(session, category_name) + + if exists: + valid_categories.append((license_name, category_name)) + LOGGER.debug( + f"✓ Category exists: '{category_name}' " + f"(for license: '{license_name}')" + ) + else: + missing_categories.append((license_name, category_name)) + LOGGER.warning( + f"✗ Category not found: '{category_name}' " + f"(for license: '{license_name}')" + ) + + # Suggest alternatives + suggestions = suggest_alternative_category_name(category_name) + if suggestions: + LOGGER.info( + f" Suggestions for '{category_name}': " + f"{', '.join(suggestions[:3])}" + ) + + # Summary + total = len(CC_LICENSE_CATEGORIES) + valid_count = len(valid_categories) + missing_count = len(missing_categories) + + LOGGER.info( + f"Category validation complete: {valid_count}/{total} categories " + f"exist, {missing_count} missing" + ) + + if missing_categories: + LOGGER.warning( + f"Found {missing_count} missing categories. These will be " + "skipped or may return zero counts:" + ) + for license_name, category_name in missing_categories: + LOGGER.warning(f" - {license_name} -> {category_name}") + + if valid_count == 0: + raise shared.QuantifyingException( + "No valid categories found! Please check category names and " + "conversion function.", + 1 + ) + + return valid_categories + + +def get_category_info(session, category_name, retry_count=0): + """ + Get file and page count for a specific category with retry logic. 
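+
+    Uses the MediaWiki API (action=query, prop=categoryinfo) and retries
+    on rate limiting (HTTP 429), timeouts, and transient HTTP errors with
+    exponential backoff.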
+ + Args: + session: requests.Session object + category_name: Name of the category to query + retry_count: Current retry attempt (for exponential backoff) + + Returns: + dict: Dictionary with 'files' and 'pages' counts + """ + params = { + "action": "query", + "prop": "categoryinfo", + "titles": f"Category:{category_name}", + "format": "json" + } + + timeout = getattr(session, '_request_timeout', 60) + debug = getattr(session, '_debug', False) + + url = BASE_URL + if debug: + LOGGER.debug(f"Getting category info: {url}") + LOGGER.debug(f" Category: {category_name}") + LOGGER.debug(f" Params: {params}") + LOGGER.debug(f" Retry count: {retry_count}") + + try: + with session.get(url, params=params, timeout=timeout) as response: + if debug: + LOGGER.debug( + f" Response status: {response.status_code}" + ) + LOGGER.debug(f" Response headers: {dict(response.headers)}") + # Handle rate limiting + if response.status_code == 429: + default_backoff = INITIAL_BACKOFF ** (retry_count + 1) + retry_after = int( + response.headers.get("Retry-After", default_backoff) + ) + if retry_count < MAX_RETRIES: + LOGGER.warning( + f"Rate limited. Waiting {retry_after}s before retry " + f"({retry_count + 1}/{MAX_RETRIES})..." + ) + time.sleep(retry_after) + return get_category_info(session, category_name, retry_count + 1) + else: + raise shared.QuantifyingException( + f"Rate limit exceeded after {MAX_RETRIES} retries", 1 + ) + + response.raise_for_status() + data = response.json() + + pages = data.get("query", {}).get("pages", {}) + if not pages: + LOGGER.warning(f"No data found for category: {category_name}") + return {"files": 0, "pages": 0} + + # Get the first (and usually only) page result + page_data = list(pages.values())[0] + categoryinfo = page_data.get("categoryinfo", {}) + + files = categoryinfo.get("files", 0) + pages = categoryinfo.get("pages", 0) + + LOGGER.debug( + f"Category {category_name}: {files} files, {pages} pages" + ) + return {"files": files, "pages": pages} + + except requests.Timeout: + if retry_count < MAX_RETRIES: + backoff = INITIAL_BACKOFF ** (retry_count + 1) + LOGGER.warning( + f"Request timeout for {category_name}. " + f"Retrying in {backoff}s ({retry_count + 1}/{MAX_RETRIES})..." + ) + time.sleep(backoff) + return get_category_info(session, category_name, retry_count + 1) + else: + LOGGER.error( + f"Request timeout after {MAX_RETRIES} retries for " + f"{category_name}" + ) + return {"files": 0, "pages": 0} + except requests.HTTPError as e: + retryable_statuses = [429, 500, 502, 503, 504] + if retry_count < MAX_RETRIES and e.response.status_code in retryable_statuses: + backoff = INITIAL_BACKOFF ** (retry_count + 1) + LOGGER.warning( + f"HTTP {e.response.status_code} for {category_name}. " + f"Retrying in {backoff}s ({retry_count + 1}/{MAX_RETRIES})..." + ) + time.sleep(backoff) + return get_category_info(session, category_name, retry_count + 1) + raise shared.QuantifyingException(f"HTTP Error: {e}", 1) + except requests.RequestException as e: + if retry_count < MAX_RETRIES: + backoff = INITIAL_BACKOFF ** (retry_count + 1) + LOGGER.warning( + f"Request exception for {category_name}: {e}. " + f"Retrying in {backoff}s ({retry_count + 1}/{MAX_RETRIES})..." 
+ ) + time.sleep(backoff) + return get_category_info(session, category_name, retry_count + 1) + raise shared.QuantifyingException(f"Request Exception: {e}", 1) + except KeyError as e: + raise shared.QuantifyingException(f"KeyError: {e}", 1) + + +def get_subcategories(session, category_name, retry_count=0): + """ + Get subcategories for a given category with retry logic. + + Args: + session: requests.Session object + category_name: Name of the parent category + retry_count: Current retry attempt (for exponential backoff) + + Returns: + list: List of subcategory names + """ + params = { + "action": "query", + "list": "categorymembers", + "cmtitle": f"Category:{category_name}", + "cmtype": "subcat", + "cmlimit": "500", # Maximum allowed + "format": "json" + } + + timeout = getattr(session, '_request_timeout', 60) + debug = getattr(session, '_debug', False) + + url = BASE_URL + if debug: + LOGGER.debug(f"Getting subcategories: {url}") + LOGGER.debug(f" Category: {category_name}") + LOGGER.debug(f" Params: {params}") + LOGGER.debug(f" Retry count: {retry_count}") + + try: + with session.get(url, params=params, timeout=timeout) as response: + if debug: + LOGGER.debug( + f" Response status: {response.status_code}" + ) + LOGGER.debug(f" Response headers: {dict(response.headers)}") + # Handle rate limiting + if response.status_code == 429: + default_backoff = INITIAL_BACKOFF ** (retry_count + 1) + retry_after = int( + response.headers.get("Retry-After", default_backoff) + ) + if retry_count < MAX_RETRIES: + LOGGER.warning( + f"Rate limited getting subcategories. " + f"Waiting {retry_after}s " + f"({retry_count + 1}/{MAX_RETRIES})..." + ) + time.sleep(retry_after) + return get_subcategories(session, category_name, retry_count + 1) + else: + LOGGER.error( + f"Rate limit exceeded after {MAX_RETRIES} retries" + ) + return [] + + response.raise_for_status() + data = response.json() + if debug: + LOGGER.debug(f" Response data: {data}") + + members = data.get("query", {}).get("categorymembers", []) + subcategories = [] + + for member in members: + title = member.get("title", "") + if title.startswith("Category:"): + subcat_name = title.replace("Category:", "") + subcategories.append(subcat_name) + + LOGGER.debug( + f"Found {len(subcategories)} subcategories for {category_name}" + ) + return subcategories + + except requests.Timeout: + if retry_count < MAX_RETRIES: + backoff = INITIAL_BACKOFF ** (retry_count + 1) + LOGGER.warning( + f"Timeout getting subcategories for {category_name}. " + f"Retrying in {backoff}s ({retry_count + 1}/{MAX_RETRIES})..." + ) + time.sleep(backoff) + return get_subcategories(session, category_name, retry_count + 1) + else: + LOGGER.error( + f"Timeout after {MAX_RETRIES} retries for {category_name}" + ) + return [] + except requests.HTTPError as e: + retryable_statuses = [429, 500, 502, 503, 504] + if retry_count < MAX_RETRIES and e.response.status_code in retryable_statuses: + backoff = INITIAL_BACKOFF ** (retry_count + 1) + LOGGER.warning( + f"HTTP {e.response.status_code} getting subcategories. " + f"Retrying in {backoff}s " + f"({retry_count + 1}/{MAX_RETRIES})..." + ) + time.sleep(backoff) + return get_subcategories(session, category_name, retry_count + 1) + raise shared.QuantifyingException(f"HTTP Error: {e}", 1) + except requests.RequestException as e: + if retry_count < MAX_RETRIES: + backoff = INITIAL_BACKOFF ** (retry_count + 1) + LOGGER.warning( + f"Request exception getting subcategories: {e}. " + f"Retrying in {backoff}s ({retry_count + 1}/{MAX_RETRIES})..." 
+ ) + time.sleep(backoff) + return get_subcategories(session, category_name, retry_count + 1) + raise shared.QuantifyingException(f"Request Exception: {e}", 1) + except KeyError as e: + raise shared.QuantifyingException(f"KeyError: {e}", 1) + + +def get_cached_category_count(category_name): + """ + Get cached category count if available. + + Args: + category_name: Name of the category + + Returns: + dict or None: Cached count dict or None if not cached + """ + with _cache_lock: + result = _category_cache.get(category_name) + if result is not None: + with _perf_lock: + global _cache_hits + _cache_hits += 1 + else: + with _perf_lock: + global _cache_misses + _cache_misses += 1 + return result + + +def set_cached_category_count(category_name, counts): + """ + Cache category count for future use. + + Args: + category_name: Name of the category + counts: Dictionary with 'files' and 'pages' counts + """ + with _cache_lock: + _category_cache[category_name] = { + "files": counts["files"], + "pages": counts["pages"], + "timestamp": time.time() + } + + +def clear_category_cache(): + """Clear the global category cache.""" + with _cache_lock: + _category_cache.clear() + LOGGER.info("Category cache cleared") + + +def get_cache_stats(): + """ + Get statistics about the category cache. + + Returns: + dict: Cache statistics + """ + with _cache_lock: + with _perf_lock: + total_requests = _cache_hits + _cache_misses + hit_rate = ( + (_cache_hits / total_requests * 100) + if total_requests > 0 else 0 + ) + return { + "size": len(_category_cache), + "categories": list(_category_cache.keys()), + "hits": _cache_hits, + "misses": _cache_misses, + "hit_rate": hit_rate + } + + +def recursively_count_category( + session, + category_name, + visited=None, + depth=0, + max_retries=MAX_RETRIES, + start_time=None, + max_depth=4 +): + """ + Recursively count files and pages in a category and its subcategories. + + Uses global caching to avoid duplicate work across different license + queries. Implements recursion depth limiting to prevent infinite recursion. + + Args: + session: requests.Session object + category_name: Name of the category to count (WikiCommons format) + visited: Set of already visited categories (for cycle detection) + depth: Current recursion depth + max_retries: Maximum number of retries for failed operations + start_time: Start time for timeout checking (None = no timeout) + max_depth: Maximum recursion depth (default: 4) + + Returns: + dict: Dictionary with 'files' and 'pages' counts + """ + # Check timeout for entire category count operation + if start_time is not None: + elapsed = time.time() - start_time + if elapsed > CATEGORY_COUNT_TIMEOUT: + LOGGER.warning( + f"Category count timeout ({CATEGORY_COUNT_TIMEOUT}s) " + f"for {category_name} at depth {depth}" + ) + return {"files": 0, "pages": 0} + + # Check recursion depth limit + if depth >= max_depth: + LOGGER.debug( + f"Max recursion depth ({max_depth}) reached for " + f"{category_name}. Stopping recursion." 
+ ) + # Still get direct counts for this level + try: + counts = get_category_info(session, category_name) + return counts if counts else {"files": 0, "pages": 0} + except Exception: + return {"files": 0, "pages": 0} + + # Check global cache first + cached = get_cached_category_count(category_name) + if cached is not None: + LOGGER.debug( + f"Using cached result for {category_name}: " + f"{cached['files']} files, {cached['pages']} pages" + ) + return {"files": cached["files"], "pages": cached["pages"]} + + if visited is None: + visited = set() + + if category_name in visited: + LOGGER.warning(f"Cycle detected for category: {category_name}") + return {"files": 0, "pages": 0} + + visited.add(category_name) + + # Get direct counts for this category with retry logic + try: + counts = get_category_info(session, category_name) + except Exception as e: + LOGGER.error( + f"Failed to get category info for {category_name}: {e}" + ) + return {"files": 0, "pages": 0} + + # Handle case where category doesn't exist (returns None) + if counts is None: + LOGGER.warning(f"Category {category_name} not found or has no data") + return {"files": 0, "pages": 0} + + # Get subcategories and recursively count them + try: + subcategories = get_subcategories(session, category_name) + except Exception as e: + LOGGER.error( + f"Failed to get subcategories for {category_name}: {e}" + ) + # Cache and return what we have so far + set_cached_category_count(category_name, counts) + return counts + + # Recursively count subcategories (with depth limit) + # Note: Don't use visited.copy() - share the same visited set to prevent + # revisiting categories across different branches + for subcat in subcategories: + if subcat not in visited: # Avoid infinite recursion + try: + subcat_counts = recursively_count_category( + session, + subcat, + visited, + depth + 1, + max_retries, + start_time, + max_depth + ) + counts["files"] += subcat_counts["files"] + counts["pages"] += subcat_counts["pages"] + except Exception as e: + LOGGER.warning( + f"Failed to count subcategory {subcat}: {e}. " + "Continuing with other subcategories..." + ) + # Continue with other subcategories + + # Cache the result for future use + set_cached_category_count(category_name, counts) + + return counts + + +def write_data(args, license_data): + """Write the collected data to CSV file.""" + if not args.enable_save: + return args + + # Create data directory for this phase + os.makedirs(PATHS["data_phase"], exist_ok=True) + + if len(license_data) < len(CC_LICENSE_CATEGORIES): + LOGGER.error("Unable to fetch all records. Aborting.") + return args + + with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj: + writer = csv.DictWriter( + file_obj, fieldnames=HEADER1_COUNT, dialect="unix" + ) + writer.writeheader() + for row in license_data: + writer.writerow(row) + + LOGGER.info(f"Data written to {FILE1_COUNT}") + return args + + +def format_time_remaining(seconds): + """ + Format time remaining in a human-readable format. + + Args: + seconds: Number of seconds remaining + + Returns: + str: Formatted time string (e.g., "2h 30m 15s") + """ + if seconds < 60: + return f"{int(seconds)}s" + elif seconds < 3600: + minutes = int(seconds // 60) + secs = int(seconds % 60) + return f"{minutes}m {secs}s" + else: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + return f"{hours}h {minutes}m {secs}s" + + +def print_startup_banner(args): + """ + Print a professional startup banner with script information. 
+ + Args: + args: Parsed command-line arguments + """ + LOGGER.info("=" * 80) + LOGGER.info("WikiCommons CC License Data Fetcher") + LOGGER.info(f"Version {__version__}") + LOGGER.info("=" * 80) + LOGGER.info("Configuration:") + LOGGER.info(f" Quarter: {QUARTER}") + LOGGER.info(f" Timeout: {args.timeout}s") + LOGGER.info(f" Max Depth: {args.max_depth}") + LOGGER.info(f" Parallel Workers: {args.parallel}") + if args.limit: + LOGGER.info(f" Limit: {args.limit} licenses (testing mode)") + if args.skip_validation: + LOGGER.info(" Validation: SKIPPED") + if args.debug: + LOGGER.info(" Debug Mode: ENABLED") + if args.dry_run: + LOGGER.info(" Mode: DRY RUN") + LOGGER.info("=" * 80) + + +def process_single_license( + session, license_name, category_name, index, total_licenses, start_time, + max_depth=4 +): + """ + Process a single license category count. + + This function is designed to be called in parallel by ThreadPoolExecutor. + + Args: + session: requests.Session object (thread-safe) + license_name: Human-readable license name + category_name: WikiCommons category name + index: Current index (for progress reporting) + total_licenses: Total number of licenses + start_time: Start time of overall operation + + Returns: + tuple: (license_name, result_dict, elapsed_time, success) + """ + license_start_time = time.time() + progress_pct = (index / total_licenses) * 100 + + LOGGER.info( + f"[{index}/{total_licenses}] ({progress_pct:.1f}%) " + f"Processing: {license_name}" + ) + LOGGER.info(f" Category: {category_name}") + + try: + # Use timeout for entire category count + counts = recursively_count_category( + session, + category_name, + visited=None, + depth=0, + start_time=license_start_time, + max_depth=max_depth + ) + license_elapsed = time.time() - license_start_time + + LOGGER.info( + f" ✓ Completed in {license_elapsed:.1f}s - " + f"{counts['files']:,} files, {counts['pages']:,} pages" + ) + + return ( + license_name, + { + "LICENSE": license_name, + "FILE_COUNT": counts["files"], + "PAGE_COUNT": counts["pages"] + }, + license_elapsed, + True, + None + ) + + except Exception as e: + license_elapsed = time.time() - license_start_time + LOGGER.error( + f" ✗ Failed after {license_elapsed:.1f}s: {e}" + ) + + return ( + license_name, + { + "LICENSE": license_name, + "FILE_COUNT": 0, + "PAGE_COUNT": 0 + }, + license_elapsed, + False, + str(e) + ) + + +def query_wikicommons(args, session, valid_categories): + """ + Query WikiCommons API for CC license data with progress tracking. + + Uses parallel processing with ThreadPoolExecutor for better performance. + Uses global caching to avoid duplicate category queries. + + Args: + args: Parsed command-line arguments + session: requests.Session object + valid_categories: List of tuples (license_name, category_name) for + categories that have been validated to exist + + Returns: + tuple: (license_data list, summary dict with success/failure counts) + """ + # Apply limit if specified + if args.limit is not None: + valid_categories = valid_categories[:args.limit] + LOGGER.info(f"Limited to first {args.limit} licenses for testing") + + total_licenses = len(valid_categories) + start_time = time.time() + success_count = 0 + failure_count = 0 + failed_licenses = [] + license_data = [] + + # Clear cache at start (optional - can be removed to persist cache) + # clear_category_cache() + + max_workers = args.parallel + max_depth = args.max_depth + + LOGGER.info( + f"Starting to process {total_licenses} licenses " + f"with {max_workers} parallel workers..." 
+ ) + LOGGER.info(f"Using max recursion depth: {max_depth}") + LOGGER.info( + f"Using cache: {len(_category_cache)} categories already cached" + ) + + # Use ThreadPoolExecutor for parallel processing + # Note: requests.Session is not fully thread-safe, so we create + # a session per thread + def create_session(): + """Create a new session for each thread.""" + return get_requests_session( + request_timeout=args.timeout, + debug=args.debug + ) + + # Process licenses in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_license = {} + for index, (license_name, category_name) in enumerate( + valid_categories, 1 + ): + # Create a new session for each task to ensure thread safety + task_session = create_session() + future = executor.submit( + process_single_license, + task_session, + license_name, + category_name, + index, + total_licenses, + start_time, + max_depth + ) + future_to_license[future] = (license_name, index) + + # Process completed tasks as they finish + completed = 0 + for future in as_completed(future_to_license): + completed += 1 + license_name, index = future_to_license[future] + + try: + ( + result_license_name, + result_data, + elapsed, + success, + error + ) = future.result() + + license_data.append(result_data) + + if success: + success_count += 1 + else: + failure_count += 1 + if error: + failed_licenses.append((license_name, error)) + + # Log progress + remaining = total_licenses - completed + if remaining > 0: + avg_time = (time.time() - start_time) / completed + estimated_remaining = avg_time * remaining + time_remaining_str = format_time_remaining( + estimated_remaining + ) + LOGGER.info( + f"Progress: {completed}/{total_licenses} completed. " + f"Estimated time remaining: {time_remaining_str}" + ) + + except Exception as e: + failure_count += 1 + failed_licenses.append((license_name, str(e))) + license_data.append({ + "LICENSE": license_name, + "FILE_COUNT": 0, + "PAGE_COUNT": 0 + }) + LOGGER.error( + f"Unexpected error processing {license_name}: {e}" + ) + + # Sort license_data by original order + license_dict = {item["LICENSE"]: item for item in license_data} + license_data = [ + license_dict.get(license_name, { + "LICENSE": license_name, + "FILE_COUNT": 0, + "PAGE_COUNT": 0 + }) + for license_name, _ in valid_categories + ] + + # Add entries for missing categories with zero counts + processed_licenses = { + license_name for license_name, _ in valid_categories + } + skipped_licenses = [] + for license_name in CC_LICENSE_CATEGORIES: + if license_name not in processed_licenses: + skipped_licenses.append(license_name) + LOGGER.warning( + f"Skipping '{license_name}' - category not found on " + "WikiCommons" + ) + license_data.append({ + "LICENSE": license_name, + "FILE_COUNT": 0, + "PAGE_COUNT": 0 + }) + + total_time = time.time() - start_time + + # Calculate total files and pages + total_files = sum(item.get("FILE_COUNT", 0) for item in license_data) + total_pages = sum(item.get("PAGE_COUNT", 0) for item in license_data) + + # Get cache statistics + cache_stats = get_cache_stats() + + summary = { + "total": total_licenses, + "success": success_count, + "failure": failure_count, + "failed_licenses": failed_licenses, + "skipped": len(skipped_licenses), + "skipped_licenses": skipped_licenses, + "total_time": total_time, + "avg_time_per_license": ( + total_time / total_licenses if total_licenses > 0 else 0 + ), + "total_files": total_files, + "total_pages": total_pages, + "cache_size": cache_stats["size"], + 
"cache_hits": cache_stats["hits"], + "cache_misses": cache_stats["misses"], + "cache_hit_rate": cache_stats["hit_rate"] + } + + return license_data, summary + + +def check_popular_licenses(license_data): + """ + Check that popular licenses have reasonable file counts. + + Args: + license_data: List of dicts with LICENSE and FILE_COUNT keys + + Returns: + list: List of warnings for licenses with suspiciously low counts + """ + warnings = [] + popular_licenses = { + "CC BY 4.0": 1000, + "CC BY-SA 4.0": 1000, + "CC0 1.0": 1000, + } + + license_dict = {item["LICENSE"]: item for item in license_data} + + for license_name, min_count in popular_licenses.items(): + if license_name in license_dict: + file_count = license_dict[license_name]["FILE_COUNT"] + if file_count < min_count: + warnings.append( + f"⚠️ {license_name} has only {file_count:,} files " + f"(expected at least {min_count:,})" + ) + + return warnings + + +def extract_license_version(license_name): + """ + Extract version number from license name. + + Args: + license_name: License name like "CC BY 4.0" or "CC0 1.0" + + Returns: + float or None: Version number (e.g., 4.0, 3.0, 1.0) or None + """ + # Match version patterns like "4.0", "3.0", "1.0", "2.5", "2.0" + match = re.search(r'(\d+\.\d+)', license_name) + if match: + return float(match.group(1)) + return None + + +def extract_license_type(license_name): + """ + Extract license type from license name. + + Args: + license_name: License name like "CC BY 4.0" or "CC BY-SA 3.0" + + Returns: + str: License type (e.g., "CC BY", "CC BY-SA", "CC0", "PDM") + """ + if license_name.startswith("CC0"): + return "CC0" + if license_name.startswith("Public Domain Mark"): + return "PDM" + # Extract base type (e.g., "CC BY", "CC BY-SA", "CC BY-NC") + parts = license_name.split() + if len(parts) >= 2: + if parts[1] == "BY": + if len(parts) >= 3 and parts[2] in ["SA", "NC", "ND"]: + return f"CC BY-{parts[2]}" + return "CC BY" + elif parts[1] == "BY-SA": + return "CC BY-SA" + elif parts[1] == "BY-NC": + if len(parts) >= 3 and parts[2] == "SA": + return "CC BY-NC-SA" + elif len(parts) >= 3 and parts[2] == "ND": + return "CC BY-NC-ND" + return "CC BY-NC" + elif parts[1] == "BY-ND": + return "CC BY-ND" + return "Unknown" + + +def verify_version_progression(license_data): + """ + Verify that newer license versions generally have more files. 
+ + Args: + license_data: List of dicts with LICENSE and FILE_COUNT keys + + Returns: + list: List of warnings for version progression anomalies + """ + warnings = [] + + # Group licenses by type + license_dict = {item["LICENSE"]: item for item in license_data} + license_groups = {} + + for license_name, data in license_dict.items(): + license_type = extract_license_type(license_name) + version = extract_license_version(license_name) + + if license_type and version: + if license_type not in license_groups: + license_groups[license_type] = [] + license_groups[license_type].append({ + "name": license_name, + "version": version, + "count": data["FILE_COUNT"] + }) + + # Check each license type group + for license_type, licenses in license_groups.items(): + # Sort by version (newest first) + licenses.sort(key=lambda x: x["version"], reverse=True) + + # Check that newer versions generally have more files + for i in range(len(licenses) - 1): + newer = licenses[i] + older = licenses[i + 1] + + # Allow some tolerance (older can be up to 50% of newer) + # but flag if older has significantly more + if older["count"] > newer["count"] * 1.5: + warnings.append( + f"⚠️ Version anomaly: {older['name']} " + f"({older['count']:,} files) has more than " + f"{newer['name']} ({newer['count']:,} files)" + ) + + return warnings + + +def compare_license_counts(license_data): + """ + Compare counts across licenses and flag anomalies. + + Args: + license_data: List of dicts with LICENSE and FILE_COUNT keys + + Returns: + list: List of warnings for count anomalies + """ + warnings = [] + license_dict = {item["LICENSE"]: item for item in license_data} + + # Calculate statistics + counts = [item["FILE_COUNT"] for item in license_data] + if not counts: + return warnings + + max_count = max(counts) + min_count = min(counts) + avg_count = sum(counts) / len(counts) + + # Flag licenses with unusually high or low counts + for license_name, data in license_dict.items(): + count = data["FILE_COUNT"] + + # Flag if count is unusually high (more than 10x average) + if count > avg_count * 10 and avg_count > 0: + warnings.append( + f"⚠️ {license_name} has unusually high count: " + f"{count:,} files (avg: {avg_count:,.0f})" + ) + + # Flag if count is zero for non-special licenses + if count == 0 and license_name not in ["CC0 1.0", "Public Domain Mark 1.0"]: + # Check if it's an old version (1.0, 2.0) which might legitimately be 0 + version = extract_license_version(license_name) + if version and version >= 2.5: + warnings.append( + f"⚠️ {license_name} has zero files (may indicate " + "category issue)" + ) + + return warnings + + +def validate_special_licenses(license_data): + """ + Validate that CC0-1.0 and PDM-1.0 return reasonable file counts. 
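+
+    Flags CC0 1.0 below 10,000 files and Public Domain Mark 1.0 below
+    1,000 files as suspicious.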
+ + Args: + license_data: List of dicts with LICENSE and FILE_COUNT keys + + Returns: + list: List of warnings for special license issues + """ + warnings = [] + license_dict = {item["LICENSE"]: item for item in license_data} + + special_licenses = { + "CC0 1.0": { + "min_reasonable": 10000, # CC0 should have many files + "description": "CC0" + }, + "Public Domain Mark 1.0": { + "min_reasonable": 1000, # PDM should have some files + "description": "Public Domain Mark" + } + } + + for license_name, criteria in special_licenses.items(): + if license_name in license_dict: + count = license_dict[license_name]["FILE_COUNT"] + min_reasonable = criteria["min_reasonable"] + + if count < min_reasonable: + warnings.append( + f"⚠️ {criteria['description']} ({license_name}) has " + f"only {count:,} files (expected at least " + f"{min_reasonable:,})" + ) + elif count == 0: + warnings.append( + f"⚠️ {criteria['description']} ({license_name}) has " + "zero files - this may indicate a category naming issue" + ) + + return warnings + + +def validate_total_counts(license_data): + """ + Validate that total file counts make sense (should be millions). + + Args: + license_data: List of dicts with LICENSE and FILE_COUNT keys + + Returns: + list: List of warnings for total count issues + """ + warnings = [] + + total_files = sum(item["FILE_COUNT"] for item in license_data) + + # WikiCommons should have millions of CC-licensed files + if total_files < 1_000_000: + warnings.append( + f"⚠️ Total file count ({total_files:,}) is suspiciously low. " + "Expected millions of files on WikiCommons." + ) + elif total_files < 5_000_000: + warnings.append( + f"⚠️ Total file count ({total_files:,}) is lower than " + "expected. WikiCommons typically has tens of millions of " + "CC-licensed files." + ) + else: + LOGGER.info( + f"✓ Total file count: {total_files:,} files (reasonable)" + ) + + return warnings + + +def perform_sanity_checks(license_data): + """ + Perform all sanity checks and validations on license data. + + Args: + license_data: List of dicts with LICENSE and FILE_COUNT keys + + Returns: + dict: Dictionary with all warnings organized by category + """ + LOGGER.info("Performing sanity checks and validations...") + + checks = { + "popular_licenses": check_popular_licenses(license_data), + "version_progression": verify_version_progression(license_data), + "count_anomalies": compare_license_counts(license_data), + "special_licenses": validate_special_licenses(license_data), + "total_counts": validate_total_counts(license_data), + } + + total_warnings = sum(len(w) for w in checks.values()) + + if total_warnings == 0: + LOGGER.info("✓ All sanity checks passed!") + else: + LOGGER.warning( + f"Found {total_warnings} warning(s) across sanity checks" + ) + + return checks + + +def print_summary_report(summary, sanity_checks=None, license_data=None): + """ + Print a comprehensive summary report of the query operation. 
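+
+    The report covers processing, data, and performance statistics, cache
+    usage, and any data quality warnings from the sanity checks.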
+ + Args: + summary: Dictionary with summary statistics + sanity_checks: Optional dict with sanity check results + license_data: Optional list of license data for detailed stats + """ + LOGGER.info("") + LOGGER.info("=" * 80) + LOGGER.info("EXECUTION SUMMARY REPORT") + LOGGER.info("=" * 80) + + # Processing Statistics + LOGGER.info("PROCESSING STATISTICS") + LOGGER.info("-" * 80) + LOGGER.info(f"Total licenses in dataset: {len(CC_LICENSE_CATEGORIES)}") + LOGGER.info(f"Licenses processed: {summary['total']}") + LOGGER.info(f" ✓ Successful: {summary['success']}") + LOGGER.info(f" ✗ Failed: {summary['failure']}") + if summary.get('skipped', 0) > 0: + LOGGER.info(f" ⊘ Skipped: {summary['skipped']}") + + if summary['failure'] > 0: + success_rate = (summary['success'] / summary['total']) * 100 + LOGGER.warning(f"Success rate: {success_rate:.1f}%") + LOGGER.warning("Failed licenses:") + for license_name, error in summary['failed_licenses']: + LOGGER.warning(f" - {license_name}: {error}") + else: + LOGGER.info("Success rate: 100.0%") + + if summary.get('skipped_licenses'): + LOGGER.warning("Skipped licenses (category not found):") + for license_name in summary['skipped_licenses']: + LOGGER.warning(f" - {license_name}") + + # Data Statistics + LOGGER.info("") + LOGGER.info("DATA STATISTICS") + LOGGER.info("-" * 80) + total_files = summary.get('total_files', 0) + total_pages = summary.get('total_pages', 0) + LOGGER.info(f"Total files counted: {total_files:,}") + LOGGER.info(f"Total pages counted: {total_pages:,}") + + if license_data: + # Find licenses with suspiciously low counts + low_count_licenses = [] + for item in license_data: + file_count = item.get("FILE_COUNT", 0) + license_name = item.get("LICENSE", "Unknown") + # Flag licenses with less than 100 files (except old versions) + version = extract_license_version(license_name) + if file_count < 100 and (not version or version >= 3.0): + low_count_licenses.append((license_name, file_count)) + + if low_count_licenses: + LOGGER.warning("") + LOGGER.warning("Licenses with suspiciously low counts (< 100 files):") + for license_name, count in sorted( + low_count_licenses, key=lambda x: x[1] + ): + LOGGER.warning(f" - {license_name}: {count:,} files") + + # Performance Statistics + LOGGER.info("") + LOGGER.info("PERFORMANCE STATISTICS") + LOGGER.info("-" * 80) + LOGGER.info(f"Total execution time: {format_time_remaining(summary['total_time'])}") + LOGGER.info( + f"Average time per license: " + f"{summary['avg_time_per_license']:.1f}s" + ) + if summary['total'] > 0: + licenses_per_minute = (summary['total'] / summary['total_time']) * 60 + LOGGER.info(f"Processing rate: {licenses_per_minute:.2f} licenses/min") + + # Cache Statistics + if 'cache_size' in summary: + LOGGER.info("") + LOGGER.info("CACHE STATISTICS") + LOGGER.info("-" * 80) + LOGGER.info(f"Categories cached: {summary['cache_size']}") + if 'cache_hits' in summary and 'cache_misses' in summary: + total_cache_requests = ( + summary['cache_hits'] + summary['cache_misses'] + ) + LOGGER.info(f"Cache hits: {summary['cache_hits']:,}") + LOGGER.info(f"Cache misses: {summary['cache_misses']:,}") + if total_cache_requests > 0: + LOGGER.info( + f"Cache hit rate: {summary.get('cache_hit_rate', 0):.1f}%" + ) + + # Sanity Checks & Validation + if sanity_checks: + LOGGER.info("") + LOGGER.info("DATA QUALITY & VALIDATION") + LOGGER.info("-" * 80) + + total_warnings = sum(len(w) for w in sanity_checks.values()) + + if total_warnings == 0: + LOGGER.info("✓ All sanity checks passed!") + else: + 
LOGGER.warning(f"Found {total_warnings} warning(s) across sanity checks:") + + if sanity_checks.get("popular_licenses"): + LOGGER.warning("") + LOGGER.warning("Popular License Checks:") + for warning in sanity_checks["popular_licenses"]: + LOGGER.warning(f" {warning}") + + if sanity_checks.get("version_progression"): + LOGGER.warning("") + LOGGER.warning("Version Progression Issues:") + for warning in sanity_checks["version_progression"]: + LOGGER.warning(f" {warning}") + + if sanity_checks.get("count_anomalies"): + LOGGER.warning("") + LOGGER.warning("Count Anomalies:") + for warning in sanity_checks["count_anomalies"]: + LOGGER.warning(f" {warning}") + + if sanity_checks.get("special_licenses"): + LOGGER.warning("") + LOGGER.warning("Special License Issues:") + for warning in sanity_checks["special_licenses"]: + LOGGER.warning(f" {warning}") + + if sanity_checks.get("total_counts"): + LOGGER.warning("") + LOGGER.warning("Total Count Issues:") + for warning in sanity_checks["total_counts"]: + LOGGER.warning(f" {warning}") + + LOGGER.info("=" * 80) + + +def main(): + """Main function to orchestrate the WikiCommons data fetch.""" + script_start_time = time.time() + exit_code = 0 + + try: + args = parse_arguments() + + # Print startup banner + print_startup_banner(args) + + shared.paths_log(LOGGER, PATHS) + + # Set debug logging level if requested + if args.debug: + import logging + LOGGER.setLevel(logging.DEBUG) + LOGGER.debug("Debug logging enabled") + + # Test category conversion before proceeding + test_category_conversion() + + # Handle dry-run mode + if args.dry_run: + LOGGER.info("DRY RUN MODE: Testing conversions and validation only") + session = get_requests_session( + request_timeout=args.timeout, + debug=args.debug + ) + + # Validate categories exist + if not args.skip_validation: + valid_categories = validate_categories_exist(session) + LOGGER.info( + f"Dry run complete: {len(valid_categories)} categories " + "validated" + ) + else: + LOGGER.info("Dry run complete: Skipped validation") + + total_time = time.time() - script_start_time + LOGGER.info("") + LOGGER.info(f"Dry run completed in {format_time_remaining(total_time)}") + return 0 + + check_for_completion() + + session = get_requests_session( + request_timeout=args.timeout, + debug=args.debug + ) + + # Validate categories exist before expensive recursive queries + if args.skip_validation: + LOGGER.info("Skipping category validation (--skip-validation)") + # Create valid_categories from all licenses without validation + valid_categories = [ + (license_name, license_to_wikicommons_category(license_name)) + for license_name in CC_LICENSE_CATEGORIES + ] + else: + valid_categories = validate_categories_exist(session) + + license_data, summary = query_wikicommons( + args, session, valid_categories + ) + + # Perform sanity checks and validations + sanity_checks = perform_sanity_checks(license_data) + + # Print comprehensive summary report + print_summary_report(summary, sanity_checks, license_data) + + # Write data to file + args = write_data(args, license_data) + + # Git operations + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new WikiCommons data for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + # Calculate total execution time + total_execution_time = time.time() - script_start_time + + # Print final success message + LOGGER.info("") + LOGGER.info("=" * 80) + LOGGER.info("EXECUTION COMPLETED SUCCESSFULLY") + LOGGER.info("=" * 80) + 
LOGGER.info(f"Total execution time: {format_time_remaining(total_execution_time)}") + + if args.enable_save: + LOGGER.info(f"Output file: {FILE1_COUNT}") + LOGGER.info(f"Total records written: {len(license_data)}") + + # Determine exit code based on results + if summary['failure'] > 0: + exit_code = 1 + LOGGER.warning( + f"Completed with {summary['failure']} failure(s). " + "Please review the summary report above." + ) + else: + total_warnings = ( + sum(len(w) for w in sanity_checks.values()) + if sanity_checks else 0 + ) + if total_warnings > 0: + exit_code = 0 # Warnings don't fail the script + LOGGER.info( + f"Completed with {total_warnings} warning(s). " + "Data quality checks completed." + ) + else: + LOGGER.info("✓ All operations completed successfully!") + LOGGER.info("✓ All data quality checks passed!") + + LOGGER.info("") + LOGGER.info("Next steps:") + LOGGER.info(" 1. Review the output file for data accuracy") + LOGGER.info(" 2. Check the summary report for any warnings") + if args.enable_save: + LOGGER.info(f" 3. Process the data file: {FILE1_COUNT}") + LOGGER.info("=" * 80) + + return exit_code + + except shared.QuantifyingException as e: + total_execution_time = time.time() - script_start_time + LOGGER.error("") + LOGGER.error("=" * 80) + LOGGER.error("EXECUTION FAILED") + LOGGER.error("=" * 80) + LOGGER.error(f"Error: {e.message}") + LOGGER.error(f"Execution time: {format_time_remaining(total_execution_time)}") + LOGGER.error("=" * 80) + return e.exit_code + except Exception as e: + total_execution_time = time.time() - script_start_time + LOGGER.critical("") + LOGGER.critical("=" * 80) + LOGGER.critical("UNEXPECTED ERROR") + LOGGER.critical("=" * 80) + LOGGER.critical(f"Error: {str(e)}") + LOGGER.critical(f"Execution time: {format_time_remaining(total_execution_time)}") + LOGGER.critical("=" * 80) + raise + + +if __name__ == "__main__": + try: + exit_code = main() + sys.exit(exit_code) + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code if e.code is not None else 0) + except KeyboardInterrupt: + LOGGER.info("") + LOGGER.info("=" * 80) + LOGGER.warning("Execution interrupted by user (KeyboardInterrupt)") + LOGGER.info("=" * 80) + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical("") + LOGGER.critical("=" * 80) + LOGGER.critical("UNHANDLED EXCEPTION") + LOGGER.critical("=" * 80) + LOGGER.critical(f"Traceback:\n{traceback_formatted}") + LOGGER.critical("=" * 80) + sys.exit(1) diff --git a/scripts/shared.py b/scripts/shared.py index 541988fc..60ec1467 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -35,6 +35,58 @@ def __init__(self, message, exit_code=None): super().__init__(self.message) +# HTTP status codes that should trigger retries +RETRY_STATUS_FORCELIST = [ + 408, # Request Timeout + 429, # Too Many Requests + 500, # Internal Server Error + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout +] + +# User-Agent string for HTTP requests +USER_AGENT = "QuantifyingTheCommons/1.0 (https://github.com/creativecommons/quantifying)" + +# Hyphenated to CC legal tool identifier mapping +# Except PDM-1.0, follows SPDX identifier. Used by WikiCommons. 
+LICENSE_NORMALIZATION = {
+    "CC-BY-4.0": "CC BY 4.0",
+    "CC-BY-SA-4.0": "CC BY-SA 4.0",
+    "CC-BY-NC-4.0": "CC BY-NC 4.0",
+    "CC-BY-NC-SA-4.0": "CC BY-NC-SA 4.0",
+    "CC-BY-NC-ND-4.0": "CC BY-NC-ND 4.0",
+    "CC-BY-ND-4.0": "CC BY-ND 4.0",
+    "CC-BY-3.0": "CC BY 3.0",
+    "CC-BY-SA-3.0": "CC BY-SA 3.0",
+    "CC-BY-NC-3.0": "CC BY-NC 3.0",
+    "CC-BY-NC-SA-3.0": "CC BY-NC-SA 3.0",
+    "CC-BY-NC-ND-3.0": "CC BY-NC-ND 3.0",
+    "CC-BY-ND-3.0": "CC BY-ND 3.0",
+    "CC-BY-2.5": "CC BY 2.5",
+    "CC-BY-SA-2.5": "CC BY-SA 2.5",
+    "CC-BY-NC-2.5": "CC BY-NC 2.5",
+    "CC-BY-NC-SA-2.5": "CC BY-NC-SA 2.5",
+    "CC-BY-NC-ND-2.5": "CC BY-NC-ND 2.5",
+    "CC-BY-ND-2.5": "CC BY-ND 2.5",
+    "CC-BY-2.0": "CC BY 2.0",
+    "CC-BY-SA-2.0": "CC BY-SA 2.0",
+    "CC-BY-NC-2.0": "CC BY-NC 2.0",
+    "CC-BY-NC-SA-2.0": "CC BY-NC-SA 2.0",
+    "CC-BY-NC-ND-2.0": "CC BY-NC-ND 2.0",
+    "CC-BY-ND-2.0": "CC BY-ND 2.0",
+    "CC-BY-1.0": "CC BY 1.0",
+    "CC-BY-SA-1.0": "CC BY-SA 1.0",
+    "CC-BY-NC-1.0": "CC BY-NC 1.0",
+    "CC-BY-NC-SA-1.0": "CC BY-NC-SA 1.0",
+    "CC-BY-NC-ND-1.0": "CC BY-NC-ND 1.0",
+    "CC-BY-ND-1.0": "CC BY-ND 1.0",
+    "CC0-1.0": "CC0 1.0",
+    "PDM-1.0": "PDM 1.0",
+}


 def get_session(accept_header=None, session=None):
     """
     Create or configure a reusable HTTPS session with retry logic and