Skip to content

Commit 1175466

Browse files
committed
refactor fetch to use Scraping API and add --limit option
1 parent e402be3 commit 1175466

File tree

1 file changed

+54
-62
lines changed

1 file changed

+54
-62
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 54 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
# CSV headers
4242
HEADER1 = ["LICENSE", "COUNT"]
4343
HEADER2 = ["LICENSE", "LANGUAGE", "COUNT"]
44-
44+
LIMIT_DEFAULT = 100000
4545
QUARTER = os.path.basename(PATHS["data_quarter"])
4646

4747
ISO639_CACHE = {}
@@ -50,6 +50,12 @@
5050
def parse_arguments():
5151
LOGGER.info("Parsing command-line options")
5252
parser = argparse.ArgumentParser(description=__doc__)
53+
parser.add_argument(
54+
"--limit",
55+
type=int,
56+
default=LIMIT_DEFAULT,
57+
help=f"Limit total results (default: {LIMIT_DEFAULT})",
58+
)
5359
parser.add_argument(
5460
"--enable-save", action="store_true", help="Enable saving results"
5561
)
@@ -301,49 +307,40 @@ def query_internet_archive(args):
301307
unmapped_licenseurl_counter = Counter()
302308
unmapped_language_counter = Counter()
303309

304-
fields = ["licenseurl", "language"]
305-
query = "licenseurl:*creativecommons.org*"
306310
license_mapping = load_license_mapping()
307311

308-
rows = 1000000
309-
total_rows = 0
310-
total_processed = 0
311-
312312
session = shared.get_session(
313313
accept_header="application/json", session=ArchiveSession()
314314
)
315-
while True:
316-
# Loop until no more results are returned by the API
317-
LOGGER.info(f"Fetching {rows} items starting at {total_rows}...")
318-
319-
# Use search_items for simpler pagination management
320-
response = session.search_items(
321-
query,
322-
fields=fields,
323-
params={"rows": rows, "start": total_rows},
324-
request_kwargs={"timeout": 30},
325-
)
326315

327-
# Convert to list to iterate over
328-
results = list(response)
329-
total_rows += len(results)
330-
331-
if not results:
332-
LOGGER.info("No more results. Ending pagination.")
333-
break
334-
335-
for result in results:
336-
# Extract and normalize license URL
337-
licenseurl = result.get("licenseurl", "")
338-
if isinstance(licenseurl, list):
339-
licenseurl = licenseurl[0] if licenseurl else "UNKNOWN"
340-
if not licenseurl:
341-
licenseurl = "UNKNOWN"
316+
LOGGER.info("Beginning fetch.")
317+
# Use search_items for simpler pagination management
318+
response = session.search_items(
319+
query="licenseurl:*creativecommons.org*",
320+
fields=["licenseurl", "language"],
321+
params={"count": 10000},
322+
request_kwargs={"timeout": 30},
323+
)
324+
found = response.num_found
325+
LOGGER.info(
326+
f"Found {found:,} results. Processing a maximum of" f" {args.limit:,}."
327+
)
342328

343-
normalized_url = normalize_license(licenseurl, license_mapping)
344-
if normalized_url == "UNKNOWN":
345-
unmapped_licenseurl_counter[licenseurl] += 1
346-
continue # Skip
329+
total_processed = 0
330+
for result in response:
331+
if result.get("error"):
332+
raise shared.QuantifyingException(result.get("error"), 1)
333+
# Extract and normalize license URL
334+
licenseurl = result.get("licenseurl", "")
335+
if isinstance(licenseurl, list):
336+
licenseurl = licenseurl[0] if licenseurl else "UNKNOWN"
337+
if not licenseurl:
338+
licenseurl = "UNKNOWN"
339+
normalized_url = normalize_license(licenseurl, license_mapping)
340+
if normalized_url == "UNKNOWN":
341+
unmapped_licenseurl_counter[licenseurl] += 1
342+
else:
343+
license_counter[(normalized_url)] += 1
347344

348345
# Extract and normalize language
349346
raw_language = result.get("language", "Undetermined")
@@ -356,46 +353,41 @@ def query_internet_archive(args):
356353
if normalized_lang == "Undetermined":
357354
unmapped_language_counter[raw_language] += 1
358355

359-
license_counter[(normalized_url)] += 1
360356
language_counter[(normalized_url, normalized_lang)] += 1
361-
total_processed += 1
362-
363-
LOGGER.info(
364-
f"Processed {len(results)} new items, total: {total_processed}"
365-
)
366-
LOGGER.info(f"Total items processed so far: {total_processed}")
367-
LOGGER.info(
368-
f"Unique licenses: {len(license_counter)}|"
369-
f"Languages:{len(language_counter)}"
370-
)
371-
372-
# If the results is less than the requested rows, implies the end
373-
if len(results) < rows:
357+
total_processed += 1
358+
if not total_processed % 10000:
374359
LOGGER.info(
375-
"Fewer results returned than requested. Pagination complete."
360+
f"Processed {total_processed:,} items so far:"
361+
f" {len(license_counter):,} unique legal tools,"
362+
f" {len(language_counter):,} unique languages."
376363
)
364+
if total_processed >= args.limit:
365+
LOGGER.warning("Aborting fetch. Limit reached.")
377366
break
378367

379368
LOGGER.info(
380-
"Finished processing.\n"
381-
"Number of unmapped licenses: "
382-
f"{sum(unmapped_licenseurl_counter.values())}"
369+
f"Finished fetching {total_processed:,} items:"
370+
f" {len(license_counter):,} unique legal tools,"
371+
f" {len(language_counter):,} unique languages."
383372
)
384373

385-
# Log unmapped languages once at the end
386374
if unmapped_licenseurl_counter:
375+
LOGGER.warning(
376+
"Number of unmapped legal tools: "
377+
f"{sum(unmapped_licenseurl_counter.values()):,}"
378+
)
387379
for license, count in unmapped_licenseurl_counter.items():
388-
LOGGER.warning(f"Unmapped llicense: {license} : {count}")
380+
LOGGER.warning(f" Unmapped legal tools: {license}: {count:,}")
389381

390-
LOGGER.info(
391-
"\n Number of unmapped languages: "
392-
f"{sum(unmapped_language_counter.values())}"
393-
)
394382
if unmapped_language_counter:
383+
LOGGER.warning(
384+
"Number of unmapped languages: "
385+
f"{sum(unmapped_language_counter.values()):,}"
386+
)
395387
for lang, count in unmapped_language_counter.items():
396388
cleaned = normalize_key(strip_noise(lang))
397389
LOGGER.warning(
398-
f"Unmapped language: {lang} (cleaned: {cleaned}): {count}"
390+
f" Unmapped language: {lang} (cleaned: {cleaned}): {count:,}"
399391
)
400392

401393
return license_counter, language_counter

0 commit comments

Comments
 (0)