4141# CSV headers
4242HEADER1 = ["LICENSE" , "COUNT" ]
4343HEADER2 = ["LICENSE" , "LANGUAGE" , "COUNT" ]
44-
44+ LIMIT_DEFAULT = 100000
4545QUARTER = os .path .basename (PATHS ["data_quarter" ])
4646
4747ISO639_CACHE = {}
5050def parse_arguments ():
5151 LOGGER .info ("Parsing command-line options" )
5252 parser = argparse .ArgumentParser (description = __doc__ )
53+ parser .add_argument (
54+ "--limit" ,
55+ type = int ,
56+ default = LIMIT_DEFAULT ,
57+ help = f"Limit total results (default: { LIMIT_DEFAULT } )" ,
58+ )
5359 parser .add_argument (
5460 "--enable-save" , action = "store_true" , help = "Enable saving results"
5561 )
@@ -301,49 +307,40 @@ def query_internet_archive(args):
301307 unmapped_licenseurl_counter = Counter ()
302308 unmapped_language_counter = Counter ()
303309
304- fields = ["licenseurl" , "language" ]
305- query = "licenseurl:*creativecommons.org*"
306310 license_mapping = load_license_mapping ()
307311
308- rows = 1000000
309- total_rows = 0
310- total_processed = 0
311-
312312 session = shared .get_session (
313313 accept_header = "application/json" , session = ArchiveSession ()
314314 )
315- while True :
316- # Loop until no more results are returned by the API
317- LOGGER .info (f"Fetching { rows } items starting at { total_rows } ..." )
318-
319- # Use search_items for simpler pagination management
320- response = session .search_items (
321- query ,
322- fields = fields ,
323- params = {"rows" : rows , "start" : total_rows },
324- request_kwargs = {"timeout" : 30 },
325- )
326315
327- # Convert to list to iterate over
328- results = list (response )
329- total_rows += len (results )
330-
331- if not results :
332- LOGGER .info ("No more results. Ending pagination." )
333- break
334-
335- for result in results :
336- # Extract and normalize license URL
337- licenseurl = result .get ("licenseurl" , "" )
338- if isinstance (licenseurl , list ):
339- licenseurl = licenseurl [0 ] if licenseurl else "UNKNOWN"
340- if not licenseurl :
341- licenseurl = "UNKNOWN"
316+ LOGGER .info ("Beginning fetch." )
317+ # Use search_items for simpler pagination management
318+ response = session .search_items (
319+ query = "licenseurl:*creativecommons.org*" ,
320+ fields = ["licenseurl" , "language" ],
321+ params = {"count" : 10000 },
322+ request_kwargs = {"timeout" : 30 },
323+ )
324+ found = response .num_found
325+ LOGGER .info (
326+ f"Found { found :,} results. Processing a maximum of" f" { args .limit :,} ."
327+ )
342328
343- normalized_url = normalize_license (licenseurl , license_mapping )
344- if normalized_url == "UNKNOWN" :
345- unmapped_licenseurl_counter [licenseurl ] += 1
346- continue # Skip
329+ total_processed = 0
330+ for result in response :
331+ if result .get ("error" ):
332+ raise shared .QuantifyingException (result .get ("error" ), 1 )
333+ # Extract and normalize license URL
334+ licenseurl = result .get ("licenseurl" , "" )
335+ if isinstance (licenseurl , list ):
336+ licenseurl = licenseurl [0 ] if licenseurl else "UNKNOWN"
337+ if not licenseurl :
338+ licenseurl = "UNKNOWN"
339+ normalized_url = normalize_license (licenseurl , license_mapping )
340+ if normalized_url == "UNKNOWN" :
341+ unmapped_licenseurl_counter [licenseurl ] += 1
342+ else :
343+ license_counter [(normalized_url )] += 1
347344
348345 # Extract and normalize language
349346 raw_language = result .get ("language" , "Undetermined" )
@@ -356,46 +353,41 @@ def query_internet_archive(args):
356353 if normalized_lang == "Undetermined" :
357354 unmapped_language_counter [raw_language ] += 1
358355
359- license_counter [(normalized_url )] += 1
360356 language_counter [(normalized_url , normalized_lang )] += 1
361- total_processed += 1
362-
363- LOGGER .info (
364- f"Processed { len (results )} new items, total: { total_processed } "
365- )
366- LOGGER .info (f"Total items processed so far: { total_processed } " )
367- LOGGER .info (
368- f"Unique licenses: { len (license_counter )} |"
369- f"Languages:{ len (language_counter )} "
370- )
371-
372- # If the results is less than the requested rows, implies the end
373- if len (results ) < rows :
357+ total_processed += 1
358+ if not total_processed % 10000 :
374359 LOGGER .info (
375- "Fewer results returned than requested. Pagination complete."
360+ f"Processed { total_processed :,} items so far:"
361+ f" { len (license_counter ):,} unique legal tools,"
362+ f" { len (language_counter ):,} unique languages."
376363 )
364+ if total_processed >= args .limit :
365+ LOGGER .warning ("Aborting fetch. Limit reached." )
377366 break
378367
379368 LOGGER .info (
380- "Finished processing. \n "
381- "Number of unmapped licenses: "
382- f"{ sum ( unmapped_licenseurl_counter . values ()) } "
369+ f "Finished fetching { total_processed :, } items: "
370+ f" { len ( license_counter ):, } unique legal tools, "
371+ f" { len ( language_counter ):, } unique languages. "
383372 )
384373
385- # Log unmapped languages once at the end
386374 if unmapped_licenseurl_counter :
375+ LOGGER .warning (
376+ "Number of unmapped legal tools: "
377+ f"{ sum (unmapped_licenseurl_counter .values ()):,} "
378+ )
387379 for license , count in unmapped_licenseurl_counter .items ():
388- LOGGER .warning (f"Unmapped llicense : { license } : { count } " )
380+ LOGGER .warning (f" Unmapped legal tools : { license } : { count :, } " )
389381
390- LOGGER .info (
391- "\n Number of unmapped languages: "
392- f"{ sum (unmapped_language_counter .values ())} "
393- )
394382 if unmapped_language_counter :
383+ LOGGER .warning (
384+ "Number of unmapped languages: "
385+ f"{ sum (unmapped_language_counter .values ()):,} "
386+ )
395387 for lang , count in unmapped_language_counter .items ():
396388 cleaned = normalize_key (strip_noise (lang ))
397389 LOGGER .warning (
398- f"Unmapped language: { lang } (cleaned: { cleaned } ): { count } "
390+ f" Unmapped language: { lang } (cleaned: { cleaned } ): { count :, } "
399391 )
400392
401393 return license_counter , language_counter
0 commit comments