11#!/usr/bin/env python
22"""
3- This file is dedicated to processing Google Custom Search data
4- for analysis and comparison between quarters.
3+ Process Google Custom Search (GCS) data.
54"""
65# Standard library
76import argparse
@@ -205,7 +204,7 @@ def data_to_csv(args, data, file_path):
205204
206205
207206def process_top_25_tools (args , count_data ):
208- LOGGER .info ("Processing top 25 tools" )
207+ LOGGER .info ("Processing count data: top 25 tools" )
209208 data = count_data .sort_values ("COUNT" , ascending = False )
210209 data .reset_index (drop = True , inplace = True )
211210 data = data .iloc [:25 ]
@@ -218,7 +217,7 @@ def process_top_25_tools(args, count_data):
218217
219218
220219def process_totals_by_product (args , count_data ):
221- LOGGER .info ("Processing totals by product" )
220+ LOGGER .info ("Processing count data: totals by product" )
222221 data = {
223222 "Licenses version 4.0" : 0 ,
224223 "Licenses version 3.0" : 0 ,
@@ -259,7 +258,7 @@ def process_totals_by_product(args, count_data):
259258
260259
261260def process_totals_by_unit (args , count_data ):
262- LOGGER .info ("Processing totals by unit" )
261+ LOGGER .info ("Processing count data: totals by unit" )
263262 data = {}
264263 for row in count_data .itertuples (index = False ):
265264 tool = row [0 ]
@@ -287,11 +286,14 @@ def process_totals_by_unit(args, count_data):
287286 data_to_csv (args , data , file_path )
288287
289288
289+ # https://creativecommons.org/public-domain/freeworks/
290290def process_totals_by_free_cultural (args , count_data ):
291- LOGGER .info ("Processing totals by Approved for Free Cultural Works" )
291+ LOGGER .info (
292+ "Processing count data: totals by Approved for Free Cultural Works"
293+ )
292294 data = {
293295 "Approved for Free Cultural Works" : 0 ,
294- "Limited uses " : 0 ,
296+ "Limited use " : 0 ,
295297 }
296298 for row in count_data .itertuples (index = False ):
297299 tool = row [0 ]
@@ -304,7 +306,7 @@ def process_totals_by_free_cultural(args, count_data):
304306 if unit in ["by-sa" , "by" , "sa" , "sampling+" ]:
305307 key = "Approved for Free Cultural Works"
306308 else :
307- key = "Limited uses "
309+ key = "Limited use "
308310 data [key ] += count
309311
310312 data = pd .DataFrame (data .items (), columns = ["Category" , "Count" ])
@@ -317,22 +319,27 @@ def process_totals_by_free_cultural(args, count_data):
317319
318320
319321def process_totals_by_restrictions (args , count_data ):
320- LOGGER .info ("Processing totals by restriction" )
321- data = {"level 0" : 0 , "level 1" : 0 , "level 2" : 0 , "level 3" : 0 }
322+ LOGGER .info ("Processing count data: totals by restriction" )
323+ data = {
324+ "level 0 - unrestricted" : 0 ,
325+ "level 1 - few restrictions" : 0 ,
326+ "level 2 - some restrictions" : 0 ,
327+ "level 3 - many restrictions" : 0 ,
328+ }
322329 for row in count_data .itertuples (index = False ):
323330 tool = row [0 ]
324331 count = row [1 ]
325332 if tool .startswith ("PDM" ) or "CC0" in tool or "PUBLICDOMAIN" in tool :
326- key = "level 0"
333+ key = "level 0 - unrestricted "
327334 else :
328335 parts = tool .split ()
329336 unit = parts [1 ].lower ()
330337 if unit in ["by-sa" , "by" , "sa" , "sampling+" ]:
331- key = "level 1"
338+ key = "level 1 - few restrictions "
332339 elif unit in ["by-nc" , "by-nc-sa" , "sampling" , "nc" , "nc-sa" ]:
333- key = "level 2"
340+ key = "level 2 - some restrictions "
334341 else :
335- key = "level 3"
342+ key = "level 3 - many restrictions "
336343 data [key ] += count
337344
338345 data = pd .DataFrame (data .items (), columns = ["Category" , "Count" ])
@@ -342,6 +349,64 @@ def process_totals_by_restrictions(args, count_data):
342349 data_to_csv (args , data , file_path )
343350
344351
def process_totals_by_langauage(args, data):
    """
    Aggregate GCS counts per language and write the totals to CSV.

    Groups the language data on the LANGUAGE column, sums COUNT per
    group, orders the rows from highest to lowest count, renames the
    columns for presentation, and saves the result via data_to_csv().

    NOTE(review): "langauage" is a misspelling of "language" in both
    the function name and the output filename; fix it in coordination
    with the caller in main() and any consumers of the CSV file.
    """
    LOGGER.info("Processing language data: totals by language")
    totals = (
        data.groupby(["LANGUAGE"], as_index=False)["COUNT"]
        .sum()
        .sort_values("COUNT", ascending=False)
        .reset_index(drop=True)
        .rename(columns={"LANGUAGE": "Language", "COUNT": "Count"})
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_langauage.csv"
    )
    data_to_csv(args, totals, file_path)
369+
def process_totals_by_country(args, data):
    """
    Aggregate GCS counts per country and write the totals to CSV.

    Groups the country data on the COUNTRY column, sums COUNT per
    group, orders the rows from highest to lowest count, renames the
    columns for presentation, and saves the result via data_to_csv().
    """
    LOGGER.info("Processing country data: totals by country")
    grouped = data.groupby(["COUNTRY"], as_index=False)["COUNT"].sum()
    grouped = grouped.sort_values("COUNT", ascending=False)
    grouped = grouped.reset_index(drop=True)
    grouped = grouped.rename(
        columns={"COUNTRY": "Country", "COUNT": "Count"}
    )
    file_path = shared.path_join(
        PATHS["data_phase"], "gcs_totals_by_country.csv"
    )
    data_to_csv(args, grouped, file_path)
386+
387+
388+ # Data is already limited to licenses 4.0, CC0, and PDM
389+ #
390+ # def process_license_40_totals_by_langauage(args, data):
391+ # LOGGER.info("Processing language data: top 25 languages")
392+ # data = data[data["TOOL_IDENTIFIER"].str.contains("CC BY")]
393+ # data = data[data["TOOL_IDENTIFIER"].str.contains("4.0")]
394+ # data = data.groupby(["LANGUAGE"], as_index=False)['COUNT'].sum()
395+ # data = data.sort_values("COUNT", ascending=False)
396+ # data.reset_index(drop=True, inplace=True)
397+ # data.rename(
398+ # columns={
399+ # "LANGUAGE": "Language",
400+ # "COUNT": "Count",
401+ # },
402+ # inplace=True,
403+ # )
404+ # file_path = shared.path_join(
405+ # PATHS["data_phase"], "gcs_license_40_totals_by_langauage.csv"
406+ # )
407+ # data_to_csv(args, data, file_path)
408+
409+
345410def main ():
346411 args = parse_arguments ()
347412 shared .log_paths (LOGGER , PATHS )
@@ -355,15 +420,18 @@ def main():
355420 process_totals_by_free_cultural (args , count_data )
356421 process_totals_by_restrictions (args , count_data )
357422
358- # # Langauge data
359- # langauge_data = pd.read_csv(
360- # FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
361- # )
423+ # Language data
424+ language_data = pd .read_csv (
425+ FILE2_LANGUAGE , usecols = ["TOOL_IDENTIFIER" , "LANGUAGE" , "COUNT" ]
426+ )
427+ process_totals_by_langauage (args , language_data )
428+ # process_license_40_totals_by_langauage(args, language_data)
362429
363- # # Country data
364- # country_data = pd.read_csv(
365- # FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
366- # )
430+ # Country data
431+ country_data = pd .read_csv (
432+ FILE3_COUNTRY , usecols = ["TOOL_IDENTIFIER" , "COUNTRY" , "COUNT" ]
433+ )
434+ process_totals_by_country (args , country_data )
367435
368436 args = shared .git_add_and_commit (
369437 args ,
0 commit comments