 for analysis and comparison between quarters.
 """
 # Standard library
+import argparse
+import csv
 import os
 import sys
+import textwrap
 import traceback
 
-# import pandas as pd
+# Third-party
+import pandas as pd
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
 
 # Add parent directory so shared can be imported
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 
+# Constants
+FILE1_COUNT = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
+FILE2_LANGUAGE = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
+)
+FILE3_COUNTRY = shared.path_join(
+    PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
+)
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Parse command-line options and return the parsed argument namespace.
+    """
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
+    )
+    return parser.parse_args()
+
+
 # def load_quarter_data(quarter):
 #     """
 #     Load data for a specific quarter.
 #     return parser.parse_args()
 
 
-def main():
-    raise shared.QuantifyingException("No current code for Phase 2", 0)
+def data_to_csv(args, data, file_path):
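+    # Saving is opt-in: results are only written when --enable-save is given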
+    if not args.enable_save:
+        return
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    # Emulate csv.unix_dialect
+    data.to_csv(
+        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
+    )
+
+
+def process_top_25_tools(args, count_data):
+    LOGGER.info("Processing top 25 tools")
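+    # Keep only the 25 highest counts and rename to report-friendly headers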
+    data = count_data.sort_values("COUNT", ascending=False)
+    data.reset_index(drop=True, inplace=True)
+    data = data.iloc[:25]
+    data.rename(
+        columns={"TOOL_IDENTIFIER": "CC legal tool", "COUNT": "Count"},
+        inplace=True,
+    )
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_top_25_tools.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_product(args, count_data):
+    LOGGER.info("Processing totals by product")
+    data = {
+        "Licenses version 4.0": 0,
+        "Licenses version 3.0": 0,
+        "Licenses version 2.x": 0,
+        "Licenses version 1.0": 0,
+        "CC0 1.0": 0,
+        "Public Domain Mark 1.0": 0,
+        "Certification 1.0 US": 0,
+    }
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
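+        # Check the public domain tools first so that, for example,
+        # "CC0 1.0" is not misclassified as a version 1.0 license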
+        if tool.startswith("PDM"):
+            key = "Public Domain Mark 1.0"
+        elif "CC0" in tool:
+            key = "CC0 1.0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "Certification 1.0 US"
+        elif "4.0" in tool:
+            key = "Licenses version 4.0"
+        elif "3.0" in tool:
+            key = "Licenses version 3.0"
+        elif "2." in tool:
+            key = "Licenses version 2.x"
+        elif "1.0" in tool:
+            key = "Licenses version 1.0"
+        else:
+            raise shared.QuantifyingException("Invalid TOOL_IDENTIFIER")
+        data[key] += count
+
+    data = pd.DataFrame(
+        data.items(), columns=["CC legal tool product", "Count"]
+    )
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_product.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_unit(args, count_data):
+    LOGGER.info("Processing totals by unit")
+    data = {}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM"):
+            key = "mark"
+        elif "CC0" in tool:
+            key = "cc0"
+        elif "PUBLICDOMAIN" in tool:
+            key = "certification"
+        else:
+            parts = tool.split()
+            key = parts[1].lower()
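+            # The unit is the second whitespace-separated token of the
+            # identifier; fold the alternate "by-nd-nc" ordering into
+            # "by-nc-nd"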
+            if key == "by-nd-nc":
+                key = "by-nc-nd"
+        if key not in data:
+            data[key] = count
+        else:
+            data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Legal Tool Unit", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(PATHS["data_phase"], "gcs_totals_by_unit.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_free_cultural(args, count_data):
+    LOGGER.info("Processing totals by Approved for Free Cultural Works")
+    data = {
+        "Approved for Free Cultural Works": 0,
+        "Limited uses": 0,
+    }
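+    # Public domain tools and license units without NC/ND terms count as
+    # Approved for Free Cultural Works; all other tools are "Limited uses"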
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "Approved for Free Cultural Works"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "Approved for Free Cultural Works"
+            else:
+                key = "Limited uses"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restrictions(args, count_data):
+    LOGGER.info("Processing totals by restriction")
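+    # Levels run from least to most restrictive: 0 = public domain,
+    # 1 = units without NC/ND terms, 2 = NC and Sampling units,
+    # 3 = all remaining units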
+    data = {"level 0": 0, "level 1": 0, "level 2": 0, "level 3": 0}
+    for row in count_data.itertuples(index=False):
+        tool = row[0]
+        count = row[1]
+        if tool.startswith("PDM") or "CC0" in tool or "PUBLICDOMAIN" in tool:
+            key = "level 0"
+        else:
+            parts = tool.split()
+            unit = parts[1].lower()
+            if unit in ["by-sa", "by", "sa", "sampling+"]:
+                key = "level 1"
+            elif unit in ["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"]:
+                key = "level 2"
+            else:
+                key = "level 3"
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    file_path = shared.path_join(
+        PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
+    )
+    data_to_csv(args, data, file_path)
+
 
-    # # Fetch and merge changes
-    # shared.fetch_and_merge(PATHS["repo"])
+def main():
+    args = parse_arguments()
+    shared.log_paths(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    # Count data
+    count_data = pd.read_csv(FILE1_COUNT, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    process_top_25_tools(args, count_data)
+    process_totals_by_product(args, count_data)
+    process_totals_by_unit(args, count_data)
+    process_totals_by_free_cultural(args, count_data)
+    process_totals_by_restrictions(args, count_data)
+
+    # # Language data
+    # language_data = pd.read_csv(
+    #     FILE2_LANGUAGE, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    # )
 
-    # # Add and commit changes
-    # shared.add_and_commit(
-    #     PATHS["repo"], PATHS["data_quarter"], "Fetched and updated new data"
+    # # Country data
+    # country_data = pd.read_csv(
+    #     FILE3_COUNTRY, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     # )
 
-    # # Push changes
-    # shared.push_changes(PATHS["repo"])
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Google Custom Search (GCS) data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
 
 
 if __name__ == "__main__":
@@ -188,5 +390,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
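+        # Colorize the traceback for terminal output and indent it for
+        # readability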
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)