 #!/usr/bin/env python
 """
-This file is dedicated to querying data from the GitHub API.
+Fetch CC Legal Tool usage from the GitHub API.
 """
 
 # Standard library
 import argparse
 import csv
 import os
 import sys
+import textwrap
 import traceback
+import urllib.parse
 
 # Third-party
 import requests
-import yaml
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 
-# Log the start of the script execution
-LOGGER.info("Script execution started.")
+# Constants
+FILE1_COUNT = os.path.join(PATHS["data_phase"], "github_1_count.csv")
+GITHUB_RETRY_STATUS_FORCELIST = [
+    408,  # Request Timeout
+    422,  # Unprocessable Content
+    # (Validation failed, or the endpoint has been spammed)
+    429,  # Too Many Requests
+    500,  # Internal Server Error
+    502,  # Bad Gateway
+    503,  # Service Unavailable
+    504,  # Gateway Timeout
+]
+# Also see: https://en.wikipedia.org/wiki/Public-domain-equivalent_license
+GITHUB_TOOLS = [
+    {"TOOL_IDENTIFIER": "BSD Zero Clause License", "SPDX_IDENTIFIER": "0BSD"},
+    {"TOOL_IDENTIFIER": "CC0 1.0", "SPDX_IDENTIFIER": "CC0-1.0"},
+    {"TOOL_IDENTIFIER": "CC BY 4.0", "SPDX_IDENTIFIER": "CC-BY-4.0"},
+    {"TOOL_IDENTIFIER": "CC BY-SA 4.0", "SPDX_IDENTIFIER": "CC-BY-SA-4.0"},
+    {"TOOL_IDENTIFIER": "MIT No Attribution", "SPDX_IDENTIFIER": "MIT-0"},
+    {"TOOL_IDENTIFIER": "Unlicense", "SPDX_IDENTIFIER": "Unlicense"},
+    {"TOOL_IDENTIFIER": "Total public repositories", "SPDX_IDENTIFIER": "N/A"},
+]
+HEADER1_COUNT = ["TOOL_IDENTIFIER", "SPDX_IDENTIFIER", "COUNT"]
+QUARTER = os.path.basename(PATHS["data_quarter"])
 
 
 def parse_arguments():
     """
-    Parses command-line arguments, returns parsed arguments.
+    Parse command-line options and return the parsed argument namespace.
     """
-    LOGGER.info("Parsing command-line arguments")
-    parser = argparse.ArgumentParser(description="GitHub Data Fetching Script")
+    LOGGER.info("Parsing command-line options")
+    parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
-        "--licenses", type=int, default=3, help="Number of licenses to query"
+        "--enable-save",
+        action="store_true",
+        help="Enable saving results",
+    )
+    parser.add_argument(
+        "--enable-git",
+        action="store_true",
+        help="Enable git actions (fetch, merge, add, commit, and push)",
     )
     return parser.parse_args()
 
 
-def set_up_data_file():
-    """
-    Sets up the data file for recording results.
-    """
-    LOGGER.info("Setting up the data file for recording results.")
-    header = "LICENSE_TYPE,Repository Count\n"
-    with open(
-        os.path.join(PATHS["data_phase"], "github_fetched.csv"), "w"
-    ) as f:
-        f.write(header)
-
-
-def get_response_elems(license_type):
-    """
-    Provides the metadata for a query of
-    specified license type from GitHub API.
-
-    Args:
-        license_type: A string representing the type of license.
-    Returns:
-        dict: A dictionary mapping metadata
-        to its value provided from the API query.
-    """
-    LOGGER.info(f"Querying metadata for license: {license_type}")
+def check_for_completion():
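+    """
+    Raise QuantifyingException with an exit code of 0 if the count data
+    file already has a row for every entry in GITHUB_TOOLS (the fetch
+    for this quarter is already complete).
+    """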
     try:
-        base_url = "https://api.github.com/search/repositories?q=license:"
-        request_url = f"{base_url}{license_type}"
-        max_retries = Retry(
-            total=5,
-            backoff_factor=10,
-            status_forcelist=[403, 408, 429, 500, 502, 503, 504],
-        )
-        session = requests.Session()
-        session.mount("https://", HTTPAdapter(max_retries=max_retries))
-        with session.get(request_url) as response:
-            response.raise_for_status()
-            search_data = response.json()
-        return {"totalResults": search_data["total_count"]}
-    except requests.HTTPError as e:
-        LOGGER.error(f"HTTP Error: {e}")
-        raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
-    except requests.RequestException as e:
-        LOGGER.error(f"Request Exception: {e}")
-        raise shared.QuantifyingException(f"Request Exception: {e}", 1)
-    except KeyError as e:
-        LOGGER.error(f"KeyError: {e}.")
-        raise shared.QuantifyingException(f"KeyError: {e}", 1)
-
-
-def retrieve_license_data(args):
-    """
-    Retrieves the data of all license types specified.
-    """
-    LOGGER.info("Retrieving the data for all license types.")
-    licenses = ["CC0-1.0", "CC-BY-4.0", "CC-BY-SA-4.0"][: args.licenses]
-
-    data = []
-    total_repos_retrieved = 0
-
-    for license_type in licenses:
-        data_dict = get_response_elems(license_type)
-        total_repos_retrieved += data_dict["totalResults"]
-        record_results(license_type, data_dict)
+        with open(FILE1_COUNT, "r", newline="") as file_obj:
+            reader = csv.DictReader(file_obj, dialect="unix")
+            if len(list(reader)) == len(GITHUB_TOOLS):
+                raise shared.QuantifyingException(
+                    f"Data fetch completed for {QUARTER}", 0
+                )
+    except FileNotFoundError:
+        pass  # File may not be found without --enable-save, etc.
+
+
+def get_requests_session():
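+    """
+    Return a requests session that retries transient GitHub API failures
+    (GITHUB_RETRY_STATUS_FORCELIST) with exponential backoff and asks for
+    GitHub's JSON media type.
+    """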
+    max_retries = Retry(
+        total=5,
+        backoff_factor=10,
+        status_forcelist=GITHUB_RETRY_STATUS_FORCELIST,
+    )
+    session = requests.Session()
+    session.mount("https://", HTTPAdapter(max_retries=max_retries))
+    session.headers.update({"Accept": "application/vnd.github+json"})
 
-    for row in data:
-        LOGGER.info(f"Collected data row: {row}")
+    return session
 
-    return data
 
+def write_data(args, tool_data):
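+    """
+    Write the per-tool repository counts to the count data file, unless
+    saving is disabled or not all records were fetched.
+    """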
+    if not args.enable_save:
+        return args
 
-def record_results(license_type, data):
-    """
-    Records the data for a specific license type into the CSV file.
-    """
-    LOGGER.info(f"Recording data for license: {license_type}")
-    row = [license_type, data["totalResults"]]
-    with open(
-        os.path.join(PATHS["data_phase"], "github_fetched.csv"),
-        "a",
-        newline="",
-    ) as f:
-        writer = csv.writer(f, dialect="unix")
-        writer.writerow(row)
-
-
-def load_state():
-    """
-    Loads the state from a YAML file, returns the last recorded state.
-    """
-    if os.path.exists(PATHS["state"]):
-        with open(PATHS["state"], "r") as f:
-            return yaml.safe_load(f)
-    return {"total_records_retrieved (github)": 0}
+    # Create data directory for this phase
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
 
+    if len(tool_data) < len(GITHUB_TOOLS):
+        LOGGER.error("Unable to fetch all records. Aborting.")
+        return args
 
-def save_state(state: dict):
-    """
-    Saves the state to a YAML file.
-    Parameters:
-        state_file: Path to the state file.
-        state: The state dictionary to save.
-    """
-    with open(PATHS["state"], "w") as f:
-        yaml.safe_dump(state, f)
+    with open(FILE1_COUNT, "w", newline="") as file_obj:
+        writer = csv.DictWriter(
+            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
+        )
+        writer.writeheader()
+        for row in tool_data:
+            writer.writerow(row)
+    return args
+
+
+def query_github(args, session):
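+    """
+    Query the GitHub search API for the number of public repositories
+    matching each legal tool and return the collected rows.
+    """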
+    tool_data = []
+    for tool in GITHUB_TOOLS:
+        tool_identifier = tool["TOOL_IDENTIFIER"]
+        spdx_identifier = tool["SPDX_IDENTIFIER"]
+        LOGGER.info(f"Query: tool: {tool_identifier}, spdx: {spdx_identifier}")
+
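+        # Only the total_count field is used, so request as few results
+        # as possible (per_page=1)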
+        base_url = "https://api.github.com/search/repositories?per_page=1&q="
+        search_parameters = "is:public"
+        if tool_identifier != "Total public repositories":
+            search_parameters = (
+                f"{search_parameters} license:{spdx_identifier.lower()}"
+            )
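+        # Percent-encode the query, leaving ":" (and "/") unescaped so
+        # search qualifiers like "is:public" stay intact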
+        search_parameters = urllib.parse.quote(search_parameters, safe=":/")
+        request_url = f"{base_url}{search_parameters}"
+
+        try:
+            with session.get(request_url) as response:
+                response.raise_for_status()
+                search_data = response.json()
+            count = search_data["total_count"]
+            tool_data.append(
+                {
+                    "TOOL_IDENTIFIER": tool_identifier,
+                    "SPDX_IDENTIFIER": spdx_identifier,
+                    "COUNT": count,
+                }
+            )
+            LOGGER.info(f"count: {count}")
+        except requests.HTTPError as e:
+            LOGGER.error(f"HTTP Error: {e}")
+            raise shared.QuantifyingException(f"HTTP Error: {e}", 1)
+        except requests.RequestException as e:
+            LOGGER.error(f"Request Exception: {e}")
+            raise shared.QuantifyingException(f"Request Exception: {e}", 1)
+        except KeyError as e:
+            LOGGER.error(f"KeyError: {e}.")
+            raise shared.QuantifyingException(f"KeyError: {e}", 1)
+    return tool_data
 
 
 def main():
-
-    # Fetch and merge changes
-    shared.fetch_and_merge(PATHS["repo"])
-
     args = parse_arguments()
-
-    state = load_state()
-    total_records_retrieved = state["total_records_retrieved (github)"]
-    LOGGER.info(f"Initial total_records_retrieved: {total_records_retrieved}")
-    goal_records = 1000  # Set goal number of records
-
-    if total_records_retrieved >= goal_records:
-        LOGGER.info(
-            f"Goal of {goal_records} records already achieved."
-            " No further action required."
-        )
-        return
-
-    # Log the paths being used
     shared.log_paths(LOGGER, PATHS)
-
-    # Create data directory for this phase
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-
-    if total_records_retrieved == 0:
-        set_up_data_file()
-
-    # Retrieve and record data
-    repos_retrieved = retrieve_license_data(args)
-
-    # Update the state with the new count of retrieved records
-    total_records_retrieved += repos_retrieved
-    LOGGER.info(
-        f"Total records retrieved after fetching: {total_records_retrieved}"
+    check_for_completion()
+    session = get_requests_session()
+    tool_data = query_github(args, session)
+    args = write_data(args, tool_data)
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new GitHub data for {QUARTER}",
     )
-    state["total_records_retrieved (github)"] = total_records_retrieved
-    save_state(state)
-
-    # Add and commit changes
-    shared.add_and_commit(
-        PATHS["repo"], PATHS["data_quarter"], "Add and commit GitHub data"
-    )
-
-    # Push changes
-    shared.push_changes(PATHS["repo"])
+    shared.git_push_changes(args, PATHS["repo"])
 
 
 if __name__ == "__main__":
@@ -211,5 +193,13 @@ def main():
         LOGGER.info("(130) Halted via KeyboardInterrupt.")
         sys.exit(130)
     except Exception:
-        LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}")
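+        # Syntax-highlight the traceback for terminal display and indent
+        # it so it stands out in the log output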
+        traceback_formatted = textwrap.indent(
+            highlight(
+                traceback.format_exc(),
+                PythonTracebackLexer(),
+                TerminalFormatter(),
+            ),
+            "    ",
+        )
+        LOGGER.exception(f"(1) Unhandled exception:\n{traceback_formatted}")
         sys.exit(1)