1212import time
1313import traceback
1414import urllib .parse
15+ from copy import copy
1516
1617# Third-party
1718import googleapiclient .discovery
3435load_dotenv (PATHS ["dotenv" ])
3536
3637# Constants
37- DEVELOPER_KEY = os .getenv ("GCS_DEVELOPER_KEY" )
38- CX = os .getenv ("GCS_CX" )
3938BASE_URL = "https://www.googleapis.com/customsearch/v1"
4039FILE1_COUNT = os .path .join (PATHS ["data_phase" ], "gcs_1_count.csv" )
4140FILE2_LANGUAGE = os .path .join (
4241 PATHS ["data_phase" ], "gcs_2_count_by_language.csv"
4342)
4443FILE3_COUNTRY = os .path .join (PATHS ["data_phase" ], "gcs_3_count_by_country.csv" )
44+ GCS_CX = os .getenv ("GCS_CX" )
45+ GCS_DEVELOPER_KEY = os .getenv ("GCS_DEVELOPER_KEY" )
4546HEADER1_COUNT = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "COUNT" ]
4647HEADER2_LANGUAGE = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "LANGUAGE" , "COUNT" ]
4748HEADER3_COUNTRY = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "COUNTRY" , "COUNT" ]
@@ -87,7 +88,11 @@ def get_search_service():
8788 """
8889 LOGGER .info ("Getting Google Custom Search API Service." )
8990 return googleapiclient .discovery .build (
90- "customsearch" , "v1" , developerKey = DEVELOPER_KEY , cache_discovery = False
91+ "customsearch" ,
92+ "v1" ,
93+ developerKey = GCS_DEVELOPER_KEY ,
94+ cache_discovery = False ,
95+ num_retries = 5 ,
9196 )
9297
9398
@@ -184,21 +189,15 @@ def query_gcs(args, service, last_completed_plan_index, plan):
184189
185190 max_tries = 5
186191 initial_delay = 1 # in seconds
192+ rate_delay = copy (initial_delay ) # query gently
187193 start = last_completed_plan_index + 1
188194 stop = start + args .limit
189195
190196 for plan_row in plan [start :stop ]: # noqa: E203
191197 index = plan .index (plan_row )
192198 query_info = f"index: { index } , tool: { plan_row ['TOOL_IDENTIFIER' ]} "
193199 encoded_tool_url = urllib .parse .quote (plan_row ["TOOL_URL" ], safe = ":/" )
194- query_params = {
195- "cx" : CX ,
196- # "num": records_per_query,
197- # "start": start_index,
198- # "cr": cr,
199- # "lr": lr,
200- "q" : encoded_tool_url ,
201- }
200+ query_params = {"cx" : GCS_CX , "q" : encoded_tool_url }
202201 if plan_row ["COUNTRY" ]:
203202 query_info = f"{ query_info } , country: { plan_row ['COUNTRY' ]} "
204203 query_params ["cr" ] = plan_row ["CR" ]
@@ -222,6 +221,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
222221 results .get ("searchInformation" , {}).get ("totalResults" , 0 )
223222 )
224223 success = True
224+ time .sleep (rate_delay )
225225 break # no need to try again
226226
227227 except HttpError as e :
0 commit comments