24 changes: 12 additions & 12 deletions scripts/1-fetch/gcs_fetch.py
@@ -12,6 +12,7 @@
 import time
 import traceback
 import urllib.parse
+from copy import copy
 
 # Third-party
 import googleapiclient.discovery
@@ -34,14 +35,14 @@
 load_dotenv(PATHS["dotenv"])
 
 # Constants
-DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")
-CX = os.getenv("GCS_CX")
 BASE_URL = "https://www.googleapis.com/customsearch/v1"
 FILE1_COUNT = os.path.join(PATHS["data_phase"], "gcs_1_count.csv")
 FILE2_LANGUAGE = os.path.join(
     PATHS["data_phase"], "gcs_2_count_by_language.csv"
 )
 FILE3_COUNTRY = os.path.join(PATHS["data_phase"], "gcs_3_count_by_country.csv")
+GCS_CX = os.getenv("GCS_CX")
+GCS_DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")
 HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
 HEADER2_LANGUAGE = ["PLAN_INDEX", "TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
 HEADER3_COUNTRY = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
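For context (not part of the diff): both renamed constants are read from the environment, loaded via python-dotenv from `PATHS["dotenv"]`. A minimal standalone sketch of that resolution, assuming a `.env` file containing `GCS_DEVELOPER_KEY` and `GCS_CX` entries:

```python
# Minimal sketch, not part of the diff: resolve the two credentials the
# renamed constants read. Assumes a .env file with GCS_DEVELOPER_KEY and
# GCS_CX entries in the working directory.
import os

from dotenv import load_dotenv

load_dotenv()  # merge .env entries into os.environ

GCS_DEVELOPER_KEY = os.getenv("GCS_DEVELOPER_KEY")  # Custom Search API key
GCS_CX = os.getenv("GCS_CX")  # Programmable Search Engine ID

if not GCS_DEVELOPER_KEY or not GCS_CX:
    raise SystemExit("GCS_DEVELOPER_KEY and GCS_CX must be set")
```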
@@ -87,7 +88,11 @@ def get_search_service():
     """
     LOGGER.info("Getting Google Custom Search API Service.")
     return googleapiclient.discovery.build(
-        "customsearch", "v1", developerKey=DEVELOPER_KEY, cache_discovery=False
+        "customsearch",
+        "v1",
+        developerKey=GCS_DEVELOPER_KEY,
+        cache_discovery=False,
+        num_retries=5,
     )
 
 
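As a rough usage sketch of the service built above (the `cse().list(...)` call is the standard Custom Search API surface; the placeholder key, engine ID, and query URL are assumptions, and `num_retries` on `build()` requires a reasonably recent `google-api-python-client`):

```python
# Hedged sketch of querying the Custom Search service; YOUR_KEY, YOUR_CX,
# and the example query URL are placeholders, not values from the script.
import googleapiclient.discovery

service = googleapiclient.discovery.build(
    "customsearch",
    "v1",
    developerKey="YOUR_KEY",
    cache_discovery=False,
    num_retries=5,  # retry transient failures when fetching the discovery doc
)
results = (
    service.cse()
    .list(cx="YOUR_CX", q="https://creativecommons.org/licenses/by/4.0/")
    .execute()
)
print(results.get("searchInformation", {}).get("totalResults", 0))
```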
@@ -184,21 +189,15 @@ def query_gcs(args, service, last_completed_plan_index, plan):
 
     max_tries = 5
     initial_delay = 1  # in seconds
+    rate_delay = copy(initial_delay)  # query gently
     start = last_completed_plan_index + 1
     stop = start + args.limit
 
     for plan_row in plan[start:stop]:  # noqa: E203
         index = plan.index(plan_row)
         query_info = f"index: {index}, tool: {plan_row['TOOL_IDENTIFIER']}"
         encoded_tool_url = urllib.parse.quote(plan_row["TOOL_URL"], safe=":/")
-        query_params = {
-            "cx": CX,
-            # "num": records_per_query,
-            # "start": start_index,
-            # "cr": cr,
-            # "lr": lr,
-            "q": encoded_tool_url,
-        }
+        query_params = {"cx": GCS_CX, "q": encoded_tool_url}
         if plan_row["COUNTRY"]:
             query_info = f"{query_info}, country: {plan_row['COUNTRY']}"
             query_params["cr"] = plan_row["CR"]
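The simplified `query_params` keeps only the two required keys and lets the per-row restrictions extend it. A sketch of that shape in isolation (the `plan_row` values here are invented, and the parallel `LANGUAGE`/`LR` branch is an assumption based on the commented-out `lr` key this diff removes):

```python
# Sketch of the per-row query construction; all plan_row values below are
# invented examples, and the LANGUAGE branch is assumed, not shown in the hunk.
query_params = {"cx": "YOUR_CX", "q": "https%3A//example.org/tool"}
plan_row = {"COUNTRY": "United States", "CR": "countryUS", "LANGUAGE": "", "LR": ""}
if plan_row["COUNTRY"]:
    query_params["cr"] = plan_row["CR"]  # country restrict (e.g. countryUS)
if plan_row["LANGUAGE"]:
    query_params["lr"] = plan_row["LR"]  # language restrict (e.g. lang_en)
print(query_params)
```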
@@ -222,6 +221,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
                     results.get("searchInformation", {}).get("totalResults", 0)
                 )
                 success = True
+                time.sleep(rate_delay)
                 break  # no need to try again
 
             except HttpError as e:
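Taken together with the `rate_delay` initialisation above, the new `time.sleep(rate_delay)` paces successful queries too, not just retries. A self-contained sketch of the overall retry shape (the error type, the stub query, and the doubling backoff are all assumptions; this hunk only shows the sleep on success):

```python
import random
import time
from copy import copy


class TransientError(Exception):
    """Hypothetical stand-in for googleapiclient.errors.HttpError."""


def do_query():
    # Hypothetical stub for the Custom Search call; fails ~30% of the time.
    if random.random() < 0.3:
        raise TransientError("simulated failure")
    return {"searchInformation": {"totalResults": "42"}}


max_tries = 5
initial_delay = 1  # in seconds
rate_delay = copy(initial_delay)  # query gently

for attempt in range(max_tries):
    try:
        results = do_query()
        time.sleep(rate_delay)  # pace even successful queries, as the diff adds
        break  # no need to try again
    except TransientError:
        rate_delay *= 2  # assumed backoff growth; not shown in this hunk
        time.sleep(rate_delay)
else:
    raise SystemExit(f"giving up after {max_tries} tries")
```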
@@ -230,7 +230,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
                     "Quota exceeded" in e.reason
                     and "Queries per day" in e.reason
                 ):
-                    LOGGER.warning(f"{e.status_code}: {e.reason}.")
+                    LOGGER.warning(f"{e.status_code}: {e.reason}")
                     return  # abort queries
                 else:
                     LOGGER.warning(
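The trailing-period removal keeps the warning format consistent with the script's other log lines. For reference, a small sketch of the quota-abort pattern this hunk touches (`request` and the helper are hypothetical; the `.reason` and `.status_code` attributes are taken from the diff itself):

```python
from googleapiclient.errors import HttpError


def execute_with_quota_check(request, logger):
    """Hypothetical helper: abort on daily-quota exhaustion, else re-raise.

    `request` is any pending googleapiclient HttpRequest.
    """
    try:
        return request.execute()
    except HttpError as e:
        if "Quota exceeded" in e.reason and "Queries per day" in e.reason:
            logger.warning(f"{e.status_code}: {e.reason}")
            return None  # abort queries; the daily quota resets tomorrow
        raise  # other HTTP errors: let the caller retry with backoff
```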