Commit ddb3958

Merge pull request #218 from creativecommons/encoding-utf8-newline-unix
ensure encoding (utf-8) and newline (unix) are consistent across platforms
2 parents: 6d68820 + 24d412f

File tree: 6 files changed, +14 −14 lines

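Why this matters: when encoding and newline are omitted, Python's open() falls back to the platform locale encoding and, in text write mode, translates "\n" to os.linesep, so the same script emits different bytes on Windows than on Linux or macOS. A minimal sketch of the defaults this PR takes out of play (the file name is illustrative, not from this repo):

    import locale
    import os

    # Platform-dependent defaults that implicit open() calls inherit:
    print(locale.getpreferredencoding(False))  # e.g. "cp1252" on Windows, "UTF-8" on Linux
    print(repr(os.linesep))                    # "\r\n" on Windows, "\n" elsewhere

    # Explicit arguments produce identical bytes on every platform
    # ("example.txt" is a hypothetical path):
    with open("example.txt", "w", encoding="utf-8", newline="\n") as file_obj:
        file_obj.write("same bytes everywhere\n")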

dev/create_gcs_query_plan.py

Lines changed: 4 additions & 4 deletions
@@ -117,7 +117,7 @@ def get_tool_urls():
     file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         for line in file_obj:
             tool_urls.append(f"{prefix}{line.strip()}")
     LOGGER.info("Prioritizing CC Legal Tool URLs")
@@ -127,14 +127,14 @@ def get_tool_urls():
 
 def load_countries():
     file_path = shared.path_join(PATHS["data"], "gcs_country_collection.yaml")
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         countries = yaml.safe_load(file_obj)
     return countries
 
 
 def load_languages():
     file_path = shared.path_join(PATHS["data"], "gcs_language_collection.yaml")
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         languages = yaml.safe_load(file_obj)
     return languages
 
@@ -209,7 +209,7 @@ def save_plan(plan):
         "LANGUAGE",
         "LR",
     ]
-    with open(file_path, "w") as file_obj:
+    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=fieldnames, dialect="unix"
         )
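The pairing of newline="\n" with the csv module's unix dialect is worth noting: that dialect quotes every field and terminates rows with "\n", and newline="\n" stops the text layer from rewriting the terminator as "\r\n" on Windows. A minimal sketch of the pattern, with made-up field names rather than the repo's actual plan schema:

    import csv

    fieldnames = ["TOOL_IDENTIFIER", "COUNTRY", "LANGUAGE"]  # illustrative only
    rows = [{"TOOL_IDENTIFIER": "by/4.0", "COUNTRY": "US", "LANGUAGE": "en"}]

    with open("plan.csv", "w", encoding="utf-8", newline="\n") as file_obj:
        # dialect="unix": QUOTE_ALL plus "\n" row terminator
        writer = csv.DictWriter(file_obj, fieldnames=fieldnames, dialect="unix")
        writer.writeheader()
        writer.writerows(rows)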

dev/prioritize_tools.py

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ def get_tool_urls():
     file_path = shared.path_join(PATHS["data"], "legal-tool-paths.txt")
     prefix = "//creativecommons.org/"
     tool_urls = []
-    with open(file_path, "r") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         for line in file_obj:
             tool_urls.append(f"{prefix}{line.strip()}")
     return tool_urls
@@ -112,7 +112,7 @@ def save_tools_list(tool_urls):
     LOGGER.info("Saving prioritized CC Legal Tool URLs")
     file_path = shared.path_join(PATHS["data"], "prioritized-tool-urls.txt")
     tool_urls.append("")  # ensure file has end of file newline
-    with open(file_path, "w") as file_obj:
+    with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
         file_obj.writelines("\n".join(tool_urls))
 
 
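One detail in the context lines: appending an empty string before "\n".join() is what gives the file its trailing newline, since join only inserts separators between elements. For example (the URL is illustrative):

    tool_urls = ["//creativecommons.org/licenses/by/4.0/"]
    tool_urls.append("")  # ensure file has end of file newline
    print(repr("\n".join(tool_urls)))
    # '//creativecommons.org/licenses/by/4.0/\n'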

scripts/1-fetch/gcs_fetch.py

Lines changed: 4 additions & 4 deletions
@@ -104,7 +104,7 @@ def get_search_service():
 
 def initialize_data_file(file_path, header):
     if not os.path.isfile(file_path):
-        with open(file_path, "w", newline="") as file_obj:
+        with open(file_path, "w", encoding="utf-8", newline="\n") as file_obj:
            writer = csv.DictWriter(
                file_obj, fieldnames=header, dialect="unix"
            )
@@ -127,7 +127,7 @@ def get_last_completed_plan_index():
     last_completed_plan_index = 0
     for file_path in [FILE1_COUNT, FILE2_LANGUAGE, FILE3_COUNTRY]:
         try:
-            with open(file_path, "r", newline="") as file_obj:
+            with open(file_path, "r", encoding="utf-8") as file_obj:
                 reader = csv.DictReader(file_obj, dialect="unix")
                 for row in reader:
                     pass  # skip through to last row
@@ -147,7 +147,7 @@ def get_last_completed_plan_index():
 def load_plan():
     plan = []
     file_path = shared.path_join(PATHS["data"], "gcs_query_plan.csv")
-    with open(file_path, "r", newline="") as file_obj:
+    with open(file_path, "r", encoding="utf-8") as file_obj:
         plan = list(csv.DictReader(file_obj, dialect="unix"))
     return plan
 

@@ -181,7 +181,7 @@ def append_data(args, plan_row, index, count):
         "TOOL_IDENTIFIER": plan_row["TOOL_IDENTIFIER"],
         "COUNT": count,
     }
-    with open(file_path, "a", newline="") as file_obj:
+    with open(file_path, "a", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=fieldnames, dialect="unix"
         )
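The read-side changes here drop newline="" as well as adding the encoding. get_last_completed_plan_index() relies on a small idiom: exhausting a csv.DictReader leaves the loop variable bound to the last row read. A rough sketch, assuming the data files carry an index-like column ("PLAN_INDEX" is a hypothetical name, not confirmed by this diff):

    import csv

    def last_completed_index(file_path):
        row = None  # stays None if the file has no data rows
        with open(file_path, "r", encoding="utf-8") as file_obj:
            reader = csv.DictReader(file_obj, dialect="unix")
            for row in reader:
                pass  # skip through to last row
        return int(row["PLAN_INDEX"]) if row else 0

One caveat worth knowing: the csv docs recommend newline="" on reads so that newlines embedded in quoted fields round-trip correctly; dropping it is presumably fine here because these data files contain no multi-line fields.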

scripts/1-fetch/github_fetch.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ def write_data(args, tool_data):
         LOGGER.error("Unable to fetch all records. Aborting.")
         return args
 
-    with open(FILE1_COUNT, "w", newline="") as file_obj:
+    with open(FILE1_COUNT, "w", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
         )

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ def write_data(args, tool_data):
     LOGGER.info("Saving fetched data")
     os.makedirs(PATHS["data_phase"], exist_ok=True)
 
-    with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
+    with open(FILE_LANGUAGES, "w", encoding="utf-8", newline="\n") as file_obj:
         writer = csv.DictWriter(
             file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
         )
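This is the one call that was already explicit about encoding; the edit only standardizes the newline spelling. For writing, newline="" and newline="\n" behave the same (the former disables translation entirely, the latter maps "\n" to itself), so the bytes on disk should be unchanged. A quick check of that equivalence (file names are throwaway examples):

    import csv

    for name, newline_arg in [("out_empty.csv", ""), ("out_lf.csv", "\n")]:
        with open(name, "w", encoding="utf-8", newline=newline_arg) as f:
            csv.writer(f, dialect="unix").writerow(["a", "b"])

    with open("out_empty.csv", "rb") as f1, open("out_lf.csv", "rb") as f2:
        assert f1.read() == f2.read()  # both are b'"a","b"\n'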

scripts/shared.py

Lines changed: 2 additions & 2 deletions
@@ -239,7 +239,7 @@ def update_readme(
     entry_end_line = f"<!-- {entry_title} End -->\n"
 
     if os.path.exists(readme_path):
-        with open(readme_path, "r") as f:
+        with open(readme_path, "r", encoding="utf-8") as f:
             lines = f.readlines()
     else:
         lines = []
@@ -327,7 +327,7 @@ def update_readme(
     )
 
     # Write back to the README.md file
-    with open(readme_path, "w") as f:
+    with open(readme_path, "w", encoding="utf-8", newline="\n") as f:
         f.writelines(lines)
 
     logger.info(f"README path: {readme_path.replace(paths['repo'], '.')}")
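update_readme() reads the README with readlines() (each line keeps its "\n") and writes the list back verbatim, so the explicit encoding and newline are what keep a README regenerated on Windows byte-identical to one regenerated on Linux. The visible context suggests content is spliced between HTML comment markers; a rough sketch of that idea (the Start marker and function shape are inferred by symmetry, not the repo's actual code):

    def splice_entry(lines, entry_title, new_body_lines):
        entry_start_line = f"<!-- {entry_title} Start -->\n"
        entry_end_line = f"<!-- {entry_title} End -->\n"
        if entry_start_line in lines and entry_end_line in lines:
            i = lines.index(entry_start_line)
            j = lines.index(entry_end_line)
            return lines[: i + 1] + new_body_lines + lines[j:]
        # Markers absent: append a fresh block at the end
        return lines + [entry_start_line] + new_body_lines + [entry_end_line]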
