@@ -28,7 +28,7 @@ def __str__(self) -> str: ...
2828logging .getLogger ("opentelemetry.sdk.metrics._internal.instrument" ).setLevel (logging .ERROR )
2929
3030
31- def generate_permalink (file_path : Union [Path , str , PathLike ]) -> str :
31+ def generate_permalink (file_path : Union [Path , str , PathLike ], split_extension : bool = True ) -> str :
3232 """Generate a stable permalink from a file path.
3333
3434 Args:
@@ -51,53 +51,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5151 # Convert Path to string if needed
5252 path_str = Path (str (file_path )).as_posix ()
5353
54- # Remove extension
55- base = os .path .splitext (path_str )[ 0 ]
54+ # Remove extension (for now, possibly)
55+ ( base , extension ) = os .path .splitext (path_str )
5656
5757 # Check if we have CJK characters that should be preserved
58- # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
58+ # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
5959 # \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
6060 has_cjk_chars = any (
61- ' \u4e00 ' <= char <= ' \u9fff ' or
62- ' \u3000 ' <= char <= ' \u303f ' or
63- ' \u3400 ' <= char <= ' \u4dbf ' or
64- ' \uff00 ' <= char <= ' \uffef '
61+ " \u4e00 " <= char <= " \u9fff "
62+ or " \u3000 " <= char <= " \u303f "
63+ or " \u3400 " <= char <= " \u4dbf "
64+ or " \uff00 " <= char <= " \uffef "
6565 for char in base
6666 )
67-
67+
6868 if has_cjk_chars :
6969 # For text with CJK characters, selectively transliterate only Latin accented chars
7070 result = ""
7171 for char in base :
72- if ('\u4e00 ' <= char <= '\u9fff ' or
73- '\u3000 ' <= char <= '\u303f ' or
74- '\u3400 ' <= char <= '\u4dbf ' ):
72+ if (
73+ "\u4e00 " <= char <= "\u9fff "
74+ or "\u3000 " <= char <= "\u303f "
75+ or "\u3400 " <= char <= "\u4dbf "
76+ ):
7577 # Preserve CJK ideographs and symbols
7678 result += char
77- elif ( ' \uff00 ' <= char <= ' \uffef ' ) :
79+ elif " \uff00 " <= char <= " \uffef " :
7880 # Remove Chinese fullwidth punctuation entirely (like ,!?)
7981 continue
8082 else :
8183 # Transliterate Latin accented characters to ASCII
8284 result += unidecode (char )
83-
85+
8486 # Insert hyphens between CJK and Latin character transitions
8587 # Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
86- result = re .sub (r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])' , r'\1-\2' , result )
87- result = re .sub (r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])' , r'\1-\2' , result )
88-
88+ result = re .sub (
89+ r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])" , r"\1-\2" , result
90+ )
91+ result = re .sub (
92+ r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])" , r"\1-\2" , result
93+ )
94+
8995 # Insert dash between camelCase
9096 result = re .sub (r"([a-z0-9])([A-Z])" , r"\1-\2" , result )
91-
97+
9298 # Convert ASCII letters to lowercase, preserve CJK
9399 lower_text = "" .join (c .lower () if c .isascii () and c .isalpha () else c for c in result )
94-
100+
95101 # Replace underscores with hyphens
96102 text_with_hyphens = lower_text .replace ("_" , "-" )
97-
103+
98104 # Remove apostrophes entirely (don't replace with hyphens)
99105 text_no_apostrophes = text_with_hyphens .replace ("'" , "" )
100-
106+
101107 # Replace unsafe chars with hyphens, but preserve CJK characters
102108 clean_text = re .sub (
103109 r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]" , "-" , text_no_apostrophes
@@ -129,7 +135,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
129135 segments = clean_text .split ("/" )
130136 clean_segments = [s .strip ("-" ) for s in segments ]
131137
132- return "/" .join (clean_segments )
138+ return_val = "/" .join (clean_segments )
139+
140+ # Append file extension back, if necessary
141+ if not split_extension and extension :
142+ return_val += extension
143+
144+ return return_val
133145
134146
135147def setup_logging (
@@ -229,79 +241,79 @@ def normalize_newlines(multiline: str) -> str:
229241 Returns:
230242 A string with normalized newlines native to the platform.
231243 """
232- return re .sub (r' \r\n?|\n' , os .linesep , multiline )
244+ return re .sub (r" \r\n?|\n" , os .linesep , multiline )
233245
234246
def normalize_file_path_for_comparison(file_path: str) -> str:
    """Return a canonical form of a file path for conflict detection.

    Two paths that differ only in letter case, in Unicode composition, in
    the kind of path separator, or in repeated separators map to the same
    canonical string. The steps, in order:

    - lowercase the whole path
    - apply Unicode NFD normalization
    - turn backslashes into forward slashes
    - collapse runs of forward slashes into a single slash

    Args:
        file_path: The file path to normalize

    Returns:
        The normalized path, intended for comparison only
    """
    import unicodedata

    # Lowercase first, then decompose so that precomposed and combining-mark
    # spellings of the same character compare equal.
    decomposed = unicodedata.normalize("NFD", file_path.lower())

    # Unify separators before collapsing, so mixed runs such as "\\/" also
    # end up as a single forward slash.
    forward_slashed = decomposed.replace("\\", "/")
    return re.sub(r"/+", "/", forward_slashed)
264276
265277
def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
    """Find existing paths that could collide with ``file_path``.

    An existing path is reported when it is not the exact same string as
    ``file_path`` and either of the following holds:

    - both paths are equal after ``normalize_file_path_for_comparison``
      (case, Unicode normalization and path separator differences)
    - both paths produce the same permalink via ``generate_permalink``

    Args:
        file_path: The file path to check
        existing_paths: Existing file paths to compare against

    Returns:
        The conflicting entries of ``existing_paths``, in their original order
    """
    # Derive both comparison keys for the candidate once, up front.
    target_key = normalize_file_path_for_comparison(file_path)
    target_permalink = generate_permalink(file_path)

    def _clashes(candidate: str) -> bool:
        # The normalized-path comparison runs first; the permalink is only
        # generated when that comparison does not already match.
        if normalize_file_path_for_comparison(candidate) == target_key:
            return True
        return generate_permalink(candidate) == target_permalink

    # Identical strings are never treated as conflicts with themselves.
    return [
        candidate
        for candidate in existing_paths
        if candidate != file_path and _clashes(candidate)
    ]
306318
307319
@@ -336,13 +348,13 @@ def validate_project_path(path: str, project_path: Path) -> bool:
336348
337349def ensure_timezone_aware (dt : datetime ) -> datetime :
338350 """Ensure a datetime is timezone-aware using system timezone.
339-
351+
340352 If the datetime is naive, convert it to timezone-aware using the system's local timezone.
341353 If it's already timezone-aware, return it unchanged.
342-
354+
343355 Args:
344356 dt: The datetime to ensure is timezone-aware
345-
357+
346358 Returns:
347359 A timezone-aware datetime
348360 """
@@ -351,4 +363,4 @@ def ensure_timezone_aware(dt: datetime) -> datetime:
351363 return dt .astimezone ()
352364 else :
353365 # Already timezone-aware
354- return dt
366+ return dt
0 commit comments