3535import frontmatter
3636import re
3737import glob
38+ import argparse
39+
40+
def parse_args():
    """Build the command line parser and return the parsed options.

    Two optional string options are registered, ``--source`` and ``--target``.
    An option that is not supplied is ``None`` on the returned namespace.
    """

    cli = argparse.ArgumentParser(description='Transform Markdown files to MDX format.')

    # (flag, help text) pairs; every option is an optional string.
    option_help = (
        ('--source', 'Source directory containing markdown files'),
        ('--target', 'Target directory for MDX files'),
    )
    for flag, description in option_help:
        cli.add_argument(flag, type=str, help=description)

    return cli.parse_args()
49+
3850
# Get the project root - need to handle both direct run and test-sync run.
SCRIPT_PATH = Path(__file__).resolve()
args = parse_args()

print(f'Script location: {__file__}')

# Default directories live under the project root, i.e. the parent of the
# directory that contains this script.
PROJECT_ROOT = SCRIPT_PATH.parent.parent

# Resolve each root on its own: an explicit --source or --target is honoured
# even when the other option is omitted, instead of being silently dropped.
SOURCE_ROOT = Path(args.source).resolve() if args.source else PROJECT_ROOT / 'sync/source'
TARGET_ROOT = Path(args.target).resolve() if args.target else PROJECT_ROOT / 'sync/target'

print(f'Source root: {SOURCE_ROOT}')
print(f'Target root: {TARGET_ROOT}')
5167
94110
95111def should_process_file (path : Path ) -> bool :
96112 """Determine if a file should be processed based on ignore rules."""
97-
113+
98114 # Case-insensitive filename check.
99115 if path .name .lower () in IGNORED_FILES :
100116 print (f'\n Skipping ignored file: { path .name } ' )
@@ -138,7 +154,7 @@ def replace_image(match):
138154
139155def add_github_header (content : str , is_readme : bool ) -> str :
140156 """Add GitHub header component after the first heading, but only for README.md."""
141-
157+
142158 if not is_readme :
143159 return content
144160
@@ -309,61 +325,61 @@ def replace_comment(match):
309325
310326def transform_internal_links (content : str ) -> str :
311327 """Transform internal markdown links to the new MDX format."""
312-
328+
313329 print ('\n Transforming internal links...' )
314330
def format_link_text(text):
    """Turn a technical file-style name into a human-readable link label.

    A trailing ``.json`` or ``.md`` extension is dropped first. The bare name
    ``README`` becomes ``Documentation``; an all-uppercase name is split on
    underscores and each word capitalized; anything else is returned with
    only the extension removed.
    """

    # Drop a trailing .json / .md extension.
    stem = re.sub(r'\.(json|md)$', '', text)

    # README is the one name with a dedicated label.
    if stem == 'README':
        return 'Documentation'

    # Names that are not UPPER_CASE are left as they are.
    if not stem.isupper():
        return stem

    # UPPER_CASE -> "Upper Case".
    return ' '.join(map(str.capitalize, stem.split('_')))
331347
def replace_link(match):
    """Rewrite one matched markdown link into its MDX-site form.

    ``match`` must expose three groups: the link text, the optional link
    path and the optional ``#anchor``. The rewritten ``[text](path)`` string
    is returned and the rewrite is logged to stdout.
    """

    text, path, anchor = match.groups()

    # Handle different link types.
    if path:
        # Remove only a trailing .md extension; a '.md' that merely occurs
        # inside the path (e.g. 'API.mdx') must be left alone.
        path = re.sub(r'\.md$', '', path)

        if path == '../README':
            # Links to README become root links.
            new_path = '/'
        else:
            # Remove leading '/', './' and '../' segments. A regex is used
            # because str.lstrip('./') strips a *set of characters* and would
            # also eat the dot of names such as '.github'.
            path = re.sub(r'^(?:\.{0,2}/)+', '', path)

            # Convert to kebab case.
            new_path = '/' + path.lower().replace('_', '-')
    else:
        # Anchor-only (or empty) link: there is no path component.
        new_path = ''

    # Add anchor if present.
    if anchor:
        new_path = f"{new_path}{anchor}"

    # Format the link text if it's a technical name.
    if text.endswith('.md') or '.json' in text or text.isupper():
        text = format_link_text(text)

    print(f' ⭮ {text} → {new_path}')
    return f'[{text}]({new_path})'
362378
363379 # First pass: handle standard markdown links.
364380 pattern = r'\[([^\]]+)\]\(((?!http)[^)#\s]+)?([#][^)\s]+)?\)'
365381 content = re .sub (pattern , replace_link , content )
366-
382+
367383 # Second pass: handle already transformed links but with technical names.
368384 pattern = r'\[([A-Z_]+(?:\.(?:json|md))?)\](/[a-z-]+(?:[#][^)\s]+)?)\)'
369385 content = re .sub (pattern , lambda m : f'[{ format_link_text (m .group (1 ))} ]{ m .group (2 )} ' , content )
@@ -373,7 +389,7 @@ def replace_link(match):
373389
374390def remove_img_tags (content : str ) -> str :
375391 """Remove HTML img tags from the content."""
376-
392+
377393 print ('\n Removing img tags...' )
378394
379395 def replace_img (match ):
@@ -390,20 +406,20 @@ def replace_img(match):
390406
391407def transform_inline_references (content : str ) -> str :
392408 """Transform inline file references and URLs to proper format."""
393-
409+
394410 print ('\n Transforming inline references...' )
395411
def replace_reference(match):
    """Convert one matched inline file reference into a site path.

    Group 1 of ``match`` is the referenced path. The returned value is the
    lower-cased, kebab-cased path with a leading slash and without a
    trailing ``.md`` extension. The rewrite is logged to stdout.
    """

    path = match.group(1)

    # Remove only a trailing .md extension; '.md' occurring elsewhere in the
    # path (e.g. 'notes.mdx') is not an extension and must be kept.
    path = re.sub(r'\.md$', '', path)

    # Drop leading '/', './' and '../' segments. str.lstrip('./') would strip
    # a character set and also remove the dot of names such as '.github'.
    trimmed = re.sub(r'^(?:\.{0,2}/)+', '', path)

    # Convert to kebab case and add leading slash.
    new_path = '/' + trimmed.lower().replace('_', '-')

    print(f' ⭮ {path} → {new_path}')

    return new_path
408424
409425 # Transform file references like ./DATASET_SCHEMA.md to /dataset-schema.
@@ -426,10 +442,10 @@ def replace_reference(match):
426442
427443def transform_markdown_to_mdx (content : str , source_file : Path ) -> str :
428444 """Main transformation pipeline to convert markdown to MDX format."""
429-
445+
430446 print ('\n Parsing frontmatter...' )
431447 post = frontmatter .loads (content )
432-
448+
433449 is_readme = source_file .name .lower () == 'readme.md'
434450
435451 # Apply transformations in sequence.
@@ -451,14 +467,14 @@ def transform_markdown_to_mdx(content: str, source_file: Path) -> str:
451467
452468def get_target_path (source_path : Path ) -> Path :
453469 """Convert source path to target path using the required transformations."""
454-
470+
455471 # Get relative path from source root.
456472 rel_path = source_path .relative_to (SOURCE_ROOT )
457-
473+
458474 # Transform filename.
459475 stem = rel_path .stem .lower ().replace ('_' , '-' )
460476 new_name = f"{ stem } .mdx"
461-
477+
462478 # Construct target path.
463479 if source_path .name == 'README.md' :
464480 # Special case for README.md -> index.mdx.
@@ -470,47 +486,47 @@ def get_target_path(source_path: Path) -> Path:
470486
def process_files():
    """Convert every eligible markdown file under SOURCE_ROOT to MDX.

    Collects ``*.md`` files from SOURCE_ROOT and from its ``pages``
    subdirectory, keeps those accepted by ``should_process_file``, passes
    each file's text through ``transform_markdown_to_mdx`` and writes the
    result to the path returned by ``get_target_path``. Finally runs
    ``npm run format-sync``.

    Any exception raised along the way is reported on stdout and the process
    exits with status 1.
    """

    try:
        # Find all markdown files to process.
        candidates = [
            *glob.glob(str(SOURCE_ROOT / '*.md')),  # root md files
            *glob.glob(str(SOURCE_ROOT / 'pages/*.md')),  # files in pages directory
        ]

        # Filter out ignored files, building each Path only once.
        source_files = [path for path in map(Path, candidates) if should_process_file(path)]

        print(f'\n Found {len(source_files)} markdown files to process')

        for source_file in source_files:
            target_file = get_target_path(source_file)
            print(f'\n Processing: {source_file.name} → {target_file.name}')

            # Read source content.
            print(f' Reading source file: {source_file}')
            content = source_file.read_text(encoding='utf-8')
            print(f' Source file size: {len(content)} bytes')

            # Transform content.
            print('\n Transforming content...')
            transformed_content = transform_markdown_to_mdx(content, source_file)
            print(f' ⭮ {len(transformed_content)} bytes')

            # Write target file, creating its directory on first use.
            print(f'\n Writing target file: {target_file}')
            target_file.parent.mkdir(parents=True, exist_ok=True)
            target_file.write_text(transformed_content, encoding='utf-8')
            print(f' ⭮ {source_file.name} → {target_file.name}')

        print('\n Formatting MDX files...')
        status = os.system('npm run format-sync')
        if status != 0:
            # The MDX files are already written at this point, so a failed
            # formatter run is reported instead of being silently ignored.
            print(f'\n ⚠️ Formatting command exited with status {status}')

        print('\n Done')

    except Exception as error:
        # Top-level boundary: report the failure and stop with a non-zero status.
        print('\n ❌ Error processing files:', str(error))
        sys.exit(1)
0 commit comments