Content Sync: Testing the action

vancura · vancura · commit 50d39104b435 · 2025-02-21T14:14:36.000+01:00
diff --git a/.github/workflows/sync-to-astro.yml b/.github/workflows/sync-to-astro.yml
@@ -57,43 +57,43 @@ jobs:
 
       # Step 5: Run the MD to MDX conversion script.
       - name: Run sync script
-        run: cd sync/source && python scripts/md2mdx.py
+        run: python sync/source/scripts/md2mdx.py --source sync/source --target sync/target
 
       # Step 6: Create or update PR with changes.
       # - name: Create Pull Request
       #   env:
       #     GH_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}
       #   run: |
       #     cd sync/target
-          
+
       #     # Debug: Show current directory and its contents
       #     echo "Current directory: $(pwd)"
       #     echo "Directory contents:"
       #     ls -la
-          
+
       #     # Debug: Show git status in detail
       #     echo "Git status:"
       #     git status
-          
+
       #     # Debug: Show any differences
       #     echo "Git diff:"
       #     git diff
-          
+
       #     git config user.name "github-actions[bot]"
       #     git config user.email "github-actions[bot]@users.noreply.github.com"
 
       #     # Create new branch (with error handling).
       #     git checkout -b ${{ env.TARGET_BRANCH }} || git checkout ${{ env.TARGET_BRANCH }}
-          
+
       #     # Debug: Show branch status
       #     echo "Current branch:"
       #     git branch --show-current
-          
+
       #     # Only create PR if there are changes.
       #     if [[ -n "$(git status --porcelain)" ]]; then
       #       echo "Changes detected:"
       #       git status --porcelain
-            
+
       #       git add .
       #       git commit -m "sync: Update MDX content from whitepaper"
 
diff --git a/scripts/md2mdx.py b/scripts/md2mdx.py
@@ -35,17 +35,33 @@
 import frontmatter
 import re
 import glob
+import argparse
+
+
+def parse_args():
+    """Parse CLI options and return a namespace with optional ``source`` and ``target`` paths."""
+
+    parser = argparse.ArgumentParser(description='Transform Markdown files to MDX format.')
+    parser.add_argument('--source', type=str, help='Source directory containing markdown files')
+    parser.add_argument('--target', type=str, help='Target directory for MDX files')
+
+    return parser.parse_args()
+
 
 # Get the project root - need to handle both direct run and test-sync run.
 SCRIPT_PATH = Path(__file__).resolve()
-CURRENT_DIR = Path.cwd()
-
-PROJECT_ROOT = SCRIPT_PATH.parent.parent
-SOURCE_ROOT = PROJECT_ROOT / 'sync/source'
-TARGET_ROOT = PROJECT_ROOT / 'sync/target'
+args = parse_args()
 
 print(f'Script location: {__file__}')
-print(f'Project root: {PROJECT_ROOT}')
+
+# Each root can be overridden on its own: a flag that is omitted falls back to
+# the project default rather than discarding the flag that was supplied.
+PROJECT_ROOT = SCRIPT_PATH.parent.parent
+SOURCE_ROOT = (Path(args.source).resolve() if args.source
+               else PROJECT_ROOT / 'sync/source')
+TARGET_ROOT = (Path(args.target).resolve() if args.target
+               else PROJECT_ROOT / 'sync/target')
+
 print(f'Source root: {SOURCE_ROOT}')
 print(f'Target root: {TARGET_ROOT}')
 
@@ -94,7 +110,7 @@
 
 def should_process_file(path: Path) -> bool:
     """Determine if a file should be processed based on ignore rules."""
-    
+
     # Case-insensitive filename check.
     if path.name.lower() in IGNORED_FILES:
         print(f'\n󰋼  Skipping ignored file: {path.name}')
@@ -138,7 +154,7 @@ def replace_image(match):
 
 def add_github_header(content: str, is_readme: bool) -> str:
     """Add GitHub header component after the first heading, but only for README.md."""
-    
+
     if not is_readme:
         return content
 
@@ -309,61 +325,61 @@ def replace_comment(match):
 
 def transform_internal_links(content: str) -> str:
     """Transform internal markdown links to the new MDX format."""
-    
+
     print('\n󰋼  Transforming internal links...')
 
     def format_link_text(text):
         """Convert technical names to readable titles."""
-        
+
         # Remove file extensions.
         text = re.sub(r'\.(json|md)$', '', text)
-        
+
         # Handle special cases.
         if text == 'README':
             return 'Documentation'
-            
+
         # Convert UPPER_CASE to Title Case.
         if text.isupper():
             words = text.split('_')
             return ' '.join(word.capitalize() for word in words)
-            
+
         return text
 
     def replace_link(match):
         text, path, anchor = match.groups()
-        
+
         # Handle different link types.
         if path:
             # Remove .md extension if present.
             path = path.replace('.md', '')
-            
+
             if path == '../README':
                 # Links to README become root links.
                 new_path = '/'
             else:
                 # Remove ./ or / prefix if present.
                 path = path.lstrip('./').lstrip('/')
-                
+
                 # Convert to kebab case.
                 new_path = '/' + path.lower().replace('_', '-')
         else:
             new_path = ''
-            
+
         # Add anchor if present.
         if anchor:
             new_path = f"{new_path}{anchor}"
-            
+
         # Format the link text if it's a technical name.
         if text.endswith('.md') or text.endswith('.json') or text.isupper() or '.json' in text:
             text = format_link_text(text)
-            
+
         print(f'  ⭮  {text} → {new_path}')
         return f'[{text}]({new_path})'
 
     # First pass: handle standard markdown links.
     pattern = r'\[([^\]]+)\]\(((?!http)[^)#\s]+)?([#][^)\s]+)?\)'
     content = re.sub(pattern, replace_link, content)
-    
+
     # Second pass: handle already transformed links but with technical names.
     pattern = r'\[([A-Z_]+(?:\.(?:json|md))?)\](/[a-z-]+(?:[#][^)\s]+)?)\)'
     content = re.sub(pattern, lambda m: f'[{format_link_text(m.group(1))}]{m.group(2)}', content)
@@ -373,7 +389,7 @@ def replace_link(match):
 
 def remove_img_tags(content: str) -> str:
     """Remove HTML img tags from the content."""
-    
+
     print('\n󰋼  Removing img tags...')
 
     def replace_img(match):
@@ -390,20 +406,20 @@ def replace_img(match):
 
 def transform_inline_references(content: str) -> str:
     """Transform inline file references and URLs to proper format."""
-    
+
     print('\n󰋼  Transforming inline references...')
 
     def replace_reference(match):
         path = match.group(1)
-        
+
         # Remove .md extension if present.
         path = path.replace('.md', '')
-        
+
         # Convert to kebab case and add leading slash.
         new_path = '/' + path.lstrip('./').lstrip('/').lower().replace('_', '-')
-        
+
         print(f'  ⭮  {path} → {new_path}')
-        
+
         return new_path
 
     # Transform file references like ./DATASET_SCHEMA.md to /dataset-schema.
@@ -426,10 +442,10 @@ def replace_reference(match):
 
 def transform_markdown_to_mdx(content: str, source_file: Path) -> str:
     """Main transformation pipeline to convert markdown to MDX format."""
-    
+
     print('\n󰋼  Parsing frontmatter...')
     post = frontmatter.loads(content)
-    
+
     is_readme = source_file.name.lower() == 'readme.md'
 
     # Apply transformations in sequence.
@@ -451,14 +467,14 @@ def transform_markdown_to_mdx(content: str, source_file: Path) -> str:
 
 def get_target_path(source_path: Path) -> Path:
     """Convert source path to target path using the required transformations."""
-    
+
     # Get relative path from source root.
     rel_path = source_path.relative_to(SOURCE_ROOT)
-    
+
     # Transform filename.
     stem = rel_path.stem.lower().replace('_', '-')
     new_name = f"{stem}.mdx"
-    
+
     # Construct target path.
     if source_path.name == 'README.md':
         # Special case for README.md -> index.mdx.
@@ -470,47 +486,47 @@ def get_target_path(source_path: Path) -> Path:
 
 def process_files():
     """Main function to process all markdown files."""
-    
+
     try:
         # Find all markdown files to process.
         source_files = [
             Path(p) for p in [
                 *glob.glob(str(SOURCE_ROOT / '*.md')),  # root md files
                 *glob.glob(str(SOURCE_ROOT / 'pages/*.md'))  # files in pages directory
             ]
-            
+
             if should_process_file(Path(p))  # filter out ignored files
         ]
-        
+
         print(f'\n󰋼  Found {len(source_files)} markdown files to process')
-        
+
         for source_file in source_files:
             target_file = get_target_path(source_file)
             print(f'\n󰋼  Processing: {source_file.name} → {target_file.name}')
-            
+
             # Read source content.
             print(f'  Reading source file: {source_file}')
             with open(source_file, 'r', encoding='utf-8') as f:
                 content = f.read()
             print(f'  Source file size: {len(content)} bytes')
-            
+
             # Transform content.
             print('\n󰋼  Transforming content...')
             transformed_content = transform_markdown_to_mdx(content, source_file)
             print(f'  ⭮  {len(transformed_content)} bytes')
-            
+
             # Write target file.
             print(f'\n󰋼  Writing target file: {target_file}')
             os.makedirs(target_file.parent, exist_ok=True)
             with open(target_file, 'w', encoding='utf-8') as f:
                 f.write(transformed_content)
             print(f'  ⭮  {source_file.name} → {target_file.name}')
-        
+
         print('\n󰋼  Formatting MDX files...')
         os.system('npm run format-sync')
-        
+
         print('\n  Done')
-        
+
     except Exception as error:
         print('\n❌ Error processing files:', str(error))
         sys.exit(1)
diff --git a/scripts/test-sync.sh b/scripts/test-sync.sh
@@ -51,7 +51,7 @@ echo -e "\n  Python dependencies installed"
 
 echo -e "\n\n"
 echo "Current path: $(pwd)"
-python3 scripts/md2mdx.py
+python3 scripts/md2mdx.py --source "$WORK_DIR/source" --target "$WORK_DIR/target"
 echo -e "\n  MD to MDX conversion completed"
 
 echo -e "\n\n"