1+ #!/usr/bin/env python3
2+
3+ import os
4+ import re
5+ import markdown
6+ import glob
7+ import html
8+
9+
def parse_databricks_notebook(filepath):
    """Parse a Databricks ``.py`` notebook export into a list of cells.

    The file is split into sections on ``# COMMAND ----------`` separator
    lines. A section containing a line that starts with ``# MAGIC %md`` is
    treated as a markdown cell; every other non-empty section is a code cell.

    Args:
        filepath: Path to the Databricks source-format notebook.

    Returns:
        A list of dicts, in file order, each with the keys ``'type'``
        (``'markdown'`` or ``'code'``) and ``'content'`` (the cell text).
        For markdown cells the ``# MAGIC`` prefixes are removed; for code
        cells the ``# DBTITLE`` lines and the notebook header line are
        removed and the result is stripped. Empty code cells are skipped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    separator = r'^# COMMAND -{10}[ \t]*$'
    md_marker = '# MAGIC %md'
    magic_prefix = '# MAGIC'
    sandbox_suffix = '-sandbox'
    # Bookkeeping lines that are not part of the user's code.
    non_code_prefixes = ('# DBTITLE', '# Databricks notebook source')

    # Explicit encoding so the result does not depend on the platform locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    cells = []
    # Anchor the separator to a full line so the same text appearing inside
    # a string or comment does not split a cell.
    for section in re.split(separator, content, flags=re.MULTILINE):
        if not section.strip():
            continue

        lines = section.split('\n')

        # Test line prefixes, not a substring of the whole section, so a code
        # cell that merely mentions the marker is not taken for markdown.
        if any(line.startswith(md_marker) for line in lines):
            md_lines = []
            for line in lines:
                if line.startswith(md_marker):
                    rest = line[len(md_marker):]
                    # '%md-sandbox' is a markdown variant; drop the suffix
                    # instead of leaking it into the rendered text.
                    if rest.startswith(sandbox_suffix):
                        rest = rest[len(sandbox_suffix):]
                    md_lines.append(rest.strip())
                elif line.startswith(magic_prefix + ' '):
                    md_lines.append(line[len(magic_prefix) + 1:])
                elif line.startswith(magic_prefix):
                    # Bare '# MAGIC' line: an empty markdown line.
                    md_lines.append(line[len(magic_prefix):])
                # Lines without the MAGIC prefix (header, DBTITLE, blanks)
                # are not part of the markdown body and are dropped.

            cells.append({'type': 'markdown', 'content': '\n'.join(md_lines)})
        else:
            code_lines = [
                line for line in lines
                if not line.startswith(non_code_prefixes)
            ]
            code_content = '\n'.join(code_lines).strip()
            if code_content:
                cells.append({'type': 'code', 'content': code_content})

    return cells
55+
56+
def convert_to_html_fragment(filepath):
    """Convert a Databricks ``.py`` notebook to an HTML fragment.

    Each cell returned by ``parse_databricks_notebook`` is wrapped in
    nbconvert-style ``<div>`` markup: markdown cells are rendered with the
    ``markdown`` package, code cells are HTML-escaped and placed in a
    ``<pre><code class="language-python">`` element.

    Side effect: the fragment is also written to
    ``temp_<name>_fragment.html`` in the current working directory.

    Args:
        filepath: Path to the Databricks source-format notebook.

    Returns:
        A ``(name, fragment)`` tuple, where ``name`` is the file's base name
        without its extension and ``fragment`` is the HTML string. The
        fragment is not a complete HTML document.
    """
    filename = os.path.basename(filepath)
    name_without_ext = os.path.splitext(filename)[0]

    cells = parse_databricks_notebook(filepath)
    html_content = []

    for cell in cells:
        if cell['type'] == 'markdown':
            # Convert markdown to HTML using nbconvert structure
            md_html = markdown.markdown(
                cell['content'],
                extensions=['fenced_code', 'tables', 'nl2br', 'toc'],
            )
            html_content.append(f'''<div class="cell border-box-sizing text_cell rendered">
<div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
{md_html}
</div>
</div>
</div>''')
        elif cell['type'] == 'code':
            # Escape the source so '<', '>' and '&' display literally.
            escaped_code = html.escape(cell['content'])
            # No whitespace around the placeholder: inside <pre> every
            # character, including a trailing space, is rendered.
            html_content.append(f'''<div class="cell border-box-sizing code_cell rendered">
<div class="input">
<div class="inner_cell">
<div class="input_area">
<div class="highlight hl-ipython3">
<pre class="language-python"><code class="language-python">{escaped_code}</code></pre>
</div>
</div>
</div>
</div>
</div>''')

    # Return just the content fragment (no full HTML document)
    fragment_content = '\n'.join(html_content)

    # Write fragment to temp file for the main script to read. Explicit
    # UTF-8 so non-ASCII notebook text does not depend on the locale.
    temp_path = f"temp_{name_without_ext}_fragment.html"
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.write(fragment_content)

    return name_without_ext, fragment_content
103+
104+
if __name__ == "__main__":
    import json

    # Process all .py files in the notebooks directory. glob returns matches
    # in arbitrary order, so sort them to keep the console output and the
    # key order of the JSON file reproducible between runs.
    notebook_data = {}
    for py_file in sorted(glob.glob('notebooks/*.py')):
        name, fragment = convert_to_html_fragment(py_file)
        # Keyed by base name without extension, mapping to the HTML fragment.
        notebook_data[name] = fragment
        print(f"Converted {py_file} to HTML fragment")

    # Write notebook data to a JSON file for the main script
    with open('notebook_fragments.json', 'w', encoding='utf-8') as f:
        json.dump(notebook_data, f)
0 commit comments