Skip to content

Commit 4ff49db

Browse files
Initial commit
0 parents  commit 4ff49db

18 files changed

+1603
-0
lines changed
Lines changed: 116 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,116 @@
1+
#!/usr/bin/env python3
2+
3+
import os
4+
import re
5+
import markdown
6+
import glob
7+
import html
8+
9+
10+
# Marker line Databricks writes at the very top of an exported .py notebook.
_NOTEBOOK_HEADER = '# Databricks notebook source'
# Text that separates two consecutive cells in the exported source.
_CELL_SEPARATOR = r'# COMMAND ----------'
# Prefix carried by every line of a "magic" (non-Python) cell.
_MAGIC_PREFIX = '# MAGIC'
# First line of a markdown cell.
_MD_MAGIC = '# MAGIC %md'
# Optional suffix of the markdown magic (``%md-sandbox``).
_MD_SANDBOX_SUFFIX = '-sandbox'


def _extract_markdown(lines):
    """Return the markdown text carried by the ``# MAGIC`` lines of a cell.

    Lines that do not start with ``# MAGIC`` (blank lines, ``# DBTITLE``
    lines, ...) are ignored.
    """
    md_lines = []
    for line in lines:
        if line.startswith(_MD_MAGIC):
            rest = line[len(_MD_MAGIC):]
            # '%md-sandbox' is a markdown magic too; '-sandbox' is not content.
            if rest.startswith(_MD_SANDBOX_SUFFIX):
                rest = rest[len(_MD_SANDBOX_SUFFIX):]
            md_lines.append(rest.strip())
        elif line.startswith(_MAGIC_PREFIX + ' '):
            # Drop '# MAGIC ' but keep any further indentation of the text.
            md_lines.append(line[len(_MAGIC_PREFIX) + 1:])
        elif line.startswith(_MAGIC_PREFIX):
            # Bare '# MAGIC' line: an empty markdown line.
            md_lines.append(line[len(_MAGIC_PREFIX):])
    return '\n'.join(md_lines)


def parse_databricks_notebook(filepath):
    """Parse a Databricks .py notebook format into cells.

    Args:
        filepath: Path of the exported notebook source file (read as UTF-8).

    Returns:
        A list of dicts, one per non-empty cell, in file order. Each dict
        has a ``'type'`` key (``'markdown'`` or ``'code'``) and a
        ``'content'`` key holding the cell text. Markdown cells have their
        ``# MAGIC`` prefixes removed; code cells have ``# DBTITLE`` lines
        removed and surrounding whitespace stripped.
    """
    # Explicit encoding so the result does not depend on the locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # The export header is not part of any cell; without this it would be
    # rendered as the first line of the first code cell.
    first_line, _, remainder = content.partition('\n')
    if first_line.strip() == _NOTEBOOK_HEADER:
        content = remainder

    cells = []
    for section in re.split(_CELL_SEPARATOR, content):
        if not section.strip():
            continue

        lines = section.split('\n')

        # Decide per line, not per substring: a code cell that merely
        # mentions '# MAGIC %md' inside a string must stay a code cell.
        if any(line.startswith(_MD_MAGIC) for line in lines):
            cells.append({'type': 'markdown', 'content': _extract_markdown(lines)})
            continue

        # Code cell: drop the cell-title metadata lines.
        code_content = '\n'.join(
            line for line in lines if not line.startswith('# DBTITLE')
        ).strip()
        if code_content:
            cells.append({'type': 'code', 'content': code_content})

    return cells
55+
56+
57+
def convert_to_html_fragment(filepath):
    """Convert Databricks .py notebook to HTML fragment with syntax highlighting.

    Markdown cells are rendered with the ``markdown`` package; code cells are
    HTML-escaped and wrapped in ``<pre><code class="language-python">``. The
    cell wrappers reuse nbconvert-style CSS class names.

    Side effect: the fragment is also written to
    ``temp_<notebook name>_fragment.html`` in the current working directory.

    Args:
        filepath: Path of the notebook source file.

    Returns:
        A ``(name_without_ext, fragment_content)`` tuple: the file's base
        name without extension, and the HTML fragment (no ``<html>`` /
        ``<body>`` wrapper) with one block per cell joined by newlines.
    """
    filename = os.path.basename(filepath)
    name_without_ext = os.path.splitext(filename)[0]

    cells = parse_databricks_notebook(filepath)
    html_content = []

    for cell in cells:
        if cell['type'] == 'markdown':
            # Rendered markdown is inserted as-is: it is already HTML.
            md_html = markdown.markdown(
                cell['content'],
                extensions=['fenced_code', 'tables', 'nl2br', 'toc']
            )
            html_content.append(f'''<div class="cell border-box-sizing text_cell rendered">
<div class="inner_cell">
<div class="text_cell_render border-box-sizing rendered_html">
{md_html}
</div>
</div>
</div>''')
        elif cell['type'] == 'code':
            # Escape so that '<', '>' and '&' in the source show up literally.
            escaped_code = html.escape(cell['content'])
            html_content.append(f'''<div class="cell border-box-sizing code_cell rendered">
<div class="input">
<div class="inner_cell">
<div class="input_area">
<div class="highlight hl-ipython3">
<pre class="language-python"><code class="language-python">{escaped_code}</code></pre>
</div>
</div>
</div>
</div>
</div>''')

    # Return just the content fragment (no full HTML document)
    fragment_content = '\n'.join(html_content)

    # Write fragment to temp file for the main script to read. Explicit
    # encoding: the rendered markdown may contain non-ASCII characters.
    temp_path = f"temp_{name_without_ext}_fragment.html"
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.write(fragment_content)

    return name_without_ext, fragment_content
103+
104+
105+
if __name__ == "__main__":
    # json is only needed by the script entry point, so it is imported here.
    import json

    # Process all .py files in notebooks directory. glob returns paths in
    # arbitrary order; sorting keeps the generated JSON reproducible.
    notebook_data = {}
    for py_file in sorted(glob.glob('notebooks/*.py')):
        name, fragment = convert_to_html_fragment(py_file)
        notebook_data[name] = fragment
        print(f"Converted {py_file} to HTML fragment")

    # Write notebook data to a JSON file for the main script
    with open('notebook_fragments.json', 'w', encoding='utf-8') as f:
        json.dump(notebook_data, f)

0 commit comments

Comments
 (0)