Skip to content

Commit 2b861be

Browse files
committed
Refactor jupyter notebook cleaning script
1 parent c9e8b58 commit 2b861be

File tree

2 files changed

+38
-27
lines changed

2 files changed

+38
-27
lines changed

notebooks/.gitattributes

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1 @@
1-
*.ipynb filter=strip-notebook-output
2-
# keep the output of the following notebooks when committing
3-
**/SlurmGPU.ipynb !filter=strip-notebook-output
4-
**/Requested and Used VRAM.ipynb !filter=strip-notebook-output
1+
*.ipynb filter=strip-notebook-output

scripts/clean_notebook.sh

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,47 @@ tmpnotebook=$(mktemp)
1616
# Ensure temp files are cleaned up on exit or error
1717
trap 'rm -f "$tmpjson" "$tmpnotebook"' EXIT
1818

19-
# Check if the notebook is in the exclude list
19+
# Always strip metadata. Only strip outputs if notebook NOT in exclude list.
20+
21+
# Check if input file is empty
22+
if [ ! -s "$fname" ]; then
23+
echo "Error: Input file $fname is empty" >&2
24+
exit 1
25+
fi
26+
27+
# Normalize the notebook JSON with jq: reset each cell's execution_count to null (when present) and assign sequential cell IDs (array-safe)
28+
if ! cat "$fname" | \
29+
jq 'if .cells then .cells |= [ range(0; length) as $i | .[$i] as $cell | ($cell | (if has("execution_count") then .execution_count = null else . end) + {id: ($i|tostring)}) ] else . end' > "$tmpjson"; then
30+
echo "Error: jq failed for $fname" >&2
31+
exit 2
32+
fi
33+
34+
# Validate JSON
35+
if [ ! -s "$tmpjson" ] || ! jq empty "$tmpjson" 2>/dev/null; then
36+
echo "Error: $tmpjson is not valid JSON after jq for $fname" >&2
37+
exit 3
38+
fi
39+
40+
BASE_NBCONVERT_ARGS=(--to=notebook --stdin --stdout --log-level=ERROR)
41+
2042
if grep -Fxq "$(basename "$fname")" "$EXCLUDE_FILE" 2>/dev/null; then
21-
cat "$fname"
22-
else
23-
# Check if input file is empty
24-
if [ ! -s "$fname" ]; then
25-
echo "Error: Input file $fname is empty" >&2
26-
exit 1
27-
fi
28-
# Convert notebook to JSON and assign sequential cell IDs with jq (array-safe)
29-
if ! cat "$fname" | \
30-
jq 'if .cells then .cells |= [ range(0; length) as $i | .[$i] as $cell | ($cell + {id: ($i|tostring)}) ] else . end' > "$tmpjson"; then
31-
echo "Error: jq failed for $fname" >&2
32-
exit 2
33-
fi
34-
# Check if tmpjson is valid and not empty
35-
if [ ! -s "$tmpjson" ] || ! jq empty "$tmpjson" 2>/dev/null; then
36-
echo "Error: $tmpjson is not valid JSON after jq for $fname" >&2
37-
exit 3
43+
# Excluded: keep outputs, strip metadata only
44+
if ! jupyter nbconvert "${BASE_NBCONVERT_ARGS[@]}" < "$tmpjson" | \
45+
jupyter nbconvert --ClearMetadataPreprocessor.enabled=True \
46+
--TagRemovePreprocessor.enabled=True --ClearMetadataPreprocessor.preserve_cell_metadata_mask='[("tags")]' \
47+
"${BASE_NBCONVERT_ARGS[@]}" > "$tmpnotebook"; then
48+
echo "Error: nbconvert (metadata-only) failed for $fname" >&2
49+
exit 4
3850
fi
39-
# Convert JSON back to notebook and strip outputs/metadata
40-
if ! jupyter nbconvert --to=notebook --stdin --stdout --log-level=ERROR < "$tmpjson" | \
51+
else
52+
# Not excluded: strip outputs and metadata
53+
if ! jupyter nbconvert "${BASE_NBCONVERT_ARGS[@]}" < "$tmpjson" | \
4154
jupyter nbconvert --ClearOutputPreprocessor.enabled=True --ClearMetadataPreprocessor.enabled=True \
4255
--TagRemovePreprocessor.enabled=True --ClearMetadataPreprocessor.preserve_cell_metadata_mask='[("tags")]' \
43-
--to=notebook --stdin --stdout --log-level=ERROR > "$tmpnotebook"; then
44-
echo "Error: nbconvert failed for $fname" >&2
56+
"${BASE_NBCONVERT_ARGS[@]}" > "$tmpnotebook"; then
57+
echo "Error: nbconvert (outputs+metadata) failed for $fname" >&2
4558
exit 4
4659
fi
47-
cat "$tmpnotebook"
4860
fi
61+
62+
cat "$tmpnotebook"

0 commit comments

Comments
 (0)