|
1 | 1 | package sanitize |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "strings" |
4 | 5 | "sync" |
| 6 | + "unicode" |
5 | 7 |
|
6 | 8 | "github.com/microcosm-cc/bluemonday" |
7 | 9 | ) |
@@ -40,6 +42,109 @@ func FilterHTMLTags(input string) string { |
40 | 42 | return getPolicy().Sanitize(input) |
41 | 43 | } |
42 | 44 |
|
| 45 | +// FilterCodeFenceMetadata removes hidden or suspicious info strings from fenced code blocks. |
| 46 | +func FilterCodeFenceMetadata(input string) string { |
| 47 | + if input == "" { |
| 48 | + return input |
| 49 | + } |
| 50 | + |
| 51 | + lines := strings.Split(input, "\n") |
| 52 | + insideFence := false |
| 53 | + currentFenceLen := 0 |
| 54 | + for i, line := range lines { |
| 55 | + sanitized, toggled, fenceLen := sanitizeCodeFenceLine(line, insideFence, currentFenceLen) |
| 56 | + lines[i] = sanitized |
| 57 | + if toggled { |
| 58 | + insideFence = !insideFence |
| 59 | + if insideFence { |
| 60 | + currentFenceLen = fenceLen |
| 61 | + } else { |
| 62 | + currentFenceLen = 0 |
| 63 | + } |
| 64 | + } |
| 65 | + } |
| 66 | + return strings.Join(lines, "\n") |
| 67 | +} |
| 68 | + |
| 69 | +const maxCodeFenceInfoLength = 48 |
| 70 | + |
| 71 | +func sanitizeCodeFenceLine(line string, insideFence bool, expectedFenceLen int) (string, bool, int) { |
| 72 | + idx := strings.Index(line, "```") |
| 73 | + if idx == -1 { |
| 74 | + return line, false, expectedFenceLen |
| 75 | + } |
| 76 | + |
| 77 | + if hasNonWhitespace(line[:idx]) { |
| 78 | + return line, false, expectedFenceLen |
| 79 | + } |
| 80 | + |
| 81 | + fenceEnd := idx |
| 82 | + for fenceEnd < len(line) && line[fenceEnd] == '`' { |
| 83 | + fenceEnd++ |
| 84 | + } |
| 85 | + |
| 86 | + fenceLen := fenceEnd - idx |
| 87 | + if fenceLen < 3 { |
| 88 | + return line, false, expectedFenceLen |
| 89 | + } |
| 90 | + |
| 91 | + rest := line[fenceEnd:] |
| 92 | + |
| 93 | + if insideFence { |
| 94 | + if expectedFenceLen != 0 && fenceLen != expectedFenceLen { |
| 95 | + return line, false, expectedFenceLen |
| 96 | + } |
| 97 | + return line[:fenceEnd], true, fenceLen |
| 98 | + } |
| 99 | + |
| 100 | + trimmed := strings.TrimSpace(rest) |
| 101 | + |
| 102 | + if trimmed == "" { |
| 103 | + return line[:fenceEnd], true, fenceLen |
| 104 | + } |
| 105 | + |
| 106 | + if strings.IndexFunc(trimmed, unicode.IsSpace) != -1 { |
| 107 | + return line[:fenceEnd], true, fenceLen |
| 108 | + } |
| 109 | + |
| 110 | + if len(trimmed) > maxCodeFenceInfoLength { |
| 111 | + return line[:fenceEnd], true, fenceLen |
| 112 | + } |
| 113 | + |
| 114 | + if !isSafeCodeFenceToken(trimmed) { |
| 115 | + return line[:fenceEnd], true, fenceLen |
| 116 | + } |
| 117 | + |
| 118 | + if len(rest) > 0 && unicode.IsSpace(rune(rest[0])) { |
| 119 | + return line[:fenceEnd] + " " + trimmed, true, fenceLen |
| 120 | + } |
| 121 | + |
| 122 | + return line[:fenceEnd] + trimmed, true, fenceLen |
| 123 | +} |
| 124 | + |
// hasNonWhitespace reports whether segment contains at least one rune that
// unicode.IsSpace does not classify as whitespace. An empty segment yields
// false.
func hasNonWhitespace(segment string) bool {
	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }
	return strings.IndexFunc(segment, notSpace) >= 0
}
| 133 | + |
// isSafeCodeFenceToken reports whether every rune in token is a Unicode
// letter, a Unicode digit, or one of '+', '-', '_', '#', '.'. An empty token
// is considered safe.
func isSafeCodeFenceToken(token string) bool {
	for _, ch := range token {
		switch {
		case unicode.IsLetter(ch), unicode.IsDigit(ch):
			// alphanumeric: allowed
		case ch == '+', ch == '-', ch == '_', ch == '#', ch == '.':
			// punctuation explicitly allowed in the token
		default:
			return false
		}
	}
	return true
}
| 147 | + |
43 | 148 | func getPolicy() *bluemonday.Policy { |
44 | 149 | policyOnce.Do(func() { |
45 | 150 | p := bluemonday.StrictPolicy() |
|
0 commit comments