|
| 1 | +// Copyright (c) 2021, Daniel Martí <[email protected]> |
| 2 | +// See LICENSE for licensing information |
| 3 | + |
| 4 | +package syntax |
| 5 | + |
| 6 | +import ( |
| 7 | + "fmt" |
| 8 | + "strings" |
| 9 | + "unicode" |
| 10 | + "unicode/utf8" |
| 11 | +) |
| 12 | + |
// QuoteError is the error reported by Quote when an input string
// cannot be quoted for the requested language variant.
type QuoteError struct {
	// ByteOffset is the position, in bytes from the start of the input,
	// of the character that could not be quoted.
	ByteOffset int
	// Message explains why that character could not be quoted.
	Message string
}

// Error implements the error interface, reporting both the byte offset
// of the offending character and the reason it was rejected.
func (e QuoteError) Error() string {
	offset, reason := e.ByteOffset, e.Message
	return fmt.Sprintf("cannot quote character at byte %d: %s", offset, reason)
}
| 21 | + |
// Messages stored in QuoteError.Message by Quote,
// one per reason for which an input can be rejected.
const (
	// The input contains a null byte.
	quoteErrNull = "shell strings cannot contain null bytes"
	// The input has invalid UTF-8 or a non-printable rune
	// and the target variant is POSIX.
	quoteErrPOSIX = "POSIX shell lacks escape sequences"
	// A decoded rune is above utf8.MaxRune.
	quoteErrRange = "rune out of range"
	// A rune needing an escape is above 0xFFFD and the target variant is mksh.
	quoteErrMksh = "mksh cannot escape codepoints above 16 bits"
)
| 28 | + |
| 29 | +// Quote returns a quoted version of the input string, |
| 30 | +// so that the quoted version is expanded or interpreted |
| 31 | +// as the original string in the given language variant. |
| 32 | +// |
| 33 | +// Quoting is necessary when using arbitrary literal strings |
| 34 | +// as words in a shell script or command. |
| 35 | +// Without quoting, one can run into syntax errors, |
| 36 | +// as well as the possibility of running unintended code. |
| 37 | +// |
| 38 | +// An error is returned when a string cannot be quoted for a variant. |
| 39 | +// For instance, POSIX lacks escape sequences for non-printable characters, |
| 40 | +// and no language variant can represent a string containing null bytes. |
| 41 | +// In such cases, the returned error type will be *QuoteError. |
| 42 | +// |
| 43 | +// The quoting strategy is chosen on a best-effort basis, |
| 44 | +// to minimize the amount of extra bytes necessary. |
| 45 | +// |
| 46 | +// Some strings do not require any quoting and are returned unchanged. |
| 47 | +// Those strings can be directly surrounded in single quotes as well. |
| 48 | +// |
| 49 | +//nolint:gocyclo // ignore "cyclomatic complexity 35 of func `Quote` is high (> 16) (gocyclo)" |
| 50 | +func Quote(s string, lang LangVariant) (string, error) { |
| 51 | + if s == "" { |
| 52 | + // Special case; an empty string must always be quoted, |
| 53 | + // as otherwise it expands to zero fields. |
| 54 | + return "''", nil |
| 55 | + } |
| 56 | + shellChars := false |
| 57 | + nonPrintable := false |
| 58 | + offs := 0 |
| 59 | + for rem := s; len(rem) > 0; { |
| 60 | + r, size := utf8.DecodeRuneInString(rem) |
| 61 | + switch r { |
| 62 | + // Like regOps; token characters. |
| 63 | + case ';', '"', '\'', '(', ')', '$', '|', '&', '>', '<', '`', |
| 64 | + // Whitespace; might result in multiple fields. |
| 65 | + ' ', '\t', '\r', '\n', |
| 66 | + // Escape sequences would be expanded. |
| 67 | + '\\', |
| 68 | + // Would start a comment unless quoted. |
| 69 | + '#', |
| 70 | + // Might result in brace expansion. |
| 71 | + '{', |
| 72 | + // Might result in tilde expansion. |
| 73 | + '~', |
| 74 | + // Might result in globbing. |
| 75 | + '*', '?', '[', |
| 76 | + // Might result in an assignment. |
| 77 | + '=': |
| 78 | + shellChars = true |
| 79 | + case '\x00': |
| 80 | + return "", &QuoteError{ByteOffset: offs, Message: quoteErrNull} |
| 81 | + } |
| 82 | + if r == utf8.RuneError || !unicode.IsPrint(r) { |
| 83 | + if lang == LangPOSIX { |
| 84 | + return "", &QuoteError{ByteOffset: offs, Message: quoteErrPOSIX} |
| 85 | + } |
| 86 | + nonPrintable = true |
| 87 | + } |
| 88 | + rem = rem[size:] |
| 89 | + offs += size |
| 90 | + } |
| 91 | + if !shellChars && !nonPrintable && !IsKeyword(s) { |
| 92 | + // Nothing to quote; avoid allocating. |
| 93 | + return s, nil |
| 94 | + } |
| 95 | + |
| 96 | + // Single quotes are usually best, |
| 97 | + // as they don't require any escaping of characters. |
| 98 | + // If we have any invalid utf8 or non-printable runes, |
| 99 | + // use $'' so that we can escape them. |
| 100 | + // Note that we can't use double quotes for those. |
| 101 | + var b strings.Builder |
| 102 | + if nonPrintable { |
| 103 | + b.WriteString("$'") |
| 104 | + lastRequoteIfHex := false |
| 105 | + offs = 0 |
| 106 | + for rem := s; len(rem) > 0; { |
| 107 | + nextRequoteIfHex := false |
| 108 | + r, size := utf8.DecodeRuneInString(rem) |
| 109 | + switch { |
| 110 | + case r == '\'', r == '\\': |
| 111 | + b.WriteByte('\\') |
| 112 | + b.WriteRune(r) |
| 113 | + case unicode.IsPrint(r) && r != utf8.RuneError: |
| 114 | + if lastRequoteIfHex && isHex(r) { |
| 115 | + b.WriteString("'$'") |
| 116 | + } |
| 117 | + b.WriteRune(r) |
| 118 | + case r == '\a': |
| 119 | + b.WriteString(`\a`) |
| 120 | + case r == '\b': |
| 121 | + b.WriteString(`\b`) |
| 122 | + case r == '\f': |
| 123 | + b.WriteString(`\f`) |
| 124 | + case r == '\n': |
| 125 | + b.WriteString(`\n`) |
| 126 | + case r == '\r': |
| 127 | + b.WriteString(`\r`) |
| 128 | + case r == '\t': |
| 129 | + b.WriteString(`\t`) |
| 130 | + case r == '\v': |
| 131 | + b.WriteString(`\v`) |
| 132 | + case r < utf8.RuneSelf, r == utf8.RuneError && size == 1: |
| 133 | + // \xXX, fixed at two hexadecimal characters. |
| 134 | + fmt.Fprintf(&b, "\\x%02x", rem[0]) |
| 135 | + // Unfortunately, mksh allows \x to consume more hex characters. |
| 136 | + // Ensure that we don't allow it to read more than two. |
| 137 | + if lang == LangMirBSDKorn { |
| 138 | + nextRequoteIfHex = true |
| 139 | + } |
| 140 | + case r > utf8.MaxRune: |
| 141 | + // Not a valid Unicode code point? |
| 142 | + return "", &QuoteError{ByteOffset: offs, Message: quoteErrRange} |
| 143 | + case lang == LangMirBSDKorn && r > 0xFFFD: |
| 144 | + // From the CAVEATS section in R59's man page: |
| 145 | + // |
| 146 | + // mksh currently uses OPTU-16 internally, which is the same as |
| 147 | + // UTF-8 and CESU-8 with 0000..FFFD being valid codepoints. |
| 148 | + return "", &QuoteError{ByteOffset: offs, Message: quoteErrMksh} |
| 149 | + case r < 0x10000: |
| 150 | + // \uXXXX, fixed at four hexadecimal characters. |
| 151 | + fmt.Fprintf(&b, "\\u%04x", r) |
| 152 | + default: |
| 153 | + // \UXXXXXXXX, fixed at eight hexadecimal characters. |
| 154 | + fmt.Fprintf(&b, "\\U%08x", r) |
| 155 | + } |
| 156 | + rem = rem[size:] |
| 157 | + lastRequoteIfHex = nextRequoteIfHex |
| 158 | + offs += size |
| 159 | + } |
| 160 | + b.WriteString("'") |
| 161 | + return b.String(), nil |
| 162 | + } |
| 163 | + |
| 164 | + // Single quotes without any need for escaping. |
| 165 | + if !strings.Contains(s, "'") { |
| 166 | + return "'" + s + "'", nil |
| 167 | + } |
| 168 | + |
| 169 | + // The string contains single quotes, |
| 170 | + // so fall back to double quotes. |
| 171 | + b.WriteByte('"') |
| 172 | + for _, r := range s { |
| 173 | + switch r { |
| 174 | + case '"', '\\', '`', '$': |
| 175 | + b.WriteByte('\\') |
| 176 | + } |
| 177 | + b.WriteRune(r) |
| 178 | + } |
| 179 | + b.WriteByte('"') |
| 180 | + return b.String(), nil |
| 181 | +} |
| 182 | + |
// isHex reports whether r is an ASCII hexadecimal digit,
// in either lower or upper case.
func isHex(r rune) bool {
	switch {
	case '0' <= r && r <= '9':
		return true
	case 'a' <= r && r <= 'f':
		return true
	case 'A' <= r && r <= 'F':
		return true
	}
	return false
}
0 commit comments