|
1 | | -""" |
2 | | -Small scanf implementation. |
| 1 | +"""PyRAF scanf module |
| 2 | +
|
| 3 | +Implements a subset of C-style scanf functionality in Python using regular expressions. |
3 | 4 |
|
4 | | -Python has powerful regular expressions but sometimes they are totally overkill |
5 | | -when you just want to parse a simple-formatted string. |
6 | | -C programmers use the scanf-function for these tasks (see link below). |
| 5 | +Features: |
7 | 6 |
|
8 | | -This implementation of scanf translates the simple scanf-format into |
9 | | -regular expressions. Unlike C you can be sure that there are no buffer overflows |
10 | | -possible. |
| 7 | + - Supports %d, %f, %g, %e, %x, %o, %s, %c, %% and %[...] scan sets |
| 8 | + - Handles optional field widths and suppressed assignments (%*d, etc.) |
| 9 | + - Returns parsed values as Python types (int, float, str) |
| 10 | + - Partial matches are allowed; parsing stops at first mismatch |
| 11 | + - Leading whitespace is consumed for most numeric and string conversions |
11 | 12 |
|
12 | | -For more information see |
13 | | - * http://www.python.org/doc/current/lib/node49.html |
14 | | - * http://en.wikipedia.org/wiki/Scanf |
| 13 | +Differences from IRAF scanf: |
15 | 14 |
|
16 | | -Original code from: |
17 | | - https://github.com/joshburnett/scanf (version 1.5.2) |
| 15 | + - Sexagesimal numbers (hh:mm:ss) are not supported |
| 16 | + - no INDEF handling |
18 | 17 |
|
19 | | -Modified for the needs of PyRAF: |
20 | | - * all fields may have a max width (not a fixed width) |
21 | | - * add "l" (for [outdated] long ints) |
22 | | - * allow "0x" and "0o" prefixes in ints for hexa/octal numbers |
| 18 | +Differences from original PyRAF implementation (on purpose, to |
| 19 | +establish more conformity to IRAF scanf): |
23 | 20 |
|
24 | | -Differences to the original PyRAF sscanf module: |
25 | | - * "n" coversion missing (number of characters so far) |
| 21 | + - "0x" prefix is not allowed when parsing integer values |
| 22 | + - leading zero does not indicate octal when parsing integer values |
| 23 | + - %i, %l, %u, %E, %X are not implemented |
26 | 24 |
|
| 25 | +Original code was taken from https://github.com/joshburnett/scanf |
| 26 | +(version 1.5.2) and heavily modified. |
27 | 27 |
|
28 | 28 | """ |
29 | 29 | import re |
|
33 | 33 | except ImportError: |
34 | 34 | from backports.functools_lru_cache import lru_cache |
35 | 35 |
|
36 | | -__version__ = '1.5.2' |
37 | 36 |
|
38 | 37 | __all__ = ["scanf", 'scanf_translate', 'scanf_compile'] |
39 | 38 |
|
40 | 39 |
|
41 | | -DEBUG = False |
42 | | - |
43 | | -# As you can probably see it is relatively easy to add more format types. |
44 | | -# Make sure you add a second entry for each new item that adds the extra |
45 | | -# few characters needed to handle the field ommision. |
| 40 | +# Each tuple is: (format_regex, regex_pattern, cast_function) |
| 41 | +# |
| 42 | +# - format_regex: regex to identify the format specifier in the format string. |
| 43 | + # The first group, when present, captures the optional "*" that suppresses assignment
| 44 | + # (literal-text entries use an empty "()" there). All other groups replace the placeholders in regex_pattern.
| 45 | +# |
| 46 | +# - regex_pattern: regex to match the corresponding input field. |
| 47 | +# As replacement placeholder, %s is used. |
| 48 | +# |
| 49 | +# - cast_function: Python callable to convert matched string to int/float/str |
| 50 | + # Setting to None indicates that the match is ignored for the result; otherwise
| 51 | + # it is called with the first group of the match from the regex_pattern.
| 52 | +# |
46 | 53 | scanf_translate = [ |
47 | 54 | (re.compile(_token), _pattern, _cast) for _token, _pattern, _cast in [ |
48 | | - (r"%c", r"(.)", lambda x:x), |
49 | | - (r"%\*c", r"(?:.)", None), |
50 | | - |
51 | | - (r"%(\d+)c", r"(.{0,%s})", lambda x:x), |
52 | | - (r"%\*(\d+)c", r"(?:.{0,%s})", None), |
| 55 | + # %c - Fixed width character string |
| 56 | + (r"%(\*)?c", r"(.)", lambda x:x), |
| 57 | + (r"%(\*)?(\d+)c", r"(.{1,%s})", lambda x:x), |
53 | 58 |
|
54 | | - (r"%s", r"(\S+)", lambda x: x), |
55 | | - (r"%\*s", r"(?:\S+)", None), |
| 59 | + # %s - String of non-whitespace characters |
| 60 | + (r"%(\*)?s", r"\s*(\S+)", lambda x: x), |
| 61 | + (r"%(\*)?(\d+)s", r"\s*(\S{1,%s})", lambda x:x), |
56 | 62 |
|
57 | | - (r"%(\d+)s", r"(\S{1,%s})", lambda x:x), |
58 | | - (r"%\*(\d+)s", r"(?:\S{1,%s})", None), |
| 63 | + # %[] - Character scan set |
| 64 | + (r"%(\*)?\[([^\]]+)\]", r"\s*([%s]+)", lambda x:x), |
59 | 65 |
|
60 | | - (r"%\[([^\]]+)\]", r"([%s]+)", lambda x:x), |
61 | | - (r"%\*\[([^\]]+)\]", r"(?:[%s]+)", None), |
| 66 | + # %d - Signed integer |
| 67 | + (r"%(\*)?l?d", r"\s*([+-]?\d+)", int), |
| 68 | + (r"%(\*)?(\d+)l?d", r"\s*([+-]?\d{1,%s})", int), |
62 | 69 |
|
63 | | - (r"%l?d", r"([+-]?\d+)", int), |
64 | | - (r"%\*l?d", r"(?:[+-]?\d+)", None), |
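| 70 | + # %f, %g, %e - Float. NOTE(review): the width variant's pattern has no %s placeholder, so a format like "%5f" raises TypeError in scanf_compile.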
| 71 | + (r"%(\*)?[fge]", r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float), |
| 72 | + (r"%(\*)?(\d+)[fge]", r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float), |
65 | 73 |
|
66 | | - (r"%(\d+)l?d", r"([+-]?\d{1,%s})", int), |
67 | | - (r"%\*(\d+)l?d", r"(?:[+-]?\d{1,%s})", None), |
| 74 | + # %x - Hexadecimal integer. |
| 75 | + (r"%(\*)?l?x", r"\s*([-+]?[\da-fA-F]+)", lambda x: int(x, 16)), |
| 76 | + (r"%(\*)?(\d+)l?x", r"\s*([-+]?[\dA-Fa-f]{1,%s})", lambda x: int(x, 16)), |
68 | 77 |
|
69 | | - (r"%[fge]", r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float), |
70 | | - (r"%\*[fge]", r"(?:[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", None), |
| 78 | + # %o - Octal integer. |
| 79 | + (r"%(\*)?l?o", r"\s*([-+]?[0-7]+)", lambda x:int(x, 8)), |
| 80 | + (r"%(\*)?(\d+)l?o", r"\s*([-+]?[0-7]{1,%s})", lambda x: int(x, 8)), |
71 | 81 |
|
72 | | - (r"%(\d+)[fge]", r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float), |
73 | | - (r"%\*(\d+)[fge]", r"(?:[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", None), |
| 82 | + # %% - single percent sign |
| 83 | + (r"%%", r"%", None), |
74 | 84 |
|
75 | | - (r"%l?x", r"([\dA-Za-f]+)", lambda x: int(x, 16)), |
76 | | - (r"%\*l?x", r"[\dA-Za-f]+)", None), |
| 85 | + # white spaces |
| 86 | + (r"\s+", r"\s*", None), |
77 | 87 |
|
78 | | - (r"%(\d+)l?x", r"([\dA-Za-f]{1,%s})", lambda x: int(x, 16)), |
79 | | - (r"%\*(\d+)l?x", r"[\dA-Za-f]{1,%s})", None), |
| 88 | + # special chars in regexps |
| 89 | + (r"()([\\^$.|?*+()[\]{}-])", r"\%s", None), |
80 | 90 |
|
81 | | - (r"%l?o", r"([0-7]+)", lambda x:int(x, 8)), |
82 | | - (r"%\*l?o", r"(?:[0-7]+)", None), |
83 | | - |
84 | | - (r"%(\d+)l?o", r"([0-7]{1,%s})", lambda x: int(x, 8)), |
85 | | - (r"%\*(\d+)l?o", r"(?:[0-7]{1,%s})", None), |
| 91 | + # All other non-whitespaces |
| 92 | + (r"()([^\s\\^$.|?*+()[\]{}%-]+)", r"%s", None), |
86 | 93 | ]] |
87 | 94 |
|
88 | 95 |
|
89 | | -# Cache formats |
90 | | -SCANF_CACHE_SIZE = 1000 |
91 | | - |
92 | | - |
93 | | -@lru_cache(maxsize=SCANF_CACHE_SIZE) |
94 | | -def scanf_compile(format, collapseWhitespace=True): |
| 96 | +@lru_cache(maxsize=1000) |
| 97 | +def scanf_compile(format): |
95 | 98 | """ |
96 | 99 | Translate the format into a regular expression |
97 | 100 |
|
98 | 101 | For example: |
99 | 102 |
|
100 | | - >>> format_re, casts = scanf_compile('%s - %d errors, %d warnings') |
101 | | - >>> print format_re.pattern |
102 | | - (\\S+) \\- ([+-]?\\d+) errors, ([+-]?\\d+) warnings |
| 103 | + >>> re_list = scanf_compile('%s - %d errors, %d warnings') |
| 104 | + >>> for pattern, cast in re_list: |
| 105 | + ... print(pattern, cast) |
| 106 | + re.compile('\\s*(\\S+)') <function <lambda> at 0x7f5da8f1b060>
| 107 | + re.compile('\\s*\\-\\s*') None
| 108 | + re.compile('\\s*([+-]?\\d+)') <class 'int'>
| 109 | + re.compile('\\s*errors,\\s*') None
| 110 | + re.compile('\\s*([+-]?\\d+)') <class 'int'>
| 111 | + re.compile('\\s*warnings') None
103 | 112 |
|
104 | 113 | Translated formats are cached for faster reuse |
105 | 114 | """ |
106 | | - |
107 | | - format_pat = "" |
108 | | - cast_list = [] |
| 115 | + # Iterate over the format string, identifying literal text and conversion specifiers |
| 116 | + # For each conversion: |
| 117 | + # - Determine width, suppression, and conversion type |
| 118 | + # - Translate to regex pattern with proper field width |
| 119 | + # - Append regex and cast function to compiled pattern list |
| 120 | + pat_list = [] |
109 | 121 | i = 0 |
110 | 122 | length = len(format) |
111 | 123 | while i < length: |
112 | 124 | found = None |
113 | 125 | for token, pattern, cast in scanf_translate: |
114 | 126 | found = token.match(format, i) |
115 | 127 | if found: |
116 | | - if cast: # cast != None |
117 | | - cast_list.append(cast) |
118 | | - groups = found.groupdict() or found.groups() |
119 | | - if groups: |
120 | | - pattern = pattern % groups |
121 | | - format_pat += pattern |
| 128 | + groups = found.groups() |
| 129 | + |
| 130 | + # Add optional argument (length) to pattern |
| 131 | + if len(groups) > 1: |
| 132 | + pattern = pattern % groups[1:] |
| 133 | + |
| 134 | + # If the assignment is suppressed (indicated by *), the cast function |
| 135 | + # is set to None. This allows regex matching without capturing a value. |
| 136 | + if len(groups) > 0 and groups[0] == "*": |
| 137 | + cast = None |
| 138 | + |
| 139 | + if cast is None and len(pat_list) > 0 and pat_list[-1][1] is None: |
| 140 | + # Combine all subsequent non-consuming patterns into one |
| 141 | + pat_list[-1][0] += pattern |
| 142 | + else: |
| 143 | + pat_list.append([pattern, cast]) |
122 | 144 | i = found.end() |
123 | 145 | break |
124 | | - if not found: |
125 | | - char = format[i] |
126 | | - # escape special characters |
127 | | - if char in "|^$()[]-.+*?{}<>\\": |
128 | | - format_pat += "\\" |
129 | | - format_pat += char |
130 | | - i += 1 |
131 | | - if DEBUG: |
132 | | - print("DEBUG: %r -> %s" % (format, format_pat)) |
133 | | - if collapseWhitespace: |
134 | | - format_pat = re.sub(r'\s+', r'\\s+', format_pat) |
135 | | - |
136 | | - format_re = re.compile(format_pat) |
137 | | - return format_re, cast_list |
138 | | - |
139 | | - |
140 | | -def scanf(format, s=None, collapseWhitespace=True): |
| 146 | + else: |
| 147 | + raise ValueError(f"Unknown char '{format[i]}' in pos {i} of format string \"{format}\"") |
| 148 | + |
| 149 | + # Return compiled list of all patterns |
| 150 | + return [(re.compile(pattern), cast) for pattern, cast in pat_list] |
| 151 | + |
| 152 | + |
| 153 | +def scanf(format, s): |
141 | 154 | """Conversion specification are of the form: |
142 | 155 |
|
143 | 156 | %[*][<max_width>]['l']<type_character>. |
144 | 157 |
|
145 | 158 | The following format conversions are supported: |
146 | 159 |
|
147 | 160 | %c Fixed width character string. |
148 | | - %s String of non-whitespace characters with leading |
149 | | - whitespace skipped. |
| 161 | + %s String of non-whitespace characters |
150 | 162 | %d Signed integer |
151 | 163 | %o Octal integer. |
152 | 164 | %x Hexadecimal integer. |
153 | 165 | %f, %g, %e Float |
154 | 166 | %[] Character scan set |
155 | 167 |
|
156 | | - scanf.scanf returns a tuple of found values or None if the format |
157 | | - does not match. |
| 168 | + scanf returns a list of the converted values. Scanning stops at the first
| 169 | + mismatch, so the list may be shorter than the number of conversions (or empty).
158 | 170 |
|
159 | 171 | """ |
160 | | - |
161 | | - if s is None: |
162 | | - s = sys.stdin |
163 | | - |
164 | | - if hasattr(s, "readline"): |
165 | | - s = s.readline() |
166 | | - |
167 | | - format_re, casts = scanf_compile(format, collapseWhitespace) |
168 | | - |
169 | | - found = format_re.search(s) |
170 | | - if found: |
171 | | - groups = found.groups() |
172 | | - return tuple([casts[i](groups[i]) for i in range(len(groups))]) |
| 172 | + # Start scanning input at index 0 |
| 173 | + # For each compiled pattern: |
| 174 | + # 1. Apply regex match at current position |
| 175 | + # 2. If no match: break loop and return parsed values so far |
| 176 | + # 3. If match: |
| 177 | + # - Apply cast function (unless suppressed) |
| 178 | + # - Advance current index to end of matched substring |
| 179 | + i = 0 |
| 180 | + res = [] |
| 181 | + for format_re, cast in scanf_compile(format): |
| 182 | + found = format_re.match(s, i) |
| 183 | + if found: |
| 184 | + if cast is not None: |
| 185 | + res.append(cast(found.groups()[0])) |
| 186 | + i = found.end() |
| 187 | + else: |
| 188 | + break |
| 189 | + |
| 190 | + return res |
0 commit comments