Skip to content

Commit 9607ad7

Browse files
authored
Merge pull request #197 from iraf-community/issue196
scanf.py: allow empty strings for %s format option
2 parents d0cfdd9 + f7de070 commit 9607ad7

File tree

3 files changed

+336
-120
lines changed

3 files changed

+336
-120
lines changed

pyraf/iraffunctions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,11 +2003,12 @@ def fscanf(theLocals, line, format, *namelist, **kw):
20032003
_nscan = 0
20042004
return EOF
20052005
f = sscanf.scanf(format, line)
2006+
n = min(len(f), len(namelist))
20062007
# if list is null, add a null string
20072008
# ugly but should be right most of the time
2008-
if f is None and namelist:
2009-
f = ('')
2010-
n = min(len(f), len(namelist))
2009+
if n == 0 and namelist:
2010+
f = ['']
2011+
n = 1
20112012
if len(kw):
20122013
raise TypeError('unexpected keyword argument: ' +
20132014
repr(list(kw.keys())))

pyraf/scanf.py

Lines changed: 123 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,29 @@
1-
"""
2-
Small scanf implementation.
1+
"""PyRAF scanf module
2+
3+
Implements a subset of C-style scanf functionality in Python using regular expressions.
34
4-
Python has powerful regular expressions but sometimes they are totally overkill
5-
when you just want to parse a simple-formatted string.
6-
C programmers use the scanf-function for these tasks (see link below).
5+
Features:
76
8-
This implementation of scanf translates the simple scanf-format into
9-
regular expressions. Unlike C you can be sure that there are no buffer overflows
10-
possible.
7+
- Supports %d, %f, %g, %e, %x, %o, %s, %c, %% and %[...] scan sets
8+
- Handles optional field widths and suppressed assignments (%*d, etc.)
9+
- Returns parsed values as Python types (int, float, str)
10+
- Partial matches are allowed; parsing stops at first mismatch
11+
- Leading whitespace is consumed for most numeric and string conversions
1112
12-
For more information see
13-
* http://www.python.org/doc/current/lib/node49.html
14-
* http://en.wikipedia.org/wiki/Scanf
13+
Differences from IRAF scanf:
1514
16-
Original code from:
17-
https://github.com/joshburnett/scanf (version 1.5.2)
15+
- Sexagesimal numbers (hh:mm:ss) are not supported
16+
- no INDEF handling
1817
19-
Modified for the needs of PyRAF:
20-
* all fields may have a max width (not a fixed width)
21-
* add "l" (for [outdated] long ints)
22-
* allow "0x" and "0o" prefixes in ints for hexa/octal numbers
18+
Differences from original PyRAF implementation (on purpose, to
19+
establish more conformity to IRAF scanf):
2320
24-
Differences to the original PyRAF sscanf module:
25-
* "n" coversion missing (number of characters so far)
21+
- "0x" prefix is not allowed when parsing integer values
22+
- leading zero does not indicate octal when parsing integer values
23+
- %i, %l, %u, %E, %X are not implemented
2624
25+
Original code was taken from https://github.com/joshburnett/scanf
26+
(version 1.5.2) and heavily modified.
2727
2828
"""
2929
import re
@@ -33,140 +33,158 @@
3333
except ImportError:
3434
from backports.functools_lru_cache import lru_cache
3535

36-
__version__ = '1.5.2'
3736

3837
__all__ = ["scanf", 'scanf_translate', 'scanf_compile']
3938

4039

41-
DEBUG = False
42-
43-
# As you can probably see it is relatively easy to add more format types.
44-
# Make sure you add a second entry for each new item that adds the extra
45-
# few characters needed to handle the field ommision.
40+
# Each tuple is: (format_regex, regex_pattern, cast_function)
41+
#
42+
# - format_regex: regex to identify the format specifier in the format string.
43+
# The first group item is always for the optional "*" to suppress conversion.
44+
# All other groups are used to replace the placeholders in the second pattern.
45+
#
46+
# - regex_pattern: regex to match the corresponding input field.
47+
# As replacement placeholder, %s is used.
48+
#
49+
# - cast_function: Python callable to convert matched string to int/float/str
50+
# Setting to None indicates that the match is ignored for the result, otherwise
51+
it is called with the first group of the match from the regex_pattern.
52+
#
4653
scanf_translate = [
4754
(re.compile(_token), _pattern, _cast) for _token, _pattern, _cast in [
48-
(r"%c", r"(.)", lambda x:x),
49-
(r"%\*c", r"(?:.)", None),
50-
51-
(r"%(\d+)c", r"(.{0,%s})", lambda x:x),
52-
(r"%\*(\d+)c", r"(?:.{0,%s})", None),
55+
# %c - Fixed width character string
56+
(r"%(\*)?c", r"(.)", lambda x:x),
57+
(r"%(\*)?(\d+)c", r"(.{1,%s})", lambda x:x),
5358

54-
(r"%s", r"(\S+)", lambda x: x),
55-
(r"%\*s", r"(?:\S+)", None),
59+
# %s - String of non-whitespace characters
60+
(r"%(\*)?s", r"\s*(\S+)", lambda x: x),
61+
(r"%(\*)?(\d+)s", r"\s*(\S{1,%s})", lambda x:x),
5662

57-
(r"%(\d+)s", r"(\S{1,%s})", lambda x:x),
58-
(r"%\*(\d+)s", r"(?:\S{1,%s})", None),
63+
# %[] - Character scan set
64+
(r"%(\*)?\[([^\]]+)\]", r"\s*([%s]+)", lambda x:x),
5965

60-
(r"%\[([^\]]+)\]", r"([%s]+)", lambda x:x),
61-
(r"%\*\[([^\]]+)\]", r"(?:[%s]+)", None),
66+
# %d - Signed integer
67+
(r"%(\*)?l?d", r"\s*([+-]?\d+)", int),
68+
(r"%(\*)?(\d+)l?d", r"\s*([+-]?\d{1,%s})", int),
6269

63-
(r"%l?d", r"([+-]?\d+)", int),
64-
(r"%\*l?d", r"(?:[+-]?\d+)", None),
70+
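# %f, %g, %e - Float (NOTE(review): the width form's pattern has no %s placeholder, so e.g. "%5f" raises TypeError in scanf_compile)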
71+
(r"%(\*)?[fge]", r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
72+
(r"%(\*)?(\d+)[fge]", r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
6573

66-
(r"%(\d+)l?d", r"([+-]?\d{1,%s})", int),
67-
(r"%\*(\d+)l?d", r"(?:[+-]?\d{1,%s})", None),
74+
# %x - Hexadecimal integer.
75+
(r"%(\*)?l?x", r"\s*([-+]?[\da-fA-F]+)", lambda x: int(x, 16)),
76+
(r"%(\*)?(\d+)l?x", r"\s*([-+]?[\dA-Fa-f]{1,%s})", lambda x: int(x, 16)),
6877

69-
(r"%[fge]", r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
70-
(r"%\*[fge]", r"(?:[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", None),
78+
# %o - Octal integer.
79+
(r"%(\*)?l?o", r"\s*([-+]?[0-7]+)", lambda x:int(x, 8)),
80+
(r"%(\*)?(\d+)l?o", r"\s*([-+]?[0-7]{1,%s})", lambda x: int(x, 8)),
7181

72-
(r"%(\d+)[fge]", r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
73-
(r"%\*(\d+)[fge]", r"(?:[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", None),
82+
# %% - single percent sign
83+
(r"%%", r"%", None),
7484

75-
(r"%l?x", r"([\dA-Za-f]+)", lambda x: int(x, 16)),
76-
(r"%\*l?x", r"[\dA-Za-f]+)", None),
85+
# white spaces
86+
(r"\s+", r"\s*", None),
7787

78-
(r"%(\d+)l?x", r"([\dA-Za-f]{1,%s})", lambda x: int(x, 16)),
79-
(r"%\*(\d+)l?x", r"[\dA-Za-f]{1,%s})", None),
88+
# special chars in regexps
89+
(r"()([\\^$.|?*+()[\]{}-])", r"\%s", None),
8090

81-
(r"%l?o", r"([0-7]+)", lambda x:int(x, 8)),
82-
(r"%\*l?o", r"(?:[0-7]+)", None),
83-
84-
(r"%(\d+)l?o", r"([0-7]{1,%s})", lambda x: int(x, 8)),
85-
(r"%\*(\d+)l?o", r"(?:[0-7]{1,%s})", None),
91+
# All other non-whitespaces
92+
(r"()([^\s\\^$.|?*+()[\]{}%-]+)", r"%s", None),
8693
]]
8794

8895

89-
# Cache formats
90-
SCANF_CACHE_SIZE = 1000
91-
92-
93-
@lru_cache(maxsize=SCANF_CACHE_SIZE)
94-
def scanf_compile(format, collapseWhitespace=True):
96+
@lru_cache(maxsize=1000)
97+
def scanf_compile(format):
9598
"""
9699
Translate the format into a list of (compiled regular expression, cast) pairs
97100
98101
For example:
99102
100-
>>> format_re, casts = scanf_compile('%s - %d errors, %d warnings')
101-
>>> print format_re.pattern
102-
(\\S+) \\- ([+-]?\\d+) errors, ([+-]?\\d+) warnings
103+
>>> re_list = scanf_compile('%s - %d errors, %d warnings')
104+
>>> for pattern, cast in re_list:
105+
... print(pattern, cast)
106+
re.compile('\\s*(\\S+)') <function <lambda> at 0x7f5da8f1b060>
107+
re.compile('\\s*\\-\\s*') None
108+
re.compile('\\s*([+-]?\\d+)') <class 'int'>
109+
re.compile('\\s*errors,\\s*') None
110+
re.compile('\\s*([+-]?\\d+)') <class 'int'>
111+
re.compile('\\s*warnings') None
103112
104113
Translated formats are cached for faster reuse
105114
"""
106-
107-
format_pat = ""
108-
cast_list = []
115+
# Iterate over the format string, identifying literal text and conversion specifiers
116+
# For each conversion:
117+
# - Determine width, suppression, and conversion type
118+
# - Translate to regex pattern with proper field width
119+
# - Append regex and cast function to compiled pattern list
120+
pat_list = []
109121
i = 0
110122
length = len(format)
111123
while i < length:
112124
found = None
113125
for token, pattern, cast in scanf_translate:
114126
found = token.match(format, i)
115127
if found:
116-
if cast: # cast != None
117-
cast_list.append(cast)
118-
groups = found.groupdict() or found.groups()
119-
if groups:
120-
pattern = pattern % groups
121-
format_pat += pattern
128+
groups = found.groups()
129+
130+
# Add optional argument (length) to pattern
131+
if len(groups) > 1:
132+
pattern = pattern % groups[1:]
133+
134+
# If the assignment is suppressed (indicated by *), the cast function
135+
# is set to None. This allows regex matching without capturing a value.
136+
if len(groups) > 0 and groups[0] == "*":
137+
cast = None
138+
139+
if cast is None and len(pat_list) > 0 and pat_list[-1][1] is None:
140+
# Combine all subsequent non-consuming patterns into one
141+
pat_list[-1][0] += pattern
142+
else:
143+
pat_list.append([pattern, cast])
122144
i = found.end()
123145
break
124-
if not found:
125-
char = format[i]
126-
# escape special characters
127-
if char in "|^$()[]-.+*?{}<>\\":
128-
format_pat += "\\"
129-
format_pat += char
130-
i += 1
131-
if DEBUG:
132-
print("DEBUG: %r -> %s" % (format, format_pat))
133-
if collapseWhitespace:
134-
format_pat = re.sub(r'\s+', r'\\s+', format_pat)
135-
136-
format_re = re.compile(format_pat)
137-
return format_re, cast_list
138-
139-
140-
def scanf(format, s=None, collapseWhitespace=True):
146+
else:
147+
raise ValueError(f"Unknown char '{format[i]}' in pos {i} of format string \"{format}\"")
148+
149+
# Return compiled list of all patterns
150+
return [(re.compile(pattern), cast) for pattern, cast in pat_list]
151+
152+
153+
def scanf(format, s):
141154
"""Conversion specification are of the form:
142155
143156
%[*][<max_width>]['l']<type_character>.
144157
145158
The following format conversions are supported:
146159
147160
%c Fixed width character string.
148-
%s String of non-whitespace characters with leading
149-
whitespace skipped.
161+
%s String of non-whitespace characters
150162
%d Signed integer
151163
%o Octal integer.
152164
%x Hexadecimal integer.
153165
%f, %g, %e Float
154166
%[] Character scan set
155167
156-
scanf.scanf returns a tuple of found values or None if the format
157-
does not match.
168+
scanf returns a list of the converted values. Parsing stops at the
169+
first mismatch, so the list may be shorter than expected or empty.
158170
159171
"""
160-
161-
if s is None:
162-
s = sys.stdin
163-
164-
if hasattr(s, "readline"):
165-
s = s.readline()
166-
167-
format_re, casts = scanf_compile(format, collapseWhitespace)
168-
169-
found = format_re.search(s)
170-
if found:
171-
groups = found.groups()
172-
return tuple([casts[i](groups[i]) for i in range(len(groups))])
172+
# Start scanning input at index 0
173+
# For each compiled pattern:
174+
# 1. Apply regex match at current position
175+
# 2. If no match: break loop and return parsed values so far
176+
# 3. If match:
177+
# - Apply cast function (unless suppressed)
178+
# - Advance current index to end of matched substring
179+
i = 0
180+
res = []
181+
for format_re, cast in scanf_compile(format):
182+
found = format_re.match(s, i)
183+
if found:
184+
if cast is not None:
185+
res.append(cast(found.groups()[0]))
186+
i = found.end()
187+
else:
188+
break
189+
190+
return res

0 commit comments

Comments
 (0)