Merge pull request #197 from iraf-community/issue196

olebole · web-flow · commit 9607ad7b8470 · 2026-01-26T19:30:37.000+01:00
scanf.py: allow empty strings for %s format option
diff --git a/pyraf/iraffunctions.py b/pyraf/iraffunctions.py
@@ -2003,11 +2003,12 @@ def fscanf(theLocals, line, format, *namelist, **kw):
         _nscan = 0
         return EOF
     f = sscanf.scanf(format, line)
+    n = min(len(f), len(namelist))
     # if list is null, add a null string
     # ugly but should be right most of the time
-    if f is None and namelist:
-        f = ('')
-    n = min(len(f), len(namelist))
+    if n == 0 and namelist:
+        f = ['']
+        n = 1
     if len(kw):
         raise TypeError('unexpected keyword argument: ' +
                         repr(list(kw.keys())))
diff --git a/pyraf/scanf.py b/pyraf/scanf.py
@@ -1,29 +1,29 @@
-"""
-Small scanf implementation.
+"""PyRAF scanf module
+
+Implements a subset of C-style scanf functionality in Python using regular expressions.
 
-Python has powerful regular expressions but sometimes they are totally overkill
-when you just want to parse a simple-formatted string.
-C programmers use the scanf-function for these tasks (see link below).
+Features:
 
-This implementation of scanf translates the simple scanf-format into
-regular expressions. Unlike C you can be sure that there are no buffer overflows
-possible.
+ - Supports %d, %f, %g, %e, %x, %o, %s, %c, %% and %[...] scan sets
+ - Handles optional field widths and suppressed assignments (%*d, etc.)
+ - Returns parsed values as Python types (int, float, str)
+ - Partial matches are allowed; parsing stops at first mismatch
+ - Leading whitespace is consumed for most numeric and string conversions
 
-For more information see
-  * http://www.python.org/doc/current/lib/node49.html
-  * http://en.wikipedia.org/wiki/Scanf
+Differences from IRAF scanf:
 
-Original code from:
-    https://github.com/joshburnett/scanf (version 1.5.2)
+ - Sexagesimal numbers (hh:mm:ss) are not supported
+ - no INDEF handling
 
-Modified for the needs of PyRAF:
- * all fields may have a max width (not a fixed width)
- * add "l" (for [outdated] long ints)
- * allow "0x" and "0o" prefixes in ints for hexa/octal numbers
+Differences from original PyRAF implementation (on purpose, to
+establish more conformity to IRAF scanf):
 
-Differences to the original PyRAF sscanf module:
- * "n" coversion missing (number of characters so far)
+ - "0x" prefix is not allowed when parsing integer values
+ - leading zero does not indicate octal when parsing integer values
+ - %i, %l, %u, %E, %X are not implemented
 
+Original code was taken from https://github.com/joshburnett/scanf
+(version 1.5.2) and heavily modified.
 
 """
 import re
@@ -33,140 +33,158 @@
 except ImportError:
     from backports.functools_lru_cache import lru_cache
 
-__version__ = '1.5.2'
 
 __all__ = ["scanf", 'scanf_translate', 'scanf_compile']
 
 
-DEBUG = False
-
-# As you can probably see it is relatively easy to add more format types.
-# Make sure you add a second entry for each new item that adds the extra
-#   few characters needed to handle the field ommision.
+# Each tuple is: (format_regex, regex_pattern, cast_function)
+#
+# - format_regex: regex to identify the format specifier in the format string.
+#   If it has groups, the first one is reserved for the optional "*" (suppress
+#   conversion); all remaining groups fill the placeholders in regex_pattern.
+#
+# - regex_pattern: regex to match the corresponding input field.
+#   As replacement placeholder, %s is used.
+#
+# - cast_function: Python callable to convert matched string to int/float/str
+#   Setting to None indicates that the match is ignored for the result, otherwise
+#   it is called with the first group of the match from the regex_pattern.
+#
 scanf_translate = [
     (re.compile(_token), _pattern, _cast) for _token, _pattern, _cast in [
-        (r"%c", r"(.)", lambda x:x),
-        (r"%\*c", r"(?:.)", None),
-
-        (r"%(\d+)c", r"(.{0,%s})", lambda x:x),
-        (r"%\*(\d+)c", r"(?:.{0,%s})", None),
+        # %c - Fixed width character string
+        (r"%(\*)?c", r"(.)", lambda x:x),
+        (r"%(\*)?(\d+)c", r"(.{1,%s})", lambda x:x),
 
-        (r"%s", r"(\S+)", lambda x: x),
-        (r"%\*s", r"(?:\S+)", None),
+        # %s - String of non-whitespace characters
+        (r"%(\*)?s", r"\s*(\S+)", lambda x: x),
+        (r"%(\*)?(\d+)s", r"\s*(\S{1,%s})", lambda x:x),
 
-        (r"%(\d+)s", r"(\S{1,%s})", lambda x:x),
-        (r"%\*(\d+)s", r"(?:\S{1,%s})", None),
+        # %[] - Character scan set
+        (r"%(\*)?\[([^\]]+)\]", r"\s*([%s]+)", lambda x:x),
 
-        (r"%\[([^\]]+)\]", r"([%s]+)", lambda x:x),
-        (r"%\*\[([^\]]+)\]", r"(?:[%s]+)", None),
+        # %d - Signed integer
+        (r"%(\*)?l?d", r"\s*([+-]?\d+)", int),
+        (r"%(\*)?(\d+)l?d", r"\s*([+-]?\d{1,%s})", int),
 
-        (r"%l?d", r"([+-]?\d+)", int),
-        (r"%\*l?d", r"(?:[+-]?\d+)", None),
+        # %f, %g, %e - Float
+        (r"%(\*)?[fge]", r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
+        (r"%(\*)?(\d+)[fge]", r"\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
 
-        (r"%(\d+)l?d", r"([+-]?\d{1,%s})", int),
-        (r"%\*(\d+)l?d", r"(?:[+-]?\d{1,%s})", None),
+        # %x - Hexadecimal integer.
+        (r"%(\*)?l?x", r"\s*([-+]?[\da-fA-F]+)", lambda x: int(x, 16)),
+        (r"%(\*)?(\d+)l?x", r"\s*([-+]?[\dA-Fa-f]{1,%s})", lambda x: int(x, 16)),
 
-        (r"%[fge]", r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
-        (r"%\*[fge]", r"(?:[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", None),
+        # %o - Octal integer.
+        (r"%(\*)?l?o", r"\s*([-+]?[0-7]+)", lambda x:int(x, 8)),
+        (r"%(\*)?(\d+)l?o", r"\s*([-+]?[0-7]{1,%s})", lambda x: int(x, 8)),
 
-        (r"%(\d+)[fge]", r"([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", float),
-        (r"%\*(\d+)[fge]", r"(?:[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)", None),
+        # %% - single percent sign
+        (r"%%", r"%", None),
 
-        (r"%l?x", r"([\dA-Za-f]+)", lambda x: int(x, 16)),
-        (r"%\*l?x", r"[\dA-Za-f]+)", None),
+        # white spaces
+        (r"\s+", r"\s*", None),
 
-        (r"%(\d+)l?x", r"([\dA-Za-f]{1,%s})", lambda x: int(x, 16)),
-        (r"%\*(\d+)l?x", r"[\dA-Za-f]{1,%s})", None),
+        # special chars in regexps
+        (r"()([\\^$.|?*+()[\]{}-])", r"\%s", None),
 
-        (r"%l?o", r"([0-7]+)", lambda x:int(x, 8)),
-        (r"%\*l?o", r"(?:[0-7]+)", None),
-
-        (r"%(\d+)l?o", r"([0-7]{1,%s})", lambda x: int(x, 8)),
-        (r"%\*(\d+)l?o", r"(?:[0-7]{1,%s})", None),
+        # All other non-whitespaces
+        (r"()([^\s\\^$.|?*+()[\]{}%-]+)", r"%s", None),
     ]]
 
 
-# Cache formats
-SCANF_CACHE_SIZE = 1000
-
-
-@lru_cache(maxsize=SCANF_CACHE_SIZE)
-def scanf_compile(format, collapseWhitespace=True):
+@lru_cache(maxsize=1000)
+def scanf_compile(format):
     """
     Translate the format into a regular expression
 
     For example:
 
-        >>> format_re, casts = scanf_compile('%s - %d errors, %d warnings')
-        >>> print format_re.pattern
-        (\\S+) \\- ([+-]?\\d+) errors, ([+-]?\\d+) warnings
+        >>> re_list = scanf_compile('%s - %d errors, %d warnings')
+        >>> for pattern, cast in re_list:
+        ...     print(pattern, cast)
+        re.compile('\\s*(\\S+)') <function <lambda> at 0x7f5da8f1b060>
+        re.compile('\\s*\\-\\s*') None
+        re.compile('\\s*([+-]?\\d+)') <class 'int'>
+        re.compile('\\s*errors,\\s*') None
+        re.compile('\\s*([+-]?\\d+)') <class 'int'>
+        re.compile('\\s*warnings') None
 
     Translated formats are cached for faster reuse
     """
-
-    format_pat = ""
-    cast_list = []
+    # Iterate over the format string, identifying literal text and conversion specifiers
+    # For each conversion:
+    # - Determine width, suppression, and conversion type
+    # - Translate to regex pattern with proper field width
+    # - Append regex and cast function to compiled pattern list
+    pat_list = []
     i = 0
     length = len(format)
     while i < length:
         found = None
         for token, pattern, cast in scanf_translate:
             found = token.match(format, i)
             if found:
-                if cast: # cast != None
-                    cast_list.append(cast)
-                groups = found.groupdict() or found.groups()
-                if groups:
-                    pattern = pattern % groups
-                format_pat += pattern
+                groups = found.groups()
+
+                # Add optional argument (length) to pattern
+                if len(groups) > 1:
+                    pattern = pattern % groups[1:]
+
+                # If the assignment is suppressed (indicated by *), the cast function
+                # is set to None. This allows regex matching without capturing a value.
+                if len(groups) > 0 and groups[0] == "*":
+                    cast = None
+
+                if cast is None and len(pat_list) > 0 and pat_list[-1][1] is None:
+                    # Merge consecutive patterns that produce no value into one
+                    pat_list[-1][0] += pattern
+                else:
+                    pat_list.append([pattern, cast])
                 i = found.end()
                 break
-        if not found:
-            char = format[i]
-            # escape special characters
-            if char in "|^$()[]-.+*?{}<>\\":
-                format_pat += "\\"
-            format_pat += char
-            i += 1
-    if DEBUG:
-        print("DEBUG: %r -> %s" % (format, format_pat))
-    if collapseWhitespace:
-        format_pat = re.sub(r'\s+', r'\\s+', format_pat)
-
-    format_re = re.compile(format_pat)
-    return format_re, cast_list
-
-
-def scanf(format, s=None, collapseWhitespace=True):
+        else:
+            raise ValueError(f"Unknown char '{format[i]}' in pos {i} of format string \"{format}\"")
+
+    # Return compiled list of all patterns
+    return [(re.compile(pattern), cast) for pattern, cast in pat_list]
+
+
+def scanf(format, s):
     """Conversion specification are of the form:
 
         %[*][<max_width>]['l']<type_character>.
 
     The following format conversions are supported:
 
     %c            Fixed width character string.
-    %s            String of non-whitespace characters with leading
-                     whitespace skipped.
+    %s            String of non-whitespace characters
     %d            Signed integer
     %o            Octal integer.
     %x            Hexadecimal integer.
     %f, %g, %e    Float
     %[]           Character scan set
 
-    scanf.scanf returns a tuple of found values or None if the format
-    does not match.
+    scanf returns a list of the converted values; parsing stops at the
+    first mismatch, so the list may be partial or empty.
 
     """
-
-    if s is None:
-        s = sys.stdin
-
-    if hasattr(s, "readline"):
-        s = s.readline()
-
-    format_re, casts = scanf_compile(format, collapseWhitespace)
-
-    found = format_re.search(s)
-    if found:
-        groups = found.groups()
-        return tuple([casts[i](groups[i]) for i in range(len(groups))])
+    # Start scanning input at index 0
+    # For each compiled pattern:
+    # 1. Apply regex match at current position
+    # 2. If no match: break loop and return parsed values so far
+    # 3. If match:
+    #    - Apply cast function (unless suppressed)
+    #    - Advance current index to end of matched substring
+    i = 0
+    res = []
+    for format_re, cast in scanf_compile(format):
+        found = format_re.match(s, i)
+        if found:
+            if cast is not None:
+                res.append(cast(found.groups()[0]))
+            i = found.end()
+        else:
+            break
+
+    return res
diff --git a/pyraf/tests/test_cli.py b/pyraf/tests/test_cli.py