Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,54 @@ def escape(pathname):
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
_no_recurse_symlinks = object()

def escape_regex_range_including_seps(pat, seps):
"""Escape ranges containing seperators in a path
"""
pat = list(pat)
ordinal_seps=set(map(ord, seps))

insideRange = False
ds=[]

buf=''
idx1=0
idx2=0
rangeIncludesSep=False

for path_idx, path_ch in enumerate(pat):
if path_idx > 0:
if path_ch == '[' and pat[path_idx-1] != '\\':
insideRange = True
idx1=path_idx
continue
if path_ch == ']' and pat[path_idx-1] != '\\':
insideRange = False
idx2=path_idx+1

if insideRange:
buf+=path_ch
if path_ch == '-':
glob_range = list(range(ord(pat[path_idx-1]), ord(pat[path_idx+1])))
if ordinal_seps.intersection(glob_range):
rangeIncludesSep = True

elif len(buf)>0:
ds.append([idx1, idx2, rangeIncludesSep])

buf=''
idx1=1
idx2=2
rangeIncludesSep=False

for ds_idx, ds_elem in enumerate(ds):
idx1=ds_elem[0]
idx2=ds_elem[1]
rangeIncludesSep=ds_elem[2]
if rangeIncludesSep:
pat.insert(idx1, '\\')
pat.insert(idx2, '\\')

return ''.join(pat)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please revert.

def translate(pat, *, recursive=False, include_hidden=False, seps=None):
"""Translate a pathname with shell wildcards to a regular expression.
Expand All @@ -282,6 +330,8 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
seps = (os.path.sep, os.path.altsep)
else:
seps = os.path.sep


escaped_seps = ''.join(map(re.escape, seps))
any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
not_sep = f'[^{escaped_seps}]'
Expand Down Expand Up @@ -312,10 +362,14 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
if part:
if not include_hidden and part[0] in '*?':
results.append(r'(?!\.)')

results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])

if idx < last_part_idx:
results.append(any_sep)

res = ''.join(results)
res=escape_regex_range_including_seps(res, seps=seps)
return fr'(?s:{res})\Z'


Expand Down
3 changes: 3 additions & 0 deletions Lib/test/test_glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ def fn(pat):
self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More generally, can you upodate test_translate_matching and include the examples of https://man7.org/linux/man-pages/man7/glob.7.html so that we have a compliant implementation?

self.assertEqual(fn('**/*'), r'(?s:(?:.+[/\\])?[^/\\]+)\Z')

self.assertEqual(fn('foo[%-0]bar'), r'(?s:foo\[%-0\]bar)\Z')
Copy link

@dkaszews dkaszews Mar 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is correct. From my understanding of manpages quoted in the issue, a class should be escaped only if it contains a literal path separator, not a range encompassing it. In latter case, we need to just exclude the separator.

[%-0] => (?!/)[%-0]
[ab/] => \[ab/\]

Edge case to be tested in bash and glob.glob: is a range beginning with separator ([%-/] or [/-0]) the first case or the second one? What about corner case of single element range [/-/]? I would say that all three should be escaped since they "contain an explicit / character".

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, does

A range containing an
explicit '/' character is syntactically incorrect. (POSIX requires that
syntactically incorrect patterns are left unchanged.)

mean that entire glob should be escaped, or just the part with the separator? I.e, does [ab][0/][xy] map to [ab]\[0/\][xy] or \[ab\]\[0/\]\[xy\]?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

@dmitrin9 dmitrin9 Mar 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Thus, "[]-]" matches just the two characters ']' and
'-', and "[--0]" matches the three characters '-', '.', and '0',
since '/' cannot be matched.)

This would indicate that a range which includes a '/' character as a non-literal would match that range but exclude the '/' character, at least with my interpretation.

I got that from the glob manpage.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2.13.3.1 looks to back my interpretation:

  1. If path separator appears between brackets, be it a single character or next to a hyphen, escape entire bracket expression. For '\\' in seps case, be careful to check it is not an escape but actual '\\\\'.
  2. Else, if any hyphened range spans a separator, add a negative lookahead. For simplicity, it can also be added for any bracket expression with a hyphen, or any bracket at all - result is the same, just simplifies regex in most cases.
  3. All bracket expressions are analyzed separately, so path separator in one does not invalidate and escape all others.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dmitya26 I don't have Python on hand, can you just quickly run glob.glob('a[/-b]c') on a following tree:

|-- abc
`-- a[
    `-- -b]c

If it returns [abc], then you are correct, if it returns the file in subdir then my interpretation seems to match existing implementation.

Copy link
Author

@dmitrin9 dmitrin9 Mar 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It returns '[]'.

edit: Oh wait I think I might've misread how the directories need to be structured.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

├── a[
│   └── -b]c
└── abc

and

glob.glob('a[/-b]c')

would return

['a[/-b]c']

for me.

Copy link
Author

@dmitrin9 dmitrin9 Mar 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait so regarding the spec, do you think we should be disallowing only '/' characters, the system's path separator (os.path.sep), or all path separators mentioned like the ones in glob.translate?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current implementation already extends the spec to all given separators, e.g. glob.translate('abc?', seps=['/', '\\']) maps to '(?s:abc[^/\\\\])\\Z'.

self.assertEqual(fn('foo[U-d]bar'), r'(?s:foo\[U-d\]bar)\Z')


if __name__ == "__main__":
unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Glob.translate escapes regex ranges that ecompass path seperator.
Loading