Skip to content

Commit 7d61ad2

Browse files
authored
Merge pull request #44 from aCLImatise/bwa-kit
Parser rewrite
2 parents b5f395d + f6c39a5 commit 7d61ad2

26 files changed

+1130
-167
lines changed

.isort.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
[settings]
2-
known_third_party = WDL,click,cwl_utils,cwltool,dataclasses,inflection,pkg_resources,pyhash,pyparsing,pytest,ruamel,setuptools,spacy,wdlgen,wordsegment,regex,num2words,word2number,psutil,packaging
2+
known_third_party = WDL,click,cwl_utils,cwltool,dataclasses,inflection,pkg_resources,pyhash,pyparsing,pytest,ruamel,setuptools,spacy,wdlgen,wordsegment,regex,num2words,word2number,psutil,packaging,docker

acclimatise/__init__.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from acclimatise.execution.local import LocalExecutor
1616
from acclimatise.flag_parser.parser import CliParser
1717
from acclimatise.model import Command, Flag
18-
from acclimatise.usage_parser import parse_usage
18+
from acclimatise.usage_parser.parser import UsageParser
1919

2020
logger = logging.getLogger("acclimatise")
2121

@@ -44,21 +44,22 @@ def _combine_flags(
4444
return list(unique.values())
4545

4646

47-
def parse_help(
48-
cmd: typing.Collection[str], text: str, parse_positionals=True
49-
) -> Command:
47+
def parse_help(cmd: typing.Collection[str], text: str, max_length=1000) -> Command:
5048
"""
5149
Parse a string of help text into a Command. Use this if you have already run the executable and extracted the
5250
help text yourself
5351
5452
:param cmd: List of arguments used to generate this help text, e.g. ['bwa', 'mem']
5553
:param text: The help text to parse
56-
:param parse_positionals: If false, don't parse positional arguments
54+
:param max_length: If the input text has more than this many lines, no attempt will be made to parse the file (as
55+
it's too large, will likely take a long time, and there's probably an underlying problem if this has happened).
56+
In this case, an empty Command will be returned
5757
"""
58-
help_command = CliParser(parse_positionals=parse_positionals).parse_command(
59-
name=cmd, cmd=text
60-
)
61-
usage_command = parse_usage(cmd, text)
58+
if len(text.splitlines()) > max_length:
59+
return Command(list(cmd))
60+
61+
help_command = CliParser().parse_command(name=cmd, cmd=text)
62+
usage_command = UsageParser().parse_usage(list(cmd), text)
6263

6364
# Combine the two commands by picking from the help_command where possible, otherwise falling back on the usage
6465
fields = dict(

acclimatise/converter/wdl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def escape_wdl_str(text: str):
2525
"""
2626
Escape literal quotes in a Python string, to become suitable for WDL
2727
"""
28-
return text.replace('"', '\\"')
28+
return text.replace('"', '\\"').replace("\n", "\\n")
2929

3030

3131
def flag_to_command_input(

acclimatise/flag_parser/elements.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""
22
Re-usable parser elements that aren't tied to the parser object
33
"""
4+
from typing import List
5+
46
from pyparsing import *
57

68
from acclimatise.model import *
@@ -84,6 +86,10 @@ def checkUnindent(s, l, t):
8486

8587
cli_id = Word(initChars=element_start_chars, bodyChars=element_body_chars)
8688

89+
positional_name = Word(
90+
initChars=element_start_chars, bodyChars=element_body_chars, min=2
91+
)
92+
8793
# short_flag = originalTextFor(Literal('-') + Word(alphanums + '@', max=1))
8894
# """A short flag has only a single dash and single character, e.g. `-m`"""
8995
# long_flag = originalTextFor(Literal('--') + cli_id)
@@ -244,9 +250,20 @@ def noop(s, loc, toks):
244250
# The description of the flag
245251
# e.g. for grep's `-o, --only-matching`, this is:
246252
# "Print only the matched (non-empty) parts of a matching line, with each such part on a separate output line."
247-
desc_line = originalTextFor(SkipTo(LineEnd())).setName(
248-
"DescriptionLine"
249-
) # .setParseAction(success))
253+
# desc_line = originalTextFor(SkipTo(LineEnd())).setName(
254+
# "DescriptionLine"
255+
# ) # .setParseAction(success))
250256
# desc_line = originalTextFor(
251257
# delimitedList(Regex("[^\s]+"), delim=" ", combine=True)
252258
# ).leaveWhitespace()
259+
260+
261+
def visit_description_line(s, loc, toks):
262+
return toks[0].strip()
263+
264+
265+
description_line = (
266+
SkipTo(LineEnd(), include=True)
267+
.setParseAction(visit_description_line)
268+
.setWhitespaceChars(" \t")
269+
).setName("DescriptionLine")

acclimatise/flag_parser/parser.py

Lines changed: 101 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import regex
55

66
from acclimatise.flag_parser.elements import *
7+
from acclimatise.parser import IndentCheckpoint, IndentParserMixin
78

89

910
def pick(*args):
@@ -51,7 +52,10 @@ def unique_by(
5152
non_alpha = regex.compile(r"^[^[:alpha:]]+$")
5253

5354

54-
class CliParser:
55+
# The reason we have a parser class here instead of just a function is so that we can store the parser state, in
56+
# particular the indentation stack. Without this, we would have to use a global stack which would be even more
57+
# worrying
58+
class CliParser(IndentParserMixin):
5559
def parse_command(self, cmd, name) -> Command:
5660
all_flags = list(itertools.chain.from_iterable(self.flags.searchString(cmd)))
5761
# If flags aren't unique, they likely aren't real flags
@@ -67,54 +71,52 @@ def parse_command(self, cmd, name) -> Command:
6771
)
6872
return Command(command=name, positional=positional, named=named)
6973

70-
def __init__(self, parse_positionals=True):
71-
stack = [1]
74+
def __init__(self):
75+
super().__init__()
7276

7377
def parse_description(s, lok, toks):
74-
text = " ".join([tok[0] for tok in toks[0]])
78+
text = "".join(toks)
79+
if len(text.strip()) == 0:
80+
return ""
81+
7582
if all([non_alpha.match(word) for word in text.split()]):
7683
raise ParseException(
7784
"This can't be a description block if all text is numeric!"
7885
)
86+
7987
if len(multi_space.findall(text)) > len(single_space.findall(text)):
8088
raise ParseException(
8189
"This description block has more unusual spaces than word spaces, it probably isn't a real description"
8290
)
8391

8492
return text
8593

86-
self.indented_desc = (
87-
customIndentedBlock(
88-
desc_line, indentStack=stack, indent=True, terminal=True
89-
)
90-
.setParseAction(parse_description)
91-
.setName("DescriptionBlock")
92-
)
93-
94-
self.description = self.indented_desc.copy().setName(
95-
"description"
96-
) # Optional(one_line_desc) + Optional(indented_desc)
97-
# A description that takes up one line
98-
# one_line_desc = SkipTo(LineEnd())
94+
def visit_mandatory_description(s, loc, toks):
95+
text = toks[0].strip()
96+
if len(text.strip()) == 0:
97+
raise ParseException("A positional argument must have a description")
9998

100-
# A flag description that makes up an indented block
101-
# originalTextFor(SkipTo(flag_prefix ^ LineEnd()))
99+
self.mandatory_description = description_line.copy().setParseAction(
100+
visit_mandatory_description
101+
)
102102

103-
# The entire flag documentation, including all synonyms and description
104103
self.flag = (
105-
(flag_synonyms + self.description)
104+
(flag_synonyms + description_line)
106105
.setName("flag")
107106
.setParseAction(
108107
lambda s, loc, toks: (
109108
Flag.from_synonyms(synonyms=toks[0:-1], description=toks[-1])
110109
)
111110
)
112111
)
112+
"""
113+
The entire flag documentation, including all synonyms and one line of description
114+
"""
113115

114116
self.positional = (
115117
# Unlike with flags, we have to be a bit pickier about what defines a positional because it's very easy
116118
# for a paragraph of regular text to be parsed as a positional. So we add a minimum of 2 spaces separation
117-
(cli_id + White(min=2).suppress() + self.description)
119+
(positional_name + White(min=2).suppress() + self.mandatory_description)
118120
.setName("positional")
119121
.setParseAction(
120122
lambda s, loc, toks: Positional(
@@ -130,9 +132,9 @@ def visit_flags(s, loc, toks):
130132
# Give the correct position to the positional arguments
131133
processed = []
132134
counter = 0
133-
flags = toks[0]
135+
# flags = toks[0]
134136

135-
for (flag,) in flags:
137+
for flag in toks:
136138
if isinstance(flag, Positional):
137139
flag.position = counter
138140
counter += 1
@@ -150,30 +152,91 @@ def visit_colon_block(s, loc, toks):
150152
else:
151153
return toks
152154

153-
if parse_positionals:
154-
block_element = self.flag ^ self.positional
155-
else:
156-
block_element = self.flag
155+
self.block_element = self.flag | self.positional
157156

158-
self.flag_block = customIndentedBlock(
159-
block_element, indentStack=stack, indent=True, lax=True
160-
).setName("FlagBlock")
157+
def visit_description_block(s, loc, toks):
158+
return "\n".join(toks)
161159

162-
# self.flag_block.skipWhitespace = True
160+
self.description_block = IndentCheckpoint(
161+
self.indent()
162+
+ (self.peer_indent(allow_greater=True) + description_line)[1, ...]
163+
+ self.dedent(precise=False),
164+
indent_stack=self.stack,
165+
).setParseAction(visit_description_block)
166+
"""
167+
The description block is the section of indented text after a flag. e.g. in this example:
168+
--use_strict (enforce strict mode)
169+
type: bool default: false
170+
The description block is "type: bool default: false"
171+
"""
172+
173+
self.indented_flag = IndentCheckpoint(
174+
# We require that each flag is indented, but we don't check for a dedent: this allows the next flag to
175+
# have any indentation as long as it's more indented than the top level
176+
self.indent() + self.block_element,
177+
indent_stack=self.stack,
178+
)
179+
"""
180+
Each flag can actually be at any indentation level, but we need to update the indent stack whenever we find one,
181+
so that we can identify the indented description block
182+
"""
183+
184+
def visit_flag_block(s, loc, toks):
185+
ret: List[Flag] = []
186+
187+
# The tokens are a mix of flags and lines of text. Append the text to the previous flag
188+
for tok in toks:
189+
if isinstance(tok, CliArgument):
190+
ret.append(tok)
191+
else:
192+
# Add a newline if we already have some content
193+
if len(ret[-1].description) > 0:
194+
ret[-1].description += "\n"
195+
ret[-1].description += tok
196+
return ret
197+
198+
self.flag_block = (
199+
IndentCheckpoint(
200+
self.indented_flag
201+
+ (
202+
# We pop the indent if parsing a new flag, since we no longer care about the previous flag
203+
IndentCheckpoint(
204+
self.pop_indent() + self.indented_flag, indent_stack=self.stack
205+
)
206+
# When parsing a description block, we don't pop the indent until afterwards, since we need to know
207+
# that flag's indentation
208+
| IndentCheckpoint(self.description_block, indent_stack=self.stack)
209+
)[...]
210+
+ self.pop_indent(),
211+
indent_stack=self.stack,
212+
)
213+
).setParseAction(visit_flag_block)
214+
"""
215+
A block of flags is one or more flags, each followed by a description block.
216+
The grammar is written this way so that parsing a flag is *always* prioritised over the description block,
217+
preventing certain indented flags from being missed
218+
"""
163219

164220
self.colon_block = Literal(
165221
":"
166-
).suppress() + self.flag_block.copy().setParseAction(visit_colon_block)
222+
).suppress() + self.flag_block.copy().addParseAction(visit_colon_block)
223+
"""
224+
When the block is introduced by a colon, we can be more lax about parsing
225+
"""
167226

168227
self.newline_block = (
169228
LineStart().leaveWhitespace()
170229
+ White().suppress()
171-
+ self.flag_block.copy().setParseAction(visit_flags)
230+
+ self.flag_block.copy().addParseAction(visit_flags)
172231
)
232+
"""
233+
When the block is introduced by a newline, we have to be quite strict about its contents
234+
"""
173235

174-
self.unindented_flag_block = LineStart().suppress() + OneOrMore(
175-
self.flag
176-
) # delimitedList(self.flag, delim='\n')
236+
self.unindented_flag_block = LineStart().suppress() + (
237+
self.flag + Optional(self.description_block)
238+
)[1, ...].setParseAction(visit_flag_block)
239+
# ) # delimitedList(self.flag, delim='\n')
177240
# self.unindented_flag_block.leaveWhitespace()
178241
self.unindented_flag_block.skipWhitespace = False
179242
"""
@@ -187,7 +250,7 @@ def visit_colon_block(s, loc, toks):
187250
# A flag block can start with a colon, but then it must have 2 or more flags. If it starts with a newline it
188251
# only has to have one flag at least
189252
self.flags = (
190-
self.newline_block ^ self.colon_block ^ self.unindented_flag_block
253+
self.colon_block | self.newline_block | self.unindented_flag_block
191254
).setName(
192255
"FlagList"
193256
) # .leaveWhitespace()

0 commit comments

Comments
 (0)