Skip to content

Commit 44cfd22

Browse files
authored
Merge pull request #52 from aCLImatise/test-description
Add test for valid descriptions
2 parents 37a620d + 8f7ea40 commit 44cfd22

File tree

9 files changed

+102
-73
lines changed

9 files changed

+102
-73
lines changed

aclimatise/flag_parser/elements.py

Lines changed: 0 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -19,71 +19,6 @@
1919
NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()).setName("Newline")
2020

2121

22-
def customIndentedBlock(
23-
blockStatementExpr, indentStack, indent=True, terminal=False, lax=False
24-
):
25-
"""
26-
Modified version of the indentedBlock construct provided by pyparsing. Allows fuzzier indent boundaries
27-
"""
28-
backup_stack = indentStack[:]
29-
30-
def reset_stack():
31-
indentStack[:] = backup_stack
32-
33-
def checkPeerIndent(s, l, t):
34-
if l >= len(s):
35-
return
36-
curCol = col(l, s)
37-
38-
# A terminal block doesn't have children, so we can assume that any sub-indent is a peer
39-
if terminal and curCol >= indentStack[-1]:
40-
return
41-
42-
# If we're being lax, anything that's not a full dedent is a peer
43-
if lax and curCol > indentStack[-2]:
44-
return
45-
46-
# Anything that is indented more than the previous indent level counts as a peer
47-
if curCol < indentStack[-1] or curCol <= indentStack[-2]:
48-
raise ParseException(s, l, "not a peer entry")
49-
50-
def checkSubIndent(s, l, t):
51-
curCol = col(l, s)
52-
if curCol > indentStack[-1]:
53-
indentStack.append(curCol)
54-
else:
55-
raise ParseException(s, l, "not a subentry")
56-
57-
def checkUnindent(s, l, t):
58-
if l >= len(s):
59-
indentStack.pop()
60-
return
61-
curCol = col(l, s)
62-
if not (indentStack and curCol < indentStack[-1]):
63-
raise ParseException(s, l, "not an unindent")
64-
indentStack.pop()
65-
66-
INDENT = (Empty() + Empty().setParseAction(checkSubIndent)).setName("Indent")
67-
PEER = Empty().setParseAction(checkPeerIndent).setName("PeerIndent")
68-
UNDENT = Empty().setParseAction(checkUnindent).setName("Unindent")
69-
if indent:
70-
smExpr = Group(
71-
Optional(NL)
72-
+
73-
# ~ FollowedBy(blockStatementExpr) +
74-
INDENT
75-
+ (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL)))
76-
+ UNDENT
77-
)
78-
else:
79-
smExpr = Group(
80-
Optional(NL) + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL)))
81-
)
82-
smExpr.setFailAction(lambda a, b, c, d: reset_stack())
83-
blockStatementExpr.ignore("\\" + LineEnd())
84-
return smExpr.setName("IndentedBlock")
85-
86-
8722
cli_id = Word(initChars=element_start_chars, bodyChars=element_body_chars)
8823

8924
positional_name = Word(

aclimatise/flag_parser/parser.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import regex
55

66
from aclimatise.flag_parser.elements import *
7+
from aclimatise.nlp import is_sentence
78
from aclimatise.parser import IndentCheckpoint, IndentParserMixin
89

910

@@ -182,7 +183,7 @@ def visit_description_block(s, loc, toks):
182183
"""
183184

184185
def visit_flag_block(s, loc, toks):
185-
ret: List[Flag] = []
186+
ret: List[CliArgument] = []
186187

187188
# The tokens are a mix of flags and lines of text. Append the text to the previous flag
188189
for tok in toks:
@@ -193,6 +194,8 @@ def visit_flag_block(s, loc, toks):
193194
if len(ret[-1].description) > 0:
194195
ret[-1].description += "\n"
195196
ret[-1].description += tok
197+
198+
ret = [flag for flag in ret if is_sentence(flag.description)]
196199
return ret
197200

198201
self.flag_block = (

aclimatise/nlp.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,51 @@
11
import spacy
22
import wordsegment
33

4-
# We load the spacy and the wordsegment models here as a kind of singleton pattern, to avoid multiple functions loading
5-
# redundant copies
64

7-
if len(wordsegment.WORDS) == 0:
8-
wordsegment.load()
5+
def prevent_sentence_boundary_detection(doc):
6+
for token in doc:
7+
token.is_sent_start = False
8+
return doc
99

1010

1111
try:
1212
nlp = spacy.load("en")
13+
no_sentences = spacy.load("en")
14+
no_sentences.add_pipe(
15+
prevent_sentence_boundary_detection, name="prevent-sbd", before="parser"
16+
)
1317
except IOError:
1418
raise Exception(
1519
"Spacy model doesn't exist! Install it with `python -m spacy download en`"
1620
)
21+
22+
# We load the spacy and the wordsegment models here as a kind of singleton pattern, to avoid multiple functions loading
23+
# redundant copies
24+
25+
if len(wordsegment.WORDS) == 0:
26+
wordsegment.load()
27+
28+
29+
def is_sentence(text: str, threshold: float = 0.8) -> bool:
30+
"""
31+
Returns a bool that indicates if this text is likely a sentence. This should probably be replaced by a machine
32+
learning classifier in the future
33+
:param threshold: If the ratio of non-word tokens (POS tags X, SYM, PUNCT or NUM) to all non-whitespace tokens is greater than or equal to this, then return False
34+
"""
35+
36+
doc = no_sentences(text)
37+
sentence = list(doc.sents)[0]
38+
non_word_count = 0
39+
word_count = 0
40+
for tok in sentence:
41+
pos = tok.pos_
42+
if pos == "SPACE":
43+
# Ignore whitespace
44+
continue
45+
46+
if pos in {"X", "SYM", "PUNCT", "NUM"}:
47+
non_word_count += 1
48+
word_count += 1
49+
50+
result = word_count == 0 or non_word_count / word_count < threshold
51+
return result

aclimatise/usage_parser/elements.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from aclimatise.flag_parser.elements import (
77
arg,
88
argument_body_chars,
9-
customIndentedBlock,
109
delimited_body_chars,
1110
element_body_chars,
1211
element_start_chars,

aclimatise/usage_parser/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from pyparsing import *
55

6-
from aclimatise.flag_parser.elements import customIndentedBlock, description_line
6+
from aclimatise.flag_parser.elements import description_line
77
from aclimatise.parser import IndentCheckpoint, IndentParserMixin
88
from aclimatise.usage_parser.elements import usage_example
99
from aclimatise.usage_parser.model import UsageElement, UsageInstance

docs/changes.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
Changelog
22
=========
3+
2.1.0 (2020-09)
4+
----------------
5+
* Add `bedtools random` as a test case
6+
* Use a simple metric to exclude flags unless they have a somewhat valid description text
7+
*
8+
39
2.0.0 (2020-09-16)
410
------------------
511
* Rename the package from ``acclimatise`` to ``aclimatise``, to be consistent with the naming elsewhere. This is a breaking change, and will require you to ``pip install aclimatise`` from now on.

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = aclimatise
3-
version = 2.0.0.post1
3+
version = 2.1.0
44
description = aCLImatise is a Python library and command-line utility for parsing the help output of a command-line tool and then outputting a description of the tool in a more structured format
55
long_description = file: README.rst
66
long_description_content_type: text/x-rst

test/test_data/bedtools_random.txt

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
*****
3+
*****ERROR: Need a genome (-g) file.
4+
*****
5+
6+
Tool: bedtools random (aka randomBed)
7+
Version: v2.26.0
8+
Summary: Generate random intervals among a genome.
9+
10+
Usage: bedtools random [OPTIONS] -g <genome>
11+
12+
Options:
13+
-l The length of the intervals to generate.
14+
- Default = 100.
15+
- (INTEGER)
16+
17+
-n The number of intervals to generate.
18+
- Default = 1,000,000.
19+
- (INTEGER)
20+
21+
-seed Supply an integer seed for the shuffling.
22+
- By default, the seed is chosen automatically.
23+
- (INTEGER)
24+
25+
Notes:
26+
(1) The genome file should tab delimited and structured as follows:
27+
<chromName><TAB><chromSize>
28+
29+
For example, Human (hg19):
30+
chr1 249250621
31+
chr2 243199373
32+
...
33+
chr18_gl000207_random 4262
34+
35+
Tips:
36+
One can use the UCSC Genome Browser's MySQL database to extract
37+
chromosome sizes. For example, H. sapiens:
38+
39+
mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
40+
"select chrom, size from hg19.chromInfo" > hg19.genome
41+

test/util.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,16 @@ def run_assertions(self, cmd: Command, explore=False):
318318
outputs=0,
319319
),
320320
),
321+
pytest.param(
322+
HelpText(
323+
path="test_data/bedtools_random.txt",
324+
cmd=["bedtools", "random"],
325+
positional=0,
326+
named=4,
327+
subcommands=0,
328+
outputs=0,
329+
),
330+
),
321331
# These last two are really strange, maybe I'll support them eventually
322332
pytest.param(
323333
HelpText(

0 commit comments

Comments
 (0)