Merge pull request #52 from aCLImatise/test-description

multimeric · web-flow · commit 44cfd229ae2c · 2020-09-22T20:10:32.000+10:00
Add test for valid descriptions
diff --git a/aclimatise/flag_parser/elements.py b/aclimatise/flag_parser/elements.py
@@ -19,71 +19,6 @@
 NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress()).setName("Newline")
 
 
-def customIndentedBlock(
-    blockStatementExpr, indentStack, indent=True, terminal=False, lax=False
-):
-    """
-    Modified version of the indentedBlock construct provided by pyparsing. Allows fuzzier indent boundaries
-    """
-    backup_stack = indentStack[:]
-
-    def reset_stack():
-        indentStack[:] = backup_stack
-
-    def checkPeerIndent(s, l, t):
-        if l >= len(s):
-            return
-        curCol = col(l, s)
-
-        # A terminal block doesn't have children, so we can assume that any sub-indent is a peer
-        if terminal and curCol >= indentStack[-1]:
-            return
-
-        # If we're being lax, anything that's not a full dedent is a peer
-        if lax and curCol > indentStack[-2]:
-            return
-
-        # Anything that is indented more than the previous indent level counts as a peer
-        if curCol < indentStack[-1] or curCol <= indentStack[-2]:
-            raise ParseException(s, l, "not a peer entry")
-
-    def checkSubIndent(s, l, t):
-        curCol = col(l, s)
-        if curCol > indentStack[-1]:
-            indentStack.append(curCol)
-        else:
-            raise ParseException(s, l, "not a subentry")
-
-    def checkUnindent(s, l, t):
-        if l >= len(s):
-            indentStack.pop()
-            return
-        curCol = col(l, s)
-        if not (indentStack and curCol < indentStack[-1]):
-            raise ParseException(s, l, "not an unindent")
-        indentStack.pop()
-
-    INDENT = (Empty() + Empty().setParseAction(checkSubIndent)).setName("Indent")
-    PEER = Empty().setParseAction(checkPeerIndent).setName("PeerIndent")
-    UNDENT = Empty().setParseAction(checkUnindent).setName("Unindent")
-    if indent:
-        smExpr = Group(
-            Optional(NL)
-            +
-            # ~ FollowedBy(blockStatementExpr) +
-            INDENT
-            + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL)))
-            + UNDENT
-        )
-    else:
-        smExpr = Group(
-            Optional(NL) + (OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL)))
-        )
-    smExpr.setFailAction(lambda a, b, c, d: reset_stack())
-    blockStatementExpr.ignore("\\" + LineEnd())
-    return smExpr.setName("IndentedBlock")
-
-
 cli_id = Word(initChars=element_start_chars, bodyChars=element_body_chars)
 
 positional_name = Word(
diff --git a/aclimatise/flag_parser/parser.py b/aclimatise/flag_parser/parser.py
@@ -4,6 +4,7 @@
 import regex
 
 from aclimatise.flag_parser.elements import *
+from aclimatise.nlp import is_sentence
 from aclimatise.parser import IndentCheckpoint, IndentParserMixin
 
 
@@ -182,7 +183,7 @@ def visit_description_block(s, loc, toks):
         """
 
         def visit_flag_block(s, loc, toks):
-            ret: List[Flag] = []
+            ret: List[CliArgument] = []
 
             # The tokens are a mix of flags and lines of text. Append the text to the previous flag
             for tok in toks:
@@ -193,6 +194,8 @@ def visit_flag_block(s, loc, toks):
                     if len(ret[-1].description) > 0:
                         ret[-1].description += "\n"
                     ret[-1].description += tok
+
+            ret = [flag for flag in ret if is_sentence(flag.description)]
             return ret
 
         self.flag_block = (
diff --git a/aclimatise/nlp.py b/aclimatise/nlp.py
@@ -1,16 +1,51 @@
 import spacy
 import wordsegment
 
-# We load the spacy and the wordsegment models here as a kind of singleton pattern, to avoid multiple functions loading
-# redundant copies
 
-if len(wordsegment.WORDS) == 0:
-    wordsegment.load()
+def prevent_sentence_boundary_detection(doc):
+    """
+    Pipeline component that marks every token in the doc as not being a sentence start, then returns the same doc
+    """
+    for token in doc:
+        # Explicitly flag the token so that no sentence boundary is placed in front of it
+        token.is_sent_start = False
+    return doc
 
 
 try:
     nlp = spacy.load("en")
+    no_sentences = spacy.load("en")
+    no_sentences.add_pipe(
+        prevent_sentence_boundary_detection, name="prevent-sbd", before="parser"
+    )
 except IOError:
     raise Exception(
         "Spacy model doesn't exist! Install it with `python -m spacy download en`"
     )
+
+# We load the spacy and the wordsegment models here as a kind of singleton pattern, to avoid multiple functions loading
+# redundant copies
+
+if len(wordsegment.WORDS) == 0:
+    wordsegment.load()
+
+
+def is_sentence(text: str, threshold: float = 0.8) -> bool:
+    """
+    Returns a bool that indicates if this text is likely a sentence. This should probably be replaced by a machine
+    learning classifier in the future
+    :param text: The text to classify. Empty or whitespace-only text has no tokens to judge, and returns True
+    :param threshold: If the ratio of non-word tokens over all non-whitespace tokens is at least this, return False
+    """
+    doc = no_sentences(text)
+    # An empty doc yields no sentences at all, so fall back to an empty sequence instead of indexing with [0]
+    sentence = next(iter(doc.sents), ())
+    non_word_count = 0
+    token_count = 0
+    for tok in sentence:
+        pos = tok.pos_
+        if pos == "SPACE":
+            # Ignore whitespace
+            continue
+
+        if pos in {"X", "SYM", "PUNCT", "NUM"}:
+            non_word_count += 1
+        token_count += 1
+
+    return token_count == 0 or non_word_count / token_count < threshold
diff --git a/aclimatise/usage_parser/elements.py b/aclimatise/usage_parser/elements.py
@@ -6,7 +6,6 @@
 from aclimatise.flag_parser.elements import (
     arg,
     argument_body_chars,
-    customIndentedBlock,
     delimited_body_chars,
     element_body_chars,
     element_start_chars,
diff --git a/aclimatise/usage_parser/parser.py b/aclimatise/usage_parser/parser.py
@@ -3,7 +3,7 @@
 
 from pyparsing import *
 
-from aclimatise.flag_parser.elements import customIndentedBlock, description_line
+from aclimatise.flag_parser.elements import description_line
 from aclimatise.parser import IndentCheckpoint, IndentParserMixin
 from aclimatise.usage_parser.elements import usage_example
 from aclimatise.usage_parser.model import UsageElement, UsageInstance
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -1,5 +1,11 @@
 Changelog
 =========
+2.1.0 (2020-09-22)
+------------------
+* Add ``bedtools random`` as a test case
+* Use a simple metric to exclude flags unless they have a somewhat valid description text
+
 2.0.0 (2020-09-16)
 ------------------
 * Rename the package from ``acclimatise`` to ``aclimatise``, to be consistent with the naming elsewhere. This is a breaking change, and will require you to ``pip install aclimatise`` from now on.
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = aclimatise
-version = 2.0.0.post1
+version = 2.1.0
 description = aCLImatise is a Python library and command-line utility for parsing the help output of a command-line tool and then outputting a description of the tool in a more structured format
 long_description = file: README.rst
 long_description_content_type: text/x-rst
diff --git a/test/test_data/bedtools_random.txt b/test/test_data/bedtools_random.txt
@@ -0,0 +1,41 @@
+
+*****
+*****ERROR: Need a genome (-g) file. 
+*****
+
+Tool:    bedtools random (aka randomBed)
+Version: v2.26.0
+Summary: Generate random intervals among a genome.
+
+Usage:   bedtools random [OPTIONS] -g <genome>
+
+Options: 
+	-l	The length of the intervals to generate.
+		- Default = 100.
+		- (INTEGER)
+
+	-n	The number of intervals to generate.
+		- Default = 1,000,000.
+		- (INTEGER)
+
+	-seed	Supply an integer seed for the shuffling.
+		- By default, the seed is chosen automatically.
+		- (INTEGER)
+
+Notes: 
+	(1)  The genome file should tab delimited and structured as follows:
+	     <chromName><TAB><chromSize>
+
+	For example, Human (hg19):
+	chr1	249250621
+	chr2	243199373
+	...
+	chr18_gl000207_random	4262
+
+Tips: 
+	One can use the UCSC Genome Browser's MySQL database to extract
+	chromosome sizes. For example, H. sapiens:
+
+	mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
+	"select chrom, size from hg19.chromInfo"  > hg19.genome
+
diff --git a/test/util.py b/test/util.py
@@ -318,6 +318,16 @@ def run_assertions(self, cmd: Command, explore=False):
             outputs=0,
         ),
     ),
+    pytest.param(
+        HelpText(
+            path="test_data/bedtools_random.txt",
+            cmd=["bedtools", "random"],
+            positional=0,
+            named=4,
+            subcommands=0,
+            outputs=0,
+        ),
+    ),
     # These last two are really strange, maybe I'll support them eventually
     pytest.param(
         HelpText(