Merge remote-tracking branch 'origin/20160628-fsaad-cgpm' into 20160624-riastradh-cgpm

riastradh-probcomp · riastradh-probcomp · commit d756b65e89d9 · 2016-07-07T18:21:22.000Z
diff --git a/setup.py b/setup.py
@@ -186,7 +186,8 @@ def run_tests(self):
 lemonade = 'external/lemonade/dist'
 grammars = [
     'src/grammar.y',
-    'src/metamodels/cgpm_grammar.y',
+    'src/metamodels/cgpm_analyze/grammar.y',
+    'src/metamodels/cgpm_schema/grammar.y',
 ]
 
 setup(
@@ -213,6 +214,8 @@ def run_tests(self):
     packages=[
         'bayeslite',
         'bayeslite.metamodels',
+        'bayeslite.metamodels.cgpm_schema',
+        'bayeslite.metamodels.cgpm_analyze',
         'bayeslite.plex',
         'bayeslite.shell',
         'bayeslite.weakprng',
diff --git a/src/metamodels/cgpm_analyze/__init__.py b/src/metamodels/cgpm_analyze/__init__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+#   Copyright (c) 2010-2016, MIT Probabilistic Computing Project
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
diff --git a/src/metamodels/cgpm_analyze/grammar.y b/src/metamodels/cgpm_analyze/grammar.y
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2010-2016, MIT Probabilistic Computing Project
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * Terminal conventions:
+ * - T_ means a punctuation token.
+ * - K_ means a keyword.
+ * - L_ means a lexeme, which has useful associated text, e.g. an integer.
+ */
+
+
+anlaysis(start)     ::= phrases(ps).   /* NOTE(review): "anlaysis" [sic]; p_anlaysis_start in parse.py uses the same spelling */
+
+phrases(one)        ::= phrase(p).
+phrases(many)       ::= phrases(ps) T_SEMI phrase(p).   /* phrases are separated by T_SEMI */
+
+phrase(none)        ::= .              /* an empty phrase is permitted */
+phrase(variables)   ::= K_VARIABLES column_list(cols).
+phrase(skip)        ::= K_SKIP column_list(cols).
+
+column_list(one)    ::= column_name(col).
+column_list(many)   ::= column_list(cols) T_COMMA column_name(col).    /* names are separated by T_COMMA */
+
+column_name(n)      ::= L_NAME(name).
diff --git a/src/metamodels/cgpm_analyze/parse.py b/src/metamodels/cgpm_analyze/parse.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+#   Copyright (c) 2010-2016, MIT Probabilistic Computing Project
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from collections import namedtuple
+
+from bayeslite.exception import BQLParseError
+from bayeslite.util import casefold
+
+import grammar
+
+'''
+grep -o 'K_[A-Z][A-Z0-9_]*' < grammar.y | sort -u | awk '
+{
+    sub("^K_", "", $1);
+    printf("    '\''%s'\'': grammar.K_%s,\n", tolower($1), $1);
+}'
+'''
+
+KEYWORDS = {                    # looked up by tokenize() using casefold(token)
+    'skip': grammar.K_SKIP,
+    'variables': grammar.K_VARIABLES,
+}
+
+PUNCTUATION = {                 # looked up by tokenize() using the raw token text
+    ',': grammar.T_COMMA,
+    ';': grammar.T_SEMI,
+}
+
+def parse(tokens):              # returns semantics.phrases; raises BQLParseError on failure
+    semantics = CGpmAnalyzeSemantics()
+    parser = grammar.Parser(semantics)
+    for token in tokenize(tokens):      # pairs yielded by tokenize(), ending with (0, '')
+        semantics.context.append(token)
+        if len(semantics.context) > 10: # keep only the 10 most recent tokens
+            semantics.context.pop(0)
+        parser.feed(token)
+    if semantics.failed or semantics.errors:
+        raise BQLParseError('\n'.join(semantics.errors))    # one collected message per line
+    assert semantics.phrases is not None    # assigned by p_anlaysis_start()
+    return semantics.phrases
+
+
+def tokenize(tokens):           # generator of (grammar terminal, original token) pairs
+    for token in tokens:
+        if isinstance(token, str):
+            if casefold(token) in KEYWORDS:     # keywords are matched after casefold()
+                yield KEYWORDS[casefold(token)], token
+            elif token in PUNCTUATION:          # punctuation is matched on the exact text
+                yield PUNCTUATION[token], token
+            else:               # XXX check for alphanumeric/_
+                yield grammar.L_NAME, token
+        elif isinstance(token, (int, float)):
+            yield grammar.L_NUMBER, token   # NOTE(review): the cgpm_analyze grammar.y declares no L_NUMBER terminal -- confirm this attribute exists
+        else:
+            raise IOError('Invalid token: %r' % (token,))   # token is not a str, int or float
+    yield 0, ''                 # EOF
+
+
+class CGpmAnalyzeSemantics(object):     # callback object passed to grammar.Parser in parse()
+    def __init__(self):
+        self.context = []       # recent tokens; parse() appends and trims to 10
+        self.errors = []        # messages appended by syntax_error()
+        self.failed = False     # set to True by parse_failed()
+        self.phrases = None     # assigned by p_anlaysis_start()
+
+    def accept(self):
+        pass
+    def parse_failed(self):
+        self.failed = True
+
+    def syntax_error(self, (token, text)):      # tuple parameter: Python 2 only syntax
+        if token == -1:         # error; NOTE(review): tokenize() never yields -1 explicitly, so other tokens record no message -- confirm intended
+            self.errors.append("Syntax error near [%s] after [%s]" % (  # context[:-1] leaves out the newest token
+                text, ' '.join([str(t) for (_t, t) in self.context[:-1]])))
+
+    def p_anlaysis_start(self, ps):             self.phrases = ps   # result later returned by parse()
+
+    def p_phrases_one(self, p):                 return [p] if p else []     # falsy phrase -> empty list
+    def p_phrases_many(self, ps, p):
+        if p: ps.append(p)      # falsy (empty) phrases are dropped
+        return ps
+
+    def p_phrase_none(self,):                   return None
+    def p_phrase_variables(self, cols):         return Variables(cols)
+    def p_phrase_skip(self, cols):              return Skip(cols)
+
+    def p_column_list_one(self, col):           return [col]
+    def p_column_list_many(self, cols, col):    cols.append(col); return cols   # extends cols in place
+    def p_column_name_n(self, name):            return name
+
+Variables = namedtuple('Variables', [   # built by p_phrase_variables()
+    'vars',                     # the column list from the VARIABLES phrase
+])
+
+Skip = namedtuple('Skip', [     # built by p_phrase_skip()
+    'vars',                     # the column list from the SKIP phrase
+])
diff --git a/src/metamodels/cgpm_metamodel.py b/src/metamodels/cgpm_metamodel.py
@@ -65,7 +65,7 @@
 from bayeslite.stats import arithmetic_mean
 from bayeslite.util import casefold
 
-import cgpm_parse
+import cgpm_schema.parse
 
 CGPM_SCHEMA_1 = '''
 INSERT INTO bayesdb_metamodel (name, version) VALUES ('cgpm', 1);
@@ -119,7 +119,7 @@ def register(self, bdb):
                     ' with unknown schema version: %d' % (version,))
 
     def create_generator(self, bdb, generator_id, schema_tokens):
-        schema_ast = cgpm_parse.parse(schema_tokens)
+        schema_ast = cgpm_schema.parse.parse(schema_tokens)
         schema = _create_schema(bdb, generator_id, schema_ast)
 
         # Store the schema.
@@ -636,7 +636,7 @@ def _retrieve_stattype_dist_params(var):
     # Process each clause one by one.
     for clause in schema_ast:
 
-        if isinstance(clause, cgpm_parse.Basic):
+        if isinstance(clause, cgpm_schema.parse.Basic):
             # Basic Crosscat component model: one variable to be put
             # into Crosscat views.
             var = clause.var
@@ -665,7 +665,7 @@ def _retrieve_stattype_dist_params(var):
             variables.append([var, stattype, dist, params])
             modelled.add(var)
 
-        elif isinstance(clause, cgpm_parse.Foreign):
+        elif isinstance(clause, cgpm_schema.parse.Foreign):
             # Foreign model: some set of output variables is to be
             # modelled by foreign logic, possibly conditional on some
             # set of input variables.
@@ -717,7 +717,7 @@ def _retrieve_stattype_dist_params(var):
                         'kwds': kwds,
                     })
 
-        elif isinstance(clause, cgpm_parse.Subsample):
+        elif isinstance(clause, cgpm_schema.parse.Subsample):
             if subsample is not None:
                 raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n,))
             subsample = clause.n
diff --git a/src/metamodels/cgpm_schema/__init__.py b/src/metamodels/cgpm_schema/__init__.py
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+
+#   Copyright (c) 2010-2016, MIT Probabilistic Computing Project
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
diff --git a/src/metamodels/cgpm_schema/grammar.y b/src/metamodels/cgpm_schema/grammar.y
diff --git a/src/metamodels/cgpm_schema/parse.py b/src/metamodels/cgpm_schema/parse.py
@@ -19,33 +19,33 @@
 from bayeslite.exception import BQLParseError
 from bayeslite.util import casefold
 
-import cgpm_grammar
+import grammar
 
 '''
-grep -o 'K_[A-Z][A-Z0-9_]*' < cgpm_grammar.y | sort -u | awk '
+grep -o 'K_[A-Z][A-Z0-9_]*' < grammar.y | sort -u | awk '
 {
     sub("^K_", "", $1);
-    printf("    '\''%s'\'': cgpm_grammar.K_%s,\n", tolower($1), $1);
+    printf("    '\''%s'\'': grammar.K_%s,\n", tolower($1), $1);
 }'
 '''
 
 KEYWORDS = {
-    'given': cgpm_grammar.K_GIVEN,
-    'model': cgpm_grammar.K_MODEL,
-    'subsample': cgpm_grammar.K_SUBSAMPLE,
-    'using': cgpm_grammar.K_USING,
+    'given': grammar.K_GIVEN,
+    'model': grammar.K_MODEL,
+    'subsample': grammar.K_SUBSAMPLE,
+    'using': grammar.K_USING,
 }
 
 PUNCTUATION = {
-    '(': cgpm_grammar.T_LROUND,
-    ')': cgpm_grammar.T_RROUND,
-    ',': cgpm_grammar.T_COMMA,
-    '=': cgpm_grammar.T_EQ,
+    '(': grammar.T_LROUND,
+    ')': grammar.T_RROUND,
+    ',': grammar.T_COMMA,
+    '=': grammar.T_EQ,
 }
 
 def parse(tokenses):
     semantics = CGPM_Semantics()
-    parser = cgpm_grammar.Parser(semantics)
+    parser = grammar.Parser(semantics)
     for token in tokenize(tokenses):
         semantics.context.append(token)
         if len(semantics.context) > 10:
@@ -64,9 +64,9 @@ def tokenize(tokenses):
             elif token in PUNCTUATION:
                 yield PUNCTUATION[token], token
             else:               # XXX check for alphanumeric/_
-                yield cgpm_grammar.L_NAME, token
+                yield grammar.L_NAME, token
         elif isinstance(token, (int, float)):
-            yield cgpm_grammar.L_NUMBER, token
+            yield grammar.L_NUMBER, token
         else:
             raise IOError('Invalid token: %r' % (token,))
     yield 0, ''                 # EOF
diff --git a/tests/foobar.py b/tests/foobar.py
@@ -112,6 +112,7 @@
 
 bdb.sql_execute('''
     CREATE TABLE satellites_ucs (
+        name,
         apogee,
         class_of_orbit,
         country_of_operator,
@@ -125,22 +126,24 @@
     ('geo', lambda x, y: x + y**2),
     ('leo', lambda x, y: math.sin(x + y)),
 ]:
-    for x in xrange(10):
+    for x in xrange(1000):
         for y in xrange(10):
             countries = ['US', 'Russia', 'China', 'Bulgaria']
             country = countries[random.randrange(len(countries))]
+            name = 'sat-%s-%d' % (country, random.randrange(10**8))
             mass = random.gauss(1000, 50)
             bdb.sql_execute('''
                 INSERT INTO satellites_ucs
-                    (country_of_operator, launch_mass, class_of_orbit,
+                    (name, country_of_operator, launch_mass, class_of_orbit,
                         apogee, perigee, period)
-                    VALUES (?,?,?,?,?,?)
-            ''', (country, mass, l, x, y, f(x, y)))
+                    VALUES (?,?,?,?,?,?,?)
+            ''', (name, country, mass, l, x, y, f(x, y)))
 
 D = bdb.sql_execute('SELECT * FROM satellites_ucs').fetchall()
 
 bdb.execute('''
     CREATE POPULATION satellites FOR satellites_ucs (
+        name IGNORE,
         apogee NUMERICAL,
         class_of_orbit CATEGORICAL,
         country_of_operator CATEGORICAL,
@@ -172,6 +175,7 @@
         MODEL perigee GIVEN apogee USING linreg,
         MODEL class_of_orbit GIVEN apogee, period, perigee
             USING forest (k = 4),
+        SUBSAMPLE 100,
         )
     '''.format(kepler_source))
 
diff --git a/tests/test_cgpm.py b/tests/test_cgpm.py
@@ -16,6 +16,7 @@
 
 import math
 import numpy as np
+import pytest
 import random                   # XXX
 
 #from cgpm.regressions.forest import RandomForest
@@ -28,6 +29,7 @@
 
 # XXX KLUDGE TAKEN FROM cgpm/tests/test_gpmcc_simple_composite.py
 from cgpm.cgpm import CGpm
+from cgpm.utils import general as gu
 class FourWay(CGpm):
     """Generates categorical(4) output on R2 valued input."""
 
@@ -178,9 +180,10 @@ def test_cgpm():
             ESTIMATE DEPENDENCE PROBABILITY
                 FROM PAIRWISE VARIABLES OF satellites
         ''').fetchall()
-        bdb.execute('''
-            ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
-        ''').fetchall()
+        with pytest.raises(AssertionError):
+            bdb.execute('''
+                ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
+            ''').fetchall()
         bdb.execute('''
             ESTIMATE PROBABILITY OF period = 42
                     GIVEN (apogee = 8 AND perigee = 7)