Skip to content

Commit d756b65

Browse files
Merge remote-tracking branch 'origin/20160628-fsaad-cgpm' into 20160624-riastradh-cgpm
2 parents 31c2a8c + 786fb61 commit d756b65

File tree

10 files changed

+214
-27
lines changed

10 files changed

+214
-27
lines changed

setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ def run_tests(self):
186186
lemonade = 'external/lemonade/dist'
187187
grammars = [
188188
'src/grammar.y',
189-
'src/metamodels/cgpm_grammar.y',
189+
'src/metamodels/cgpm_analyze/grammar.y',
190+
'src/metamodels/cgpm_schema/grammar.y',
190191
]
191192

192193
setup(
@@ -213,6 +214,8 @@ def run_tests(self):
213214
packages=[
214215
'bayeslite',
215216
'bayeslite.metamodels',
217+
'bayeslite.metamodels.cgpm_schema',
218+
'bayeslite.metamodels.cgpm_analyze',
216219
'bayeslite.plex',
217220
'bayeslite.shell',
218221
'bayeslite.weakprng',

src/metamodels/cgpm_analyze/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Copyright (c) 2010-2016, MIT Probabilistic Computing Project
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.

src/metamodels/cgpm_analyze/grammar.y

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
* Copyright (c) 2010-2016, MIT Probabilistic Computing Project
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
/*
18+
* Terminal conventions:
19+
* - T_ means a punctuation token.
20+
* - K_ means a keyword.
21+
* - L_ means a lexeme, which has useful associated text, e.g. an integer.
22+
*/
23+
24+
25+
anlaysis(start) ::= phrases(ps).
26+
27+
phrases(one) ::= phrase(p).
28+
phrases(many) ::= phrases(ps) T_SEMI phrase(p).
29+
30+
phrase(none) ::= .
31+
phrase(variables) ::= K_VARIABLES column_list(cols).
32+
phrase(skip) ::= K_SKIP column_list(cols).
33+
34+
column_list(one) ::= column_name(col).
35+
column_list(many) ::= column_list(cols) T_COMMA column_name(col).
36+
37+
column_name(n) ::= L_NAME(name).

src/metamodels/cgpm_analyze/parse.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Copyright (c) 2010-2016, MIT Probabilistic Computing Project
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from collections import namedtuple
18+
19+
from bayeslite.exception import BQLParseError
20+
from bayeslite.util import casefold
21+
22+
import grammar
23+
24+
'''
25+
grep -o 'K_[A-Z][A-Z0-9_]*' < grammar.y | sort -u | awk '
26+
{
27+
sub("^K_", "", $1);
28+
printf(" '\''%s'\'': grammar.K_%s,\n", tolower($1), $1);
29+
}'
30+
'''
31+
32+
KEYWORDS = {
33+
'skip': grammar.K_SKIP,
34+
'variables': grammar.K_VARIABLES,
35+
}
36+
37+
PUNCTUATION = {
38+
',': grammar.T_COMMA,
39+
';': grammar.T_SEMI,
40+
}
41+
42+
def parse(tokens):
43+
semantics = CGpmAnalyzeSemantics()
44+
parser = grammar.Parser(semantics)
45+
for token in tokenize(tokens):
46+
semantics.context.append(token)
47+
if len(semantics.context) > 10:
48+
semantics.context.pop(0)
49+
parser.feed(token)
50+
if semantics.failed or semantics.errors:
51+
raise BQLParseError('\n'.join(semantics.errors))
52+
assert semantics.phrases is not None
53+
return semantics.phrases
54+
55+
56+
def tokenize(tokens):
57+
for token in tokens:
58+
if isinstance(token, str):
59+
if casefold(token) in KEYWORDS:
60+
yield KEYWORDS[casefold(token)], token
61+
elif token in PUNCTUATION:
62+
yield PUNCTUATION[token], token
63+
else: # XXX check for alphanumeric/_
64+
yield grammar.L_NAME, token
65+
elif isinstance(token, (int, float)):
66+
yield grammar.L_NUMBER, token
67+
else:
68+
raise IOError('Invalid token: %r' % (token,))
69+
yield 0, '' # EOF
70+
71+
72+
class CGpmAnalyzeSemantics(object):
73+
def __init__(self):
74+
self.context = []
75+
self.errors = []
76+
self.failed = False
77+
self.phrases = None
78+
79+
def accept(self):
80+
pass
81+
def parse_failed(self):
82+
self.failed = True
83+
84+
def syntax_error(self, (token, text)):
85+
if token == -1: # error
86+
self.errors.append("Syntax error near [%s] after [%s]" % (
87+
text, ' '.join([str(t) for (_t, t) in self.context[:-1]])))
88+
89+
def p_anlaysis_start(self, ps): self.phrases = ps
90+
91+
def p_phrases_one(self, p): return [p] if p else []
92+
def p_phrases_many(self, ps, p):
93+
if p: ps.append(p)
94+
return ps
95+
96+
def p_phrase_none(self,): return None
97+
def p_phrase_variables(self, cols): return Variables(cols)
98+
def p_phrase_skip(self, cols): return Skip(cols)
99+
100+
def p_column_list_one(self, col): return [col]
101+
def p_column_list_many(self, cols, col): cols.append(col); return cols
102+
def p_column_name_n(self, name): return name
103+
104+
Variables = namedtuple('Variables', [
105+
'vars',
106+
])
107+
108+
Skip = namedtuple('Skip', [
109+
'vars',
110+
])

src/metamodels/cgpm_metamodel.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
from bayeslite.stats import arithmetic_mean
6666
from bayeslite.util import casefold
6767

68-
import cgpm_parse
68+
import cgpm_schema.parse
6969

7070
CGPM_SCHEMA_1 = '''
7171
INSERT INTO bayesdb_metamodel (name, version) VALUES ('cgpm', 1);
@@ -119,7 +119,7 @@ def register(self, bdb):
119119
' with unknown schema version: %d' % (version,))
120120

121121
def create_generator(self, bdb, generator_id, schema_tokens):
122-
schema_ast = cgpm_parse.parse(schema_tokens)
122+
schema_ast = cgpm_schema.parse.parse(schema_tokens)
123123
schema = _create_schema(bdb, generator_id, schema_ast)
124124

125125
# Store the schema.
@@ -636,7 +636,7 @@ def _retrieve_stattype_dist_params(var):
636636
# Process each clause one by one.
637637
for clause in schema_ast:
638638

639-
if isinstance(clause, cgpm_parse.Basic):
639+
if isinstance(clause, cgpm_schema.parse.Basic):
640640
# Basic Crosscat component model: one variable to be put
641641
# into Crosscat views.
642642
var = clause.var
@@ -665,7 +665,7 @@ def _retrieve_stattype_dist_params(var):
665665
variables.append([var, stattype, dist, params])
666666
modelled.add(var)
667667

668-
elif isinstance(clause, cgpm_parse.Foreign):
668+
elif isinstance(clause, cgpm_schema.parse.Foreign):
669669
# Foreign model: some set of output variables is to be
670670
# modelled by foreign logic, possibly conditional on some
671671
# set of input variables.
@@ -717,7 +717,7 @@ def _retrieve_stattype_dist_params(var):
717717
'kwds': kwds,
718718
})
719719

720-
elif isinstance(clause, cgpm_parse.Subsample):
720+
elif isinstance(clause, cgpm_schema.parse.Subsample):
721721
if subsample is not None:
722722
raise BQLError(bdb, 'Duplicate subsample: %r' % (clause.n,))
723723
subsample = clause.n

src/metamodels/cgpm_schema/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# Copyright (c) 2010-2016, MIT Probabilistic Computing Project
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.

src/metamodels/cgpm_grammar.y renamed to src/metamodels/cgpm_schema/grammar.y

File renamed without changes.

src/metamodels/cgpm_parse.py renamed to src/metamodels/cgpm_schema/parse.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,33 +19,33 @@
1919
from bayeslite.exception import BQLParseError
2020
from bayeslite.util import casefold
2121

22-
import cgpm_grammar
22+
import grammar
2323

2424
'''
25-
grep -o 'K_[A-Z][A-Z0-9_]*' < cgpm_grammar.y | sort -u | awk '
25+
grep -o 'K_[A-Z][A-Z0-9_]*' < grammar.y | sort -u | awk '
2626
{
2727
sub("^K_", "", $1);
28-
printf(" '\''%s'\'': cgpm_grammar.K_%s,\n", tolower($1), $1);
28+
printf(" '\''%s'\'': grammar.K_%s,\n", tolower($1), $1);
2929
}'
3030
'''
3131

3232
KEYWORDS = {
33-
'given': cgpm_grammar.K_GIVEN,
34-
'model': cgpm_grammar.K_MODEL,
35-
'subsample': cgpm_grammar.K_SUBSAMPLE,
36-
'using': cgpm_grammar.K_USING,
33+
'given': grammar.K_GIVEN,
34+
'model': grammar.K_MODEL,
35+
'subsample': grammar.K_SUBSAMPLE,
36+
'using': grammar.K_USING,
3737
}
3838

3939
PUNCTUATION = {
40-
'(': cgpm_grammar.T_LROUND,
41-
')': cgpm_grammar.T_RROUND,
42-
',': cgpm_grammar.T_COMMA,
43-
'=': cgpm_grammar.T_EQ,
40+
'(': grammar.T_LROUND,
41+
')': grammar.T_RROUND,
42+
',': grammar.T_COMMA,
43+
'=': grammar.T_EQ,
4444
}
4545

4646
def parse(tokenses):
4747
semantics = CGPM_Semantics()
48-
parser = cgpm_grammar.Parser(semantics)
48+
parser = grammar.Parser(semantics)
4949
for token in tokenize(tokenses):
5050
semantics.context.append(token)
5151
if len(semantics.context) > 10:
@@ -64,9 +64,9 @@ def tokenize(tokenses):
6464
elif token in PUNCTUATION:
6565
yield PUNCTUATION[token], token
6666
else: # XXX check for alphanumeric/_
67-
yield cgpm_grammar.L_NAME, token
67+
yield grammar.L_NAME, token
6868
elif isinstance(token, (int, float)):
69-
yield cgpm_grammar.L_NUMBER, token
69+
yield grammar.L_NUMBER, token
7070
else:
7171
raise IOError('Invalid token: %r' % (token,))
7272
yield 0, '' # EOF

tests/foobar.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112

113113
bdb.sql_execute('''
114114
CREATE TABLE satellites_ucs (
115+
name,
115116
apogee,
116117
class_of_orbit,
117118
country_of_operator,
@@ -125,22 +126,24 @@
125126
('geo', lambda x, y: x + y**2),
126127
('leo', lambda x, y: math.sin(x + y)),
127128
]:
128-
for x in xrange(10):
129+
for x in xrange(1000):
129130
for y in xrange(10):
130131
countries = ['US', 'Russia', 'China', 'Bulgaria']
131132
country = countries[random.randrange(len(countries))]
133+
name = 'sat-%s-%d' % (country, random.randrange(10**8))
132134
mass = random.gauss(1000, 50)
133135
bdb.sql_execute('''
134136
INSERT INTO satellites_ucs
135-
(country_of_operator, launch_mass, class_of_orbit,
137+
(name, country_of_operator, launch_mass, class_of_orbit,
136138
apogee, perigee, period)
137-
VALUES (?,?,?,?,?,?)
138-
''', (country, mass, l, x, y, f(x, y)))
139+
VALUES (?,?,?,?,?,?,?)
140+
''', (name, country, mass, l, x, y, f(x, y)))
139141

140142
D = bdb.sql_execute('SELECT * FROM satellites_ucs').fetchall()
141143

142144
bdb.execute('''
143145
CREATE POPULATION satellites FOR satellites_ucs (
146+
name IGNORE,
144147
apogee NUMERICAL,
145148
class_of_orbit CATEGORICAL,
146149
country_of_operator CATEGORICAL,
@@ -172,6 +175,7 @@
172175
MODEL perigee GIVEN apogee USING linreg,
173176
MODEL class_of_orbit GIVEN apogee, period, perigee
174177
USING forest (k = 4),
178+
SUBSAMPLE 100,
175179
)
176180
'''.format(kepler_source))
177181

tests/test_cgpm.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import math
1818
import numpy as np
19+
import pytest
1920
import random # XXX
2021

2122
#from cgpm.regressions.forest import RandomForest
@@ -28,6 +29,7 @@
2829

2930
# XXX KLUDGE TAKEN FROM cgpm/tests/test_gpmcc_simple_composite.py
3031
from cgpm.cgpm import CGpm
32+
from cgpm.utils import general as gu
3133
class FourWay(CGpm):
3234
"""Generates categorical(4) output on R2 valued input."""
3335

@@ -178,9 +180,10 @@ def test_cgpm():
178180
ESTIMATE DEPENDENCE PROBABILITY
179181
FROM PAIRWISE VARIABLES OF satellites
180182
''').fetchall()
181-
bdb.execute('''
182-
ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
183-
''').fetchall()
183+
with pytest.raises(AssertionError):
184+
bdb.execute('''
185+
ESTIMATE PREDICTIVE PROBABILITY OF period FROM satellites
186+
''').fetchall()
184187
bdb.execute('''
185188
ESTIMATE PROBABILITY OF period = 42
186189
GIVEN (apogee = 8 AND perigee = 7)

0 commit comments

Comments
 (0)