Skip to content

Commit 1e0bccb

Browse files
add json schema generation (#175)
Co-authored-by: Dylan Sprayberry <[email protected]>
1 parent f0e1e6e commit 1e0bccb

File tree

8 files changed

+225
-54
lines changed

8 files changed

+225
-54
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## 6.2.0
4+
* Adds json schema generation [#175](https://github.com/singer-io/singer-python/pull/175)
5+
36
## 6.1.0
47
* Make ensure_ascii Dynamic with Default Set to True in JSON Serialization. Required to handle the special characters [#168](https://github.com/singer-io/singer-python/pull/168)
58

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import subprocess
55

66
setup(name="singer-python",
7-
version='6.1.1',
7+
version='6.2.0',
88
description="Singer.io utility library",
99
author="Stitch",
1010
classifiers=['Programming Language :: Python :: 3 :: Only'],

singer/schema_generation.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import dateutil.parser
2+
3+
4+
def add_observation(acc, path):
    """Record a single observed type path in the accumulator.

    `acc` is a tree of nested dicts. Every element of `path` except the
    last names one level of that tree (missing levels are created on
    demand); the last element is stored in the innermost dict as a key
    mapped to ``True``.

    Args:
        acc: The nested-dict accumulator. It is mutated in place.
        path: Non-empty list of keys describing where the observation goes.

    Raises:
        IndexError: If `path` is empty.
    """
    node = acc
    # Descend through (and lazily create) every intermediate level.
    for key in path[:-1]:
        node = node.setdefault(key, {})

    # The final path element is the leaf marker for the observed type.
    node[path[-1]] = True
14+
15+
# pylint: disable=too-many-branches
16+
def _ensure_node(acc, path):
    """Return the nested dict reached by following `path` from `acc`, creating missing levels."""
    node = acc
    for key in path:
        node = node.setdefault(key, {})
    return node


def add_observations(acc, path, data):
    """Recursively record every type seen in `data` into the accumulator.

    Dicts are recorded under an ``"object"`` level followed by the key,
    lists under an ``"array"`` level, and scalars as a leaf named
    ``"date"``, ``"string"``, ``"boolean"``, ``"integer"``, ``"number"``
    or ``"null"``.

    Args:
        acc: Nested-dict accumulator of observations. It is mutated in place.
        path: List of keys locating `data` inside `acc`.
        data: The value to inspect.

    Returns:
        The same `acc` object that was passed in.

    Raises:
        Exception: If `data` is of a type not handled above.
    """
    if isinstance(data, dict):
        # Register the container itself so that an empty object is still
        # observed instead of silently vanishing from the result.
        _ensure_node(acc, path + ["object"])
        for key in data:
            add_observations(acc, path + ["object", key], data[key])
    elif isinstance(data, list):
        # Same reasoning as for dicts: an empty list is still an array.
        _ensure_node(acc, path + ["array"])
        for item in data:
            add_observations(acc, path + ["array"], item)
    elif isinstance(data, str):
        # If the string parses as a date, add an observation that it's a date.
        # ValueError is caught rather than dateutil.parser.ParserError:
        # ParserError is a ValueError subclass, and dateutil releases that
        # predate it raise plain ValueError.
        try:
            dateutil.parser.parse(data)
        except (ValueError, OverflowError):
            add_observation(acc, path + ["string"])
        else:
            add_observation(acc, path + ["date"])

    # bool must be tested before int because bool is a subclass of int.
    elif isinstance(data, bool):
        add_observation(acc, path + ["boolean"])
    elif isinstance(data, int):
        add_observation(acc, path + ["integer"])
    elif isinstance(data, float):
        add_observation(acc, path + ["number"])
    elif data is None:
        add_observation(acc, path + ["null"])
    else:
        raise Exception("Unexpected value " + repr(data) + " at path " + repr(path))

    return acc
46+
47+
def to_json_schema(obs):
    """Convert a tree of type observations into a JSON schema dict.

    Every generated schema is nullable: ``'null'`` is always the first
    entry of ``'type'``. Each JSON Schema type appears at most once in
    ``'type'``, even when several observation kinds map to it (``date``,
    ``string`` and ``number`` all map to ``'string'``).

    A ``'format'`` is emitted only when it is unambiguous: exactly one of
    ``date`` (``'date-time'``) or ``number`` (``'singer.decimal'``) was
    observed and no plain ``string`` was. Otherwise the format would
    reject some of the values the schema was generated from, and its
    value would depend on observation order.

    Args:
        obs: Dict keyed by observation kind (``'object'``, ``'array'``,
            ``'date'``, ``'string'``, ``'boolean'``, ``'integer'``,
            ``'number'``, ``'null'``). ``obs['object']`` maps property
            names to nested observation dicts and ``obs['array']`` is the
            nested observation dict for the items.

    Returns:
        A dict with a ``'type'`` list and, where applicable,
        ``'properties'``, ``'items'`` and ``'format'``.

    Raises:
        Exception: If `obs` contains an unknown observation kind.
    """
    # Observation kinds that map straight to a JSON Schema type.
    simple_types = {'string': 'string', 'boolean': 'boolean', 'integer': 'integer'}

    types = ['null']
    formats = []
    result = {'type': types}

    def add_type(name):
        # JSON Schema requires the entries of a "type" array to be unique.
        if name not in types:
            types.append(name)

    for key in obs:
        if key == 'object':
            add_type('object')
            result['properties'] = {
                name: to_json_schema(child)
                for name, child in obs['object'].items()
            }
        elif key == 'array':
            add_type('array')
            result['items'] = to_json_schema(obs['array'])
        elif key == 'date':
            add_type('string')
            formats.append('date-time')
        elif key in simple_types:
            add_type(simple_types[key])
        elif key == 'number':
            # Use type=string, format=singer.decimal
            add_type('string')
            formats.append('singer.decimal')
        elif key == 'null':
            # 'null' is already part of every schema.
            pass
        else:
            raise Exception("Unexpected data type " + key)

    # A format constrains every string value, so only emit it when a single
    # string-backed kind was observed.
    if len(formats) == 1 and 'string' not in obs:
        result['format'] = formats[0]

    return result
87+
88+
def generate_schema(records):
    """Build a JSON schema describing all of the given records.

    Each record is folded into one shared tree of type observations,
    which is then converted into a schema dict.

    Args:
        records: Iterable of record values to inspect.

    Returns:
        The schema dict produced by `to_json_schema` for the combined
        observations.
    """
    observations = {}
    for rec in records:
        # Start every record at the root of the observation tree.
        root_path = []
        observations = add_observations(observations, root_path, rec)

    schema = to_json_schema(observations)
    return schema

tests/test_catalog.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def test_one_selected_stream(self):
2525
CatalogEntry(tap_stream_id='c',schema=Schema(),metadata=[])])
2626
state = {}
2727
selected_streams = catalog.get_selected_streams(state)
28-
self.assertEquals([e for e in selected_streams],[selected_entry])
28+
self.assertEqual([e for e in selected_streams],[selected_entry])
2929

3030
def test_resumes_currently_syncing_stream(self):
3131
selected_entry_a = CatalogEntry(tap_stream_id='a',
@@ -44,7 +44,7 @@ def test_resumes_currently_syncing_stream(self):
4444
selected_entry_c])
4545
state = {'currently_syncing': 'c'}
4646
selected_streams = catalog.get_selected_streams(state)
47-
self.assertEquals([e for e in selected_streams][0],selected_entry_c)
47+
self.assertEqual([e for e in selected_streams][0],selected_entry_c)
4848

4949
class TestToDictAndFromDict(unittest.TestCase):
5050

@@ -141,4 +141,4 @@ def test(self):
141141
CatalogEntry(tap_stream_id='b'),
142142
CatalogEntry(tap_stream_id='c')])
143143
entry = catalog.get_stream('b')
144-
self.assertEquals('b', entry.tap_stream_id)
144+
self.assertEqual('b', entry.tap_stream_id)

tests/test_exceptions.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ def test_SingerError_prints_correctly(self):
1414
raise SingerError(error_text)
1515

1616
expected_text = "SingerError\n" + error_text
17-
self.assertEquals(expected_text,
18-
str(test_run.exception))
17+
self.assertEqual(expected_text,
18+
str(test_run.exception))
1919

2020
def test_SingerConfigurationError_prints_correctly(self):
2121
error_text = "An error occured"
@@ -24,8 +24,8 @@ def test_SingerConfigurationError_prints_correctly(self):
2424
raise SingerConfigurationError(error_text)
2525

2626
expected_text = "SingerConfigurationError\n" + error_text
27-
self.assertEquals(expected_text,
28-
str(test_run.exception))
27+
self.assertEqual(expected_text,
28+
str(test_run.exception))
2929

3030
def test_SingerDiscoveryError_prints_correctly(self):
3131
error_text = "An error occured"
@@ -34,8 +34,8 @@ def test_SingerDiscoveryError_prints_correctly(self):
3434
raise SingerDiscoveryError(error_text)
3535

3636
expected_text = "SingerDiscoveryError\n" + error_text
37-
self.assertEquals(expected_text,
38-
str(test_run.exception))
37+
self.assertEqual(expected_text,
38+
str(test_run.exception))
3939

4040
def test_SingerSyncError_prints_correctly(self):
4141
error_text = "An error occured"
@@ -44,8 +44,8 @@ def test_SingerSyncError_prints_correctly(self):
4444
raise SingerSyncError(error_text)
4545

4646
expected_text = "SingerSyncError\n" + error_text
47-
self.assertEquals(expected_text,
48-
str(test_run.exception))
47+
self.assertEqual(expected_text,
48+
str(test_run.exception))
4949

5050
def test_SingerRetryableRequestError_prints_correctly(self):
5151
error_text = "An error occured"
@@ -54,8 +54,8 @@ def test_SingerRetryableRequestError_prints_correctly(self):
5454
raise SingerRetryableRequestError(error_text)
5555

5656
expected_text = "SingerRetryableRequestError\n" + error_text
57-
self.assertEquals(expected_text,
58-
str(test_run.exception))
57+
self.assertEqual(expected_text,
58+
str(test_run.exception))
5959

6060
def test_SingerError_prints_multiple_lines_correctly(self):
6161
error_text = "\n".join(["Line 1", "Line 2", "Line 3"])
@@ -64,5 +64,5 @@ def test_SingerError_prints_multiple_lines_correctly(self):
6464
raise SingerError(error_text)
6565

6666
expected_text = "SingerError\n" + error_text
67-
self.assertEquals(expected_text,
68-
str(test_run.exception))
67+
self.assertEqual(expected_text,
68+
str(test_run.exception))

tests/test_schema.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,38 +44,38 @@ class TestSchema(unittest.TestCase):
4444
additionalProperties=True)
4545

4646
def test_string_to_dict(self):
47-
self.assertEquals(self.string_dict, self.string_obj.to_dict())
47+
self.assertEqual(self.string_dict, self.string_obj.to_dict())
4848

4949
def test_integer_to_dict(self):
50-
self.assertEquals(self.integer_dict, self.integer_obj.to_dict())
50+
self.assertEqual(self.integer_dict, self.integer_obj.to_dict())
5151

5252
def test_array_to_dict(self):
53-
self.assertEquals(self.array_dict, self.array_obj.to_dict())
53+
self.assertEqual(self.array_dict, self.array_obj.to_dict())
5454

5555
def test_object_to_dict(self):
56-
self.assertEquals(self.object_dict, self.object_obj.to_dict())
56+
self.assertEqual(self.object_dict, self.object_obj.to_dict())
5757

5858
def test_string_from_dict(self):
59-
self.assertEquals(self.string_obj, Schema.from_dict(self.string_dict))
59+
self.assertEqual(self.string_obj, Schema.from_dict(self.string_dict))
6060

6161
def test_integer_from_dict(self):
62-
self.assertEquals(self.integer_obj, Schema.from_dict(self.integer_dict))
62+
self.assertEqual(self.integer_obj, Schema.from_dict(self.integer_dict))
6363

6464
def test_array_from_dict(self):
65-
self.assertEquals(self.array_obj, Schema.from_dict(self.array_dict))
65+
self.assertEqual(self.array_obj, Schema.from_dict(self.array_dict))
6666

6767
def test_object_from_dict(self):
68-
self.assertEquals(self.object_obj, Schema.from_dict(self.object_dict))
68+
self.assertEqual(self.object_obj, Schema.from_dict(self.object_dict))
6969

7070
def test_repr_atomic(self):
71-
self.assertEquals(self.string_obj, eval(repr(self.string_obj)))
71+
self.assertEqual(self.string_obj, eval(repr(self.string_obj)))
7272

7373
def test_repr_recursive(self):
74-
self.assertEquals(self.object_obj, eval(repr(self.object_obj)))
74+
self.assertEqual(self.object_obj, eval(repr(self.object_obj)))
7575

7676
def test_object_from_dict_with_defaults(self):
7777
schema = Schema.from_dict(self.object_dict, inclusion='automatic')
78-
self.assertEquals('whatever', schema.inclusion,
79-
msg='The schema value should override the default')
80-
self.assertEquals('automatic', schema.properties['a_string'].inclusion)
81-
self.assertEquals('automatic', schema.properties['an_array'].items.inclusion)
78+
self.assertEqual('whatever', schema.inclusion,
79+
msg='The schema value should override the default')
80+
self.assertEqual('automatic', schema.properties['a_string'].inclusion)
81+
self.assertEqual('automatic', schema.properties['an_array'].items.inclusion)

tests/test_schema_generation.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import unittest
2+
from singer.schema_generation import generate_schema
3+
4+
class TestSchemaGeneration(unittest.TestCase):
    """Checks the schemas that `generate_schema` derives from sample records."""

    def test_simple_schema(self):
        """A single flat record yields one nullable property per field."""
        sample = [{'a': 1, 'b': 'two', 'c': True, 'dt': '2000-01-01T00:11:22Z'}]

        properties = {
            'a': {'type': ['null', 'integer']},
            'b': {'type': ['null', 'string']},
            'c': {'type': ['null', 'boolean']},
            'dt': {'type': ['null', 'string'], 'format': 'date-time'},
        }
        expected_schema = {'type': ['null', 'object'], 'properties': properties}

        self.assertEqual(expected_schema, generate_schema(sample))

    def test_mix_n_match_records_schema(self):
        """Fields whose type varies between records collect every observed type."""
        sample = [
            {'a': 1, 'b': 'b'},
            {'a': 'two', 'c': 7, 'd': [1, 'two']},
            {'a': True, 'c': 7.7, 'd': {'one': 1, 'two': 'two'}},
        ]

        d_schema = {
            'type': {'null', 'array', 'object'},
            'items': {'type': {'null', 'integer', 'string'}},
            'properties': {
                'one': {'type': ['null', 'integer']},
                'two': {'type': ['null', 'string']},
            },
        }
        expected_schema = {
            'type': ['null', 'object'],
            'properties': {
                'a': {'type': {'null', 'integer', 'string', 'boolean'}},
                'b': {'type': ['null', 'string']},
                'c': {'type': {'null', 'integer', 'string'}, 'format': 'singer.decimal'},
                'd': d_schema,
            },
        }

        actual_schema = generate_schema(sample)

        # The order of multi-type lists is not pinned by this test, so those
        # lists are compared as sets.
        props = actual_schema['properties']
        for name in ('a', 'c', 'd'):
            props[name]['type'] = set(props[name]['type'])
        props['d']['items']['type'] = set(props['d']['items']['type'])

        self.assertEqual(expected_schema, actual_schema)

    def test_nested_structue_schema(self):
        """Nested objects and arrays are described recursively."""
        sample = [{'a': {'b': {'c': [{'d': 7}]}, 'e': [[1, 2, 3]]}}]

        # Build the expected schema from the innermost level outwards.
        c_schema = {
            'type': ['null', 'array'],
            'items': {
                'type': ['null', 'object'],
                'properties': {'d': {'type': ['null', 'integer']}},
            },
        }
        b_schema = {
            'type': ['null', 'object'],
            'properties': {'c': c_schema},
        }
        e_schema = {
            'type': ['null', 'array'],
            'items': {
                'type': ['null', 'array'],
                'items': {'type': ['null', 'integer']},
            },
        }
        a_schema = {
            'type': ['null', 'object'],
            'properties': {'b': b_schema, 'e': e_schema},
        }
        expected_schema = {
            'type': ['null', 'object'],
            'properties': {'a': a_schema},
        }

        self.assertEqual(expected_schema, generate_schema(sample))

0 commit comments

Comments
 (0)