Skip to content

Commit b2937e2

Browse files
authored
Merge pull request #79 from moj-analytical-services/write-glue-spec
Write glue spec
2 parents 6a44b2c + 2da2551 commit b2937e2

File tree

7 files changed

+41
-9
lines changed

7 files changed

+41
-9
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
55
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
66

7+
## v2.2.0
8+
### Added
9+
- Fixed bug where `glue_specific` would not be written to json or appear as a key in the dictionary returned by the TableMeta class `to_dict()` method.
10+
- Fixed bug where default table ddl templates would be overwritten, causing mixed table definitions (see issue no. 80 for a specific example and fix).
11+
- If the meta `partitions` property is None or an empty list, it is no longer passed to the dict (and therefore not to json).
12+
- If the meta `glue_specific` property is None or an empty dict, it is no longer passed to the dict (and therefore not to json).
13+
714
## v2.1.2
815
### Added
916
- DatabaseMeta method `test_column_types_align` now tests that all column types match across all tables in the database object.

MANIFEST.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ include etl_manager/specs/glue_job.json
33
include etl_manager/specs/base.json
44
include etl_manager/specs/orc_specific.json
55
include etl_manager/specs/csv_quoted_nodate_specific.json
6-
include etl_manager/specs/par_specific.json
6+
include etl_manager/specs/parquet_specific.json
77
include etl_manager/specs/csv_specific.json
88
include etl_manager/specs/json_specific.json
99
include etl_manager/specs/regex_specific.json

etl_manager/meta.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
_s3_resource,
1313
_remove_final_slash
1414
)
15-
from copy import copy
15+
import copy
1616
import string
1717
import json
1818
import os
@@ -27,8 +27,7 @@
2727
"csv_quoted_nodate": json.load(pkg_resources.resource_stream(__name__, "specs/csv_quoted_nodate_specific.json")),
2828
"regex": json.load(pkg_resources.resource_stream(__name__, "specs/regex_specific.json")),
2929
"orc": json.load(pkg_resources.resource_stream(__name__, "specs/orc_specific.json")),
30-
"par": json.load(pkg_resources.resource_stream(__name__, "specs/par_specific.json")),
31-
"parquet": json.load(pkg_resources.resource_stream(__name__, "specs/par_specific.json")),
30+
"parquet": json.load(pkg_resources.resource_stream(__name__, "specs/parquet_specific.json")),
3231
"json": json.load(pkg_resources.resource_stream(__name__, "specs/json_specific.json"))
3332
}
3433

@@ -47,7 +46,7 @@ def _get_spec(spec_name) :
4746
if spec_name not in _template :
4847
raise ValueError("spec_name/data_type requested ({}) is not a valid spec/data_type".format(spec_name))
4948

50-
return copy(_template[spec_name])
49+
return copy.deepcopy(_template[spec_name])
5150

5251
class TableMeta :
5352
"""
@@ -273,9 +272,14 @@ def to_dict(self) :
273272
"description" : self.description,
274273
"data_format" : self.data_format,
275274
"columns" : self.columns,
276-
"partitions" : self.partitions,
277-
"location" : self.location
275+
"location" : self.location,
278276
}
277+
if bool(self.partitions) :
278+
meta['partitions'] = self.partitions
279+
280+
if bool(self.glue_specific) :
281+
meta['glue_specific'] = self.glue_specific
282+
279283
return meta
280284

281285
def write_to_json(self, file_path) :

etl_manager/specs/table_schema.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@
6565
"examples": [
6666
"sop_full/"
6767
]
68+
},
69+
"glue_specific" : {
70+
"type" : ["null", "object"],
71+
"title": "Dict used to add any additional table properties for glue catalogue. For an example see here: https://github.com/moj-analytical-services/etl_manager/blob/master/example/meta_data/db1/pay.json#L19"
6872
}
6973
}
7074
}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name='etl_manager',
5-
version='2.1.2',
5+
version='2.2.0',
66
packages=find_packages(exclude=['tests*']),
77
license='MIT',
88
description='A python package to manage etl processes on AWS',

tests/test_tests.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
import unittest
8-
from etl_manager.meta import DatabaseMeta, TableMeta, read_database_folder, read_table_json, _agnostic_to_glue_spark_dict, MetaColumnTypeMismatch
8+
from etl_manager.meta import DatabaseMeta, TableMeta, read_database_folder, read_table_json, _agnostic_to_glue_spark_dict, MetaColumnTypeMismatch, _get_spec
99
from etl_manager.utils import _end_with_slash, _validate_string, _glue_client, read_json, _remove_final_slash
1010
from etl_manager.etl import GlueJob
1111
import boto3
@@ -158,6 +158,16 @@ def test_table_to_dict(self) :
158158

159159
self.assertDictEqual(test_dict, expected_dict)
160160

161+
# Test file with glue specific
162+
expected_dict2 = read_json('example/meta_data/db1/pay.json')
163+
test_dict2 = db.table('pay').to_dict()
164+
165+
# Null out schema as may need changing when on branch but still need to unit test
166+
expected_dict2["$schema"] = ''
167+
test_dict2["$schema"] = ''
168+
169+
self.assertDictEqual(test_dict2, expected_dict2)
170+
161171
def test_db_table_names(self) :
162172
db = read_database_folder('example/meta_data/db1/')
163173
t = all(t in ['teams', 'employees', 'pay'] for t in db.table_names)
@@ -189,6 +199,13 @@ def test_glue_specific_table(self):
189199
glue_def = t.glue_table_definition("db_path")
190200
self.assertTrue(t.glue_table_definition("db_path")["Parameters"]['skip.header.line.count'] == '1')
191201

202+
def test_glue_table_definition_doesnt_overwrite_base_spec(self) :
203+
expected_dict = _get_spec('base')
204+
db = read_database_folder('example/meta_data/db1/')
205+
glue_def_dump = db.table('pay').glue_table_definition()
206+
207+
self.assertDictEqual(expected_dict, _get_spec('base'))
208+
192209
def test_add_remove_table(self) :
193210
db = read_database_folder('example/meta_data/db1/')
194211
self.assertRaises(ValueError, db.remove_table, 'not_a_table')

0 commit comments

Comments
 (0)