Skip to content

Commit d3a3690

Browse files
authored
Merge pull request #27 from cmap/chunking_and_compression
Chunking and compression
2 parents fef095f + 8f2f4a4 commit d3a3690

File tree

5 files changed

+100
-23
lines changed

5 files changed

+100
-23
lines changed

.travis.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@ language: python
33
# python versioning
44
python:
55
- 2.7
6-
- 3.4
7-
- 3.5
8-
- 3.6
96

107
# requirements
118
install:

cmapPy/pandasGEXpress/tests/test_write_gctx.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,37 @@ def test_write_version(self):
7979
self.assertEqual(hdf5_v2, write_gctx.version_number)
8080
os.remove(fn)
8181

82+
def test_calculate_elem_per_kb(self):
83+
max_chunk_kb = 1024
84+
85+
# dtype is numpy.float32
86+
dtype1 = numpy.float32
87+
correct_elem_per_kb1 = 256
88+
elem_per_kb1 = write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype1)
89+
self.assertEqual(elem_per_kb1, correct_elem_per_kb1)
90+
91+
# dtype is numpy.float64
92+
dtype2 = numpy.float64
93+
correct_elem_per_kb2 = 128
94+
elem_per_kb2 = write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype2)
95+
self.assertEqual(elem_per_kb2, correct_elem_per_kb2)
96+
97+
# dtype is something else
98+
dtype3 = numpy.int
99+
with self.assertRaises(Exception) as context:
100+
write_gctx.calculate_elem_per_kb(max_chunk_kb, dtype3)
101+
self.assertTrue("only numpy.float32 and numpy.float64 are currently supported" in str(context.exception))
102+
103+
104+
def test_set_data_matrix_chunk_size(self):
105+
max_chunk_kb = 1024
106+
elem_per_kb = 256
107+
sample_data_shape = (978, 1000)
108+
expected_chunk_size = (978, 268)
109+
calculated_chunk_size = write_gctx.set_data_matrix_chunk_size(sample_data_shape, max_chunk_kb, elem_per_kb)
110+
self.assertEqual(calculated_chunk_size, expected_chunk_size)
111+
112+
82113
def test_write_metadata(self):
83114
"""
84115
CASE 1:
@@ -87,8 +118,8 @@ def test_write_metadata(self):
87118
"""
88119
mini_gctoo = mini_gctoo_for_testing.make(convert_neg_666=False)
89120
hdf5_writer = h5py.File(FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx", "w")
90-
write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False)
91-
write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False)
121+
write_gctx.write_metadata(hdf5_writer, "row", mini_gctoo.row_metadata_df, False, 6)
122+
write_gctx.write_metadata(hdf5_writer, "col", mini_gctoo.col_metadata_df, False, 6)
92123
hdf5_writer.close()
93124
logger.debug("Wrote mini_gctoo_metadata.gctx to {}".format(
94125
os.path.join(FUNCTIONAL_TESTS_PATH, "mini_gctoo_metadata.gctx")))
@@ -142,8 +173,8 @@ def test_write_metadata(self):
142173
# write row and col metadata fields from mini_gctoo_for_testing instance to file
143174
# Note this time does convert back to -666
144175
hdf5_writer = h5py.File(FUNCTIONAL_TESTS_PATH + "/mini_gctoo_metadata.gctx", "w")
145-
write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True)
146-
write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True)
176+
write_gctx.write_metadata(hdf5_writer, "row", converted_row_metadata, True, 6)
177+
write_gctx.write_metadata(hdf5_writer, "col", converted_col_metadata, True, 6)
147178
hdf5_writer.close()
148179

149180
# read in written metadata, then close and delete file

cmapPy/pandasGEXpress/write_gctx.py

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
version_number = "GCTX1.0"
1919

2020

21-
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True):
21+
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
22+
max_chunk_kb=1024):
2223
"""
2324
Essentially the same as write() method; enables user to call write_gctx() from
2425
cmapPy instead of write_gctx.write()
@@ -28,13 +29,18 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True):
2829
write(gctoo_object, out_file_name, convert_back_to_neg_666)
2930

3031

31-
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True):
32+
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
33+
max_chunk_kb=1024, matrix_dtype=numpy.float32):
3234
"""
3335
Writes a GCToo instance to specified file.
3436
3537
Input:
3638
- gctoo_object (GCToo): A GCToo instance.
3739
- out_file_name (str): file name to write gctoo_object to.
40+
- convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666"
41+
- gzip_compression_level (int, default=6): Compression level to use for metadata.
42+
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
43+
- matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
3844
"""
3945
# make sure out file has a .gctx suffix
4046
gctx_out_name = add_gctx_to_out_name(out_file_name)
@@ -48,14 +54,21 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True):
4854
# write src
4955
write_src(hdf5_out, gctoo_object, gctx_out_name)
5056

57+
# set chunk size for data matrix
58+
elem_per_kb = calculate_elem_per_kb(max_chunk_kb, matrix_dtype)
59+
chunk_size = set_data_matrix_chunk_size(gctoo_object.data_df.shape, max_chunk_kb, elem_per_kb)
60+
5161
# write data matrix
52-
hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix())
62+
hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix(),
63+
dtype=matrix_dtype)
5364

5465
# write col metadata
55-
write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666)
66+
write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666,
67+
gzip_compression=gzip_compression_level)
5668

5769
# write row metadata
58-
write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666)
70+
write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666,
71+
gzip_compression=gzip_compression_level)
5972

6073
# close gctx file
6174
hdf5_out.close()
@@ -101,8 +114,46 @@ def write_version(hdf5_out):
101114
"""
102115
hdf5_out.attrs[version_attr] = numpy.string_(version_number)
103116

117+
def calculate_elem_per_kb(max_chunk_kb, matrix_dtype):
118+
"""
119+
Calculates the number of elem per kb depending on the max chunk size set.
120+
121+
Input:
122+
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
123+
- matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
124+
Currently needs to be np.float32 or np.float64 (TODO: figure out a better way to get bits from a numpy dtype).
125+
126+
Returns:
127+
elem_per_kb (int), the number of elements per kb for matrix dtype specified.
128+
"""
129+
if matrix_dtype == numpy.float32:
130+
return (max_chunk_kb * 8)/32
131+
elif matrix_dtype == numpy.float64:
132+
return (max_chunk_kb * 8)/64
133+
else:
134+
msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported".format(matrix_dtype)
135+
logger.error(msg)
136+
raise Exception("write_gctx.calculate_elem_per_kb " + msg)
104137

105-
def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
138+
139+
def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb):
140+
"""
141+
Sets chunk size to use for writing data matrix.
142+
Note. Calculation used here is for compatibility with cmapM and cmapR.
143+
144+
Input:
145+
- df_shape (tuple): shape of input data_df.
146+
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
147+
- elem_per_kb (int): Number of elements per kb
148+
149+
Returns:
150+
chunk size (tuple) to use for chunking the data matrix
151+
"""
152+
row_chunk_size = min(df_shape[0], 1000)
153+
col_chunk_size = min(((max_chunk_kb*elem_per_kb)//row_chunk_size), df_shape[1])
154+
return (row_chunk_size, col_chunk_size)
155+
156+
def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666, gzip_compression):
106157
"""
107158
Writes either column or row metadata to proper node of gctx out (hdf5) file.
108159
@@ -123,7 +174,8 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
123174
logger.error("'dim' argument must be either 'row' or 'col'!")
124175

125176
# write id field to expected node
126-
hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index])
177+
hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index],
178+
compression=gzip_compression)
127179

128180
metadata_fields = list(metadata_df.columns.copy())
129181

@@ -135,4 +187,5 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
135187
# write metadata columns to their own arrays
136188
for field in [entry for entry in metadata_fields if entry != "ind"]:
137189
hdf5_out.create_dataset(metadata_node_name + "/" + field,
138-
data=numpy.array(list(metadata_df.loc[:, field])))
190+
data=numpy.array(list(metadata_df.loc[:, field])),
191+
compression=gzip_compression)

setup.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[bdist_wheel]
2-
# Only Python 2.7 supported; some versions of Python 3 support as well
3-
universal=1
2+
# Only Python 2.7 supported
3+
universal=0

setup.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# Versions should comply with PEP440. For a discussion on single-sourcing
1313
# the version across setup.py and the project code, see
1414
# https://packaging.python.org/en/latest/single_source_version.html
15-
version='2.2.0',
15+
version='3.0.0',
1616

1717
description='Assorted tools for interacting with .gct, .gctx files and other Connectivity Map (Broad Institute) data/tools',
1818
long_description="cmapPy: Tools for interacting with .gctx and .gct files, and other Connectivity Map resources. See our documentation at http://cmappy.readthedocs.io/en/latest/, and for more information on the file formats and available resources, please see clue.io/gctx.",
@@ -45,11 +45,7 @@
4545
# Specify the Python versions you support here. In particular, ensure
4646
# that you indicate whether you support Python 2, Python 3 or both.
4747
'Programming Language :: Python :: 2',
48-
'Programming Language :: Python :: 2.7',
49-
'Programming Language :: Python :: 3',
50-
'Programming Language :: Python :: 3.4',
51-
'Programming Language :: Python :: 3.5',
52-
'Programming Language :: Python :: 3.6',
48+
'Programming Language :: Python :: 2.7'
5349
],
5450

5551
# What does your project relate to?

0 commit comments

Comments
 (0)