Skip to content

Commit 504dc7c

Browse files
committed
added documentation
1 parent ad31ccc commit 504dc7c

File tree

1 file changed

+48
-4
lines changed

1 file changed

+48
-4
lines changed

cmapPy/pandasGEXpress/write_gctx.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
version_number = "GCTX1.0"
1919

2020

21-
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6):
21+
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
22+
max_chunk_kb=1024):
2223
"""
2324
Essentially the same as write() method; enables user to call write_gctx() from
2425
cmapPy instead of write_gctx.write()
@@ -28,7 +29,8 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_c
2829
write(gctoo_object, out_file_name, convert_back_to_neg_666)
2930

3031

31-
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6):
32+
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
33+
max_chunk_kb=1024, matrix_dtype=numpy.float32):
3234
"""
3335
Writes a GCToo instance to specified file.
3436
@@ -37,6 +39,8 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compre
3739
- out_file_name (str): file name to write gctoo_object to.
3840
- convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666"
3941
- gzip_compression_level (int, default=6): Compression level to use for metadata.
42+
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
43+
- matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
4044
"""
4145
# make sure out file has a .gctx suffix
4246
gctx_out_name = add_gctx_to_out_name(out_file_name)
@@ -50,11 +54,13 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compre
5054
# write src
5155
write_src(hdf5_out, gctoo_object, gctx_out_name)
5256

53-
# TODO: set chunk size
57+
# set chunk size for data matrix
58+
elem_per_kb = calculate_elem_per_kb(max_chunk_kb, matrix_dtype)
59+
chunk_size = set_data_matrix_chunk_size(gctoo_object.data_df.shape, max_chunk_kb, elem_per_kb)
5460

5561
# write data matrix
5662
hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix(),
57-
dtype=numpy.float32)
63+
dtype=matrix_dtype)
5864

5965
# write col metadata
6066
write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666,
@@ -108,6 +114,44 @@ def write_version(hdf5_out):
108114
"""
109115
hdf5_out.attrs[version_attr] = numpy.string_(version_number)
110116

117+
def calculate_elem_per_kb(max_chunk_kb, matrix_dtype):
    """
    Calculates the number of matrix elements that fit in one kilobyte of storage.

    NOTE(fix): elements-per-KB depends only on the element width, not on the
    chunk budget. The previous formula ``(max_chunk_kb * 8)/32`` was only
    correct for the default ``max_chunk_kb=1024`` (since 1 KB = 8192 bits),
    and returned a float under Python 3, which would propagate a non-integer
    into the HDF5 chunk shape.

    Input:
        - max_chunk_kb (int, default=1024): retained for backward compatibility
            with existing callers; no longer used in the calculation.
        - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
            Currently needs to be numpy.float32 or numpy.float64.

    Returns:
        elem_per_kb (int), the number of elements per kb for matrix dtype specified.

    Raises:
        Exception: if matrix_dtype is neither numpy.float32 nor numpy.float64.
    """
    if matrix_dtype == numpy.float32:
        # 1 KB = 1024 bytes = 8192 bits; a float32 occupies 32 bits -> 256 elements/KB
        return (1024 * 8) // 32
    elif matrix_dtype == numpy.float64:
        # a float64 occupies 64 bits -> 128 elements/KB
        return (1024 * 8) // 64
    else:
        msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported".format(matrix_dtype)
        logger.error(msg)
        raise Exception("write_gctx.calculate_elem_per_kb " + msg)
137+
138+
139+
def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb):
    """
    Sets chunk size to use for writing the data matrix to HDF5.

    Note. Calculation used here is for compatibility with cmapM and cmapR.
    Assumes df_shape has at least one row (a zero-row matrix would divide by
    zero below and cannot be chunked by HDF5 anyway).

    Input:
        - df_shape (tuple): shape of input data_df (rows, cols).
        - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
        - elem_per_kb (int): Number of elements per kb

    Returns:
        chunk size (tuple) to use for chunking the data matrix
    """
    # Cap rows per chunk at 1000 (or the full row count when smaller).
    row_chunk_size = min(df_shape[0], 1000)
    # Total element budget per chunk, spread across the row dimension.
    col_budget = (max_chunk_kb * elem_per_kb) // row_chunk_size
    # Clamp to at least 1 column — a 0-sized chunk dimension is invalid in
    # HDF5 and could occur when the element budget is below row_chunk_size —
    # and never exceed the actual column count.
    col_chunk_size = min(max(1, col_budget), df_shape[1])
    return (row_chunk_size, col_chunk_size)
111155

112156
def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666, gzip_compression):
113157
"""

0 commit comments

Comments
 (0)