 version_number = "GCTX1.0"
 
 
-def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True):
+def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
+               max_chunk_kb=1024):
     """
     Essentially the same as write() method; enables user to call write_gctx() from
     cmapPy instead of write_gctx.write()
@@ -28,13 +29,18 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True):
-    write(gctoo_object, out_file_name, convert_back_to_neg_666)
+    write(gctoo_object, out_file_name, convert_back_to_neg_666, gzip_compression_level, max_chunk_kb)
 
 
-def write(gctoo_object, out_file_name, convert_back_to_neg_666=True):
+def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
+          max_chunk_kb=1024, matrix_dtype=numpy.float32):
     """
     Writes a GCToo instance to the specified file.
 
     Input:
         - gctoo_object (GCToo): A GCToo instance.
         - out_file_name (str): file name to write gctoo_object to.
+        - convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666".
+        - gzip_compression_level (int, default=6): compression level to use for metadata.
+        - max_chunk_kb (int, default=1024): the maximum number of KB a given chunk will occupy.
+        - matrix_dtype (numpy dtype, default=numpy.float32): storage data type for the data matrix.
     """
     # make sure out file has a .gctx suffix
     gctx_out_name = add_gctx_to_out_name(out_file_name)
@@ -48,14 +54,21 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True):
     # write src
     write_src(hdf5_out, gctoo_object, gctx_out_name)
 
+    # set chunk size for data matrix
+    elem_per_kb = calculate_elem_per_kb(max_chunk_kb, matrix_dtype)
+    chunk_size = set_data_matrix_chunk_size(gctoo_object.data_df.shape, max_chunk_kb, elem_per_kb)
+
     # write data matrix
-    hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix())
+    hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix(),
+                            dtype=matrix_dtype)
 
     # write col metadata
-    write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666)
+    write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666,
+                   gzip_compression=gzip_compression_level)
 
     # write row metadata
-    write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666)
+    write_metadata(hdf5_out, "row", gctoo_object.row_metadata_df, convert_back_to_neg_666,
+                   gzip_compression=gzip_compression_level)
 
     # close gctx file
     hdf5_out.close()
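
For context, a minimal usage sketch of the new keywords (illustrative only: the import paths follow cmapPy's pandasGEXpress layout, and constructing a GCToo from a bare data_df is an assumption, not something this change adds):

    import numpy
    import pandas as pd
    from cmapPy.pandasGEXpress import GCToo, write_gctx

    # toy 978-row x 100-column expression matrix
    data_df = pd.DataFrame(numpy.random.rand(978, 100),
                           index=["rid_{}".format(i) for i in range(978)],
                           columns=["cid_{}".format(j) for j in range(100)])
    gctoo = GCToo.GCToo(data_df=data_df)

    write_gctx.write(gctoo, "example.gctx",
                     gzip_compression_level=9,    # heavier gzip on metadata datasets
                     max_chunk_kb=512,            # cap per-chunk size at 512 KB
                     matrix_dtype=numpy.float64)  # store the matrix as float64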
@@ -101,8 +114,46 @@ def write_version(hdf5_out):
101114 """
102115 hdf5_out .attrs [version_attr ] = numpy .string_ (version_number )
103116
117+ def calculate_elem_per_kb (max_chunk_kb , matrix_dtype ):
118+ """
119+ Calculates the number of elem per kb depending on the max chunk size set.
120+
121+ Input:
122+ - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
123+ - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
124+ Currently needs to be np.float32 or np.float64 (TODO: figure out a better way to get bits from a numpy dtype).
125+
126+ Returns:
127+ elem_per_kb (int), the number of elements per kb for matrix dtype specified.
128+ """
129+ if matrix_dtype == numpy .float32 :
130+ return (max_chunk_kb * 8 )/ 32
131+ elif matrix_dtype == numpy .float64 :
132+ return (max_chunk_kb * 8 )/ 64
133+ else :
134+ msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported" .format (matrix_dtype )
135+ logger .error (msg )
136+ raise Exception ("write_gctx.calculate_elem_per_kb " + msg )
104137
-def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
+
+def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb):
+    """
+    Sets the chunk size to use for writing the data matrix.
+    Note: the calculation used here is for compatibility with cmapM and cmapR.
+
+    Input:
+        - df_shape (tuple): shape of the input data_df.
+        - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
+        - elem_per_kb (int): Number of elements per KB
+
+    Returns:
+        chunk size (tuple) to use for chunking the data matrix
+    """
+    row_chunk_size = min(df_shape[0], 1000)
+    col_chunk_size = min(((max_chunk_kb * elem_per_kb) // row_chunk_size), df_shape[1])
+    return (row_chunk_size, col_chunk_size)
+
+def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666, gzip_compression):
     """
     Writes either column or row metadata to proper node of gctx out (hdf5) file.
 
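
To make the chunk arithmetic concrete, a worked example under the defaults (float32 matrix, max_chunk_kb=1024; the 978-row shape is just the familiar L1000 landmark count):

    elem_per_kb = calculate_elem_per_kb(1024, numpy.float32)
    # (1024 * 8) // 32 = 256 elements per KB for float32

    chunk_size = set_data_matrix_chunk_size((978, 10000), 1024, elem_per_kb)
    # row_chunk_size = min(978, 1000)                  = 978
    # col_chunk_size = min((1024 * 256) // 978, 10000) = 268
    # -> (978, 268): 978 * 268 = 262,104 float32 values, just under the 1 MB cap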
@@ -123,7 +174,8 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
         logger.error("'dim' argument must be either 'row' or 'col'!")
 
     # write id field to expected node
-    hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index])
+    hdf5_out.create_dataset(metadata_node_name + "/id", data=[str(x) for x in metadata_df.index],
+                            compression=gzip_compression)
 
     metadata_fields = list(metadata_df.columns.copy())
 
@@ -135,4 +187,5 @@ def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666):
     # write metadata columns to their own arrays
     for field in [entry for entry in metadata_fields if entry != "ind"]:
         hdf5_out.create_dataset(metadata_node_name + "/" + field,
-                                data=numpy.array(list(metadata_df.loc[:, field])))
+                                data=numpy.array(list(metadata_df.loc[:, field])),
+                                compression=gzip_compression)
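
A note on the compression= keyword added to the metadata datasets: h5py treats a bare integer as a gzip compression level, so compression=gzip_compression is shorthand for compression="gzip", compression_opts=<level>. A written file can be spot-checked like this (the 0/META/COL/id path follows the GCTX node layout):

    import h5py

    with h5py.File("example.gctx", "r") as f:
        ids = f["0/META/COL/id"]
        print(ids.compression, ids.compression_opts)  # expect: gzip 9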