1818version_number = "GCTX1.0"
1919
2020
21- def write_gctx (gctoo_object , out_file_name , convert_back_to_neg_666 = True , gzip_compression_level = 6 ):
21+ def write_gctx (gctoo_object , out_file_name , convert_back_to_neg_666 = True , gzip_compression_level = 6 ,
22+ max_chunk_kb = 1024 ):
2223 """
2324 Essentially the same as write() method; enables user to call write_gctx() from
2425 cmapPy instead of write_gctx.write()
@@ -28,7 +29,8 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_c
2829 write (gctoo_object , out_file_name , convert_back_to_neg_666 )
2930
3031
31- def write (gctoo_object , out_file_name , convert_back_to_neg_666 = True , gzip_compression_level = 6 ):
32+ def write (gctoo_object , out_file_name , convert_back_to_neg_666 = True , gzip_compression_level = 6 ,
33+ max_chunk_kb = 1024 , matrix_dtype = numpy .float32 ):
3234 """
3335 Writes a GCToo instance to specified file.
3436
@@ -37,6 +39,8 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compre
3739 - out_file_name (str): file name to write gctoo_object to.
3840 - convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666"
3941 - gzip_compression_level (int, default=6): Compression level to use for metadata.
42+ - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
43+ - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
4044 """
4145 # make sure out file has a .gctx suffix
4246 gctx_out_name = add_gctx_to_out_name (out_file_name )
@@ -50,11 +54,13 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compre
5054 # write src
5155 write_src (hdf5_out , gctoo_object , gctx_out_name )
5256
53- # TODO: set chunk size
57+ # set chunk size for data matrix
58+ elem_per_kb = calculate_elem_per_kb (max_chunk_kb , matrix_dtype )
59+ chunk_size = set_data_matrix_chunk_size (gctoo_object .data_df .shape , max_chunk_kb , elem_per_kb )
5460
5561 # write data matrix
5662 hdf5_out .create_dataset (data_matrix_node , data = gctoo_object .data_df .transpose ().as_matrix (),
57- dtype = numpy . float32 )
63+ dtype = matrix_dtype )
5864
5965 # write col metadata
6066 write_metadata (hdf5_out , "col" , gctoo_object .col_metadata_df , convert_back_to_neg_666 ,
@@ -108,6 +114,44 @@ def write_version(hdf5_out):
108114 """
109115 hdf5_out .attrs [version_attr ] = numpy .string_ (version_number )
110116
def calculate_elem_per_kb(max_chunk_kb, matrix_dtype):
    """
    Calculates the number of elem per kb depending on the max chunk size set.

    The result is (max_chunk_kb * 8) floor-divided by the bit width of
    matrix_dtype, so it scales with max_chunk_kb. Floor division is used so
    that the result is an integer on both Python 2 and Python 3 (true
    division would return a float on Python 3).

    Input:
        - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
        - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
            Currently needs to be np.float32 or np.float64 (TODO: figure out a better way to get bits from a numpy dtype).

    Returns:
        elem_per_kb (int), the number of elements per kb for matrix dtype specified.

    Raises:
        Exception: if matrix_dtype is neither numpy.float32 nor numpy.float64.
    """
    if matrix_dtype == numpy.float32:
        bits_per_elem = 32
    elif matrix_dtype == numpy.float64:
        bits_per_elem = 64
    else:
        msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported".format(matrix_dtype)
        logger.error(msg)
        raise Exception("write_gctx.calculate_elem_per_kb " + msg)

    # max_chunk_kb * 8 is the numerator used by the original formula; only the
    # division operator changed, so integer inputs give an integer result.
    return (max_chunk_kb * 8) // bits_per_elem
137+
138+
def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb):
    """
    Sets chunk size to use for writing data matrix.
    Note. Calculation used here is for compatibility with cmapM and cmapR.

    Input:
        - df_shape (tuple): shape of input data_df.
        - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
        - elem_per_kb (int): Number of elements per kb

    Returns:
        chunk size (tuple of two ints) to use for chunking the data matrix

    Raises:
        ZeroDivisionError: if df_shape[0] is 0, because the row chunk size is
            used as a divisor.
    """
    # Row chunk is capped at 1000 rows.
    row_chunk_size = int(min(df_shape[0], 1000))

    # Floor division of a float (e.g. an elem_per_kb of 256.0) still yields a
    # float, so coerce to int to keep every chunk dimension a whole number.
    max_cols_per_chunk = int((max_chunk_kb * elem_per_kb) // row_chunk_size)
    col_chunk_size = int(min(max_cols_per_chunk, df_shape[1]))

    return (row_chunk_size, col_chunk_size)
111155
112156def write_metadata (hdf5_out , dim , metadata_df , convert_back_to_neg_666 , gzip_compression ):
113157 """
0 commit comments