Skip to content

Commit 504dc7c

Browse files
committed
added documentation
1 parent ad31ccc commit 504dc7c

File tree

1 file changed

+48
-4
lines changed

1 file changed

+48
-4
lines changed

cmapPy/pandasGEXpress/write_gctx.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
version_number = "GCTX1.0"
1919

2020

21-
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6):
21+
def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
22+
max_chunk_kb=1024):
2223
"""
2324
Essentially the same as write() method; enables user to call write_gctx() from
2425
cmapPy instead of write_gctx.write()
@@ -28,7 +29,8 @@ def write_gctx(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_c
2829
write(gctoo_object, out_file_name, convert_back_to_neg_666)
2930

3031

31-
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6):
32+
def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compression_level=6,
33+
max_chunk_kb=1024, matrix_dtype=numpy.float32):
3234
"""
3335
Writes a GCToo instance to specified file.
3436
@@ -37,6 +39,8 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compre
3739
- out_file_name (str): file name to write gctoo_object to.
3840
- convert_back_to_neg_666 (bool): whether to convert np.NAN in metadata back to "-666"
3941
- gzip_compression_level (int, default=6): Compression level to use for metadata.
42+
- max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
43+
- matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
4044
"""
4145
# make sure out file has a .gctx suffix
4246
gctx_out_name = add_gctx_to_out_name(out_file_name)
@@ -50,11 +54,13 @@ def write(gctoo_object, out_file_name, convert_back_to_neg_666=True, gzip_compre
5054
# write src
5155
write_src(hdf5_out, gctoo_object, gctx_out_name)
5256

53-
# TODO: set chunk size
57+
# set chunk size for data matrix
58+
elem_per_kb = calculate_elem_per_kb(max_chunk_kb, matrix_dtype)
59+
chunk_size = set_data_matrix_chunk_size(gctoo_object.data_df.shape, max_chunk_kb, elem_per_kb)
5460

5561
# write data matrix
5662
hdf5_out.create_dataset(data_matrix_node, data=gctoo_object.data_df.transpose().as_matrix(),
57-
dtype=numpy.float32)
63+
dtype=matrix_dtype)
5864

5965
# write col metadata
6066
write_metadata(hdf5_out, "col", gctoo_object.col_metadata_df, convert_back_to_neg_666,
@@ -108,6 +114,44 @@ def write_version(hdf5_out):
108114
"""
109115
hdf5_out.attrs[version_attr] = numpy.string_(version_number)
110116

117+
def calculate_elem_per_kb(max_chunk_kb, matrix_dtype):
    """
    Calculates the number of matrix elements that fit in one kilobyte of storage.

    NOTE(fix): elements-per-KB depends only on the element width, not on the
    chunk budget. The previous formula ``(max_chunk_kb * 8)/32`` was only
    correct for the default ``max_chunk_kb=1024`` (since 1 KB = 8192 bits),
    and returned a float under Python 3, which would propagate a non-integer
    into the HDF5 chunk shape.

    Input:
        - max_chunk_kb (int, default=1024): retained for backward compatibility
            with existing callers; no longer used in the calculation.
        - matrix_dtype (numpy dtype, default=numpy.float32): Storage data type for data matrix.
            Currently needs to be numpy.float32 or numpy.float64.

    Returns:
        elem_per_kb (int), the number of elements per kb for matrix dtype specified.

    Raises:
        Exception: if matrix_dtype is neither numpy.float32 nor numpy.float64.
    """
    if matrix_dtype == numpy.float32:
        # 1 KB = 1024 bytes = 8192 bits; a float32 occupies 32 bits -> 256 elements/KB
        return (1024 * 8) // 32
    elif matrix_dtype == numpy.float64:
        # a float64 occupies 64 bits -> 128 elements/KB
        return (1024 * 8) // 64
    else:
        msg = "Invalid matrix_dtype: {}; only numpy.float32 and numpy.float64 are currently supported".format(matrix_dtype)
        logger.error(msg)
        raise Exception("write_gctx.calculate_elem_per_kb " + msg)
137+
138+
139+
def set_data_matrix_chunk_size(df_shape, max_chunk_kb, elem_per_kb):
    """
    Sets chunk size to use for writing the data matrix to HDF5.

    Note. Calculation used here is for compatibility with cmapM and cmapR.
    Assumes df_shape has at least one row (a zero-row matrix would divide by
    zero below and cannot be chunked by HDF5 anyway).

    Input:
        - df_shape (tuple): shape of input data_df (rows, cols).
        - max_chunk_kb (int, default=1024): The maximum number of KB a given chunk will occupy
        - elem_per_kb (int): Number of elements per kb

    Returns:
        chunk size (tuple) to use for chunking the data matrix
    """
    # Cap rows per chunk at 1000 (or the full row count when smaller).
    row_chunk_size = min(df_shape[0], 1000)
    # Total element budget per chunk, spread across the row dimension.
    col_budget = (max_chunk_kb * elem_per_kb) // row_chunk_size
    # Clamp to at least 1 column — a 0-sized chunk dimension is invalid in
    # HDF5 and could occur when the element budget is below row_chunk_size —
    # and never exceed the actual column count.
    col_chunk_size = min(max(1, col_budget), df_shape[1])
    return (row_chunk_size, col_chunk_size)
111155

112156
def write_metadata(hdf5_out, dim, metadata_df, convert_back_to_neg_666, gzip_compression):
113157
"""

0 commit comments

Comments
 (0)