JuliaInferelator_V2/createPseudobulkMuonloader.jl at main · MiraldiLab/JuliaInferelator_V2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
using Muon
using  DataFrames, Statistics
using SparseArrays, LinearAlgebra, SparseMatricesCSR
using ArgParse
using Dates
using CSV


"""
    createPseudobulk(adata, ident_var, save_dir)

Create a pseudobulk DataFrame from the given AnnData object.

# Arguments
- `adata::AnnData`: The AnnData object containing the data.
- `ident_var::String`: The identifier variable for grouping cells.
- `save_dir::String`: The directory to save the output pseudobulk DataFrame.

# Returns
- A DataFrame containing the pseudobulk data, where each column corresponds to a unique identifier and the rows represent features (e.g. genes, peaks).
"""

# reference: https://docs.juliahub.com/Muon/QfqCh/0.1.1/objects/

function createPseudobulk(adata, ident_var::String, save_dir::String)
    # Read the AnnData object
    #adata = h5open(dataset, "r")
    # adata = read(adata)

    sp_mat = adata.X  #sparse matrix (cells by feature)

    ident_col = adata.obs[!, ident_var]  # element of column corresponding to ident_var. Note: adata.obs -> metadata::dataframe
    unique_idents = unique(ident_col)

    features = adata.var[!, "name"]  # contains rownames
    pseudobulk_df = DataFrame(zeros(length(features), length(unique_idents)), unique_idents)

    for ident in unique_idents
        curr_indices = findall(x -> x == ident, ident_col)
        # Subset data for the current identity using logical indexing
        curr_data = sp_mat[curr_indices, :]

        # sum counts across all cells per row/feature
        curr_counts = sum(curr_data, dims=1)
        pseudobulk_df[!, ident] .= vec(curr_counts)
    end

    mat = Matrix(pseudobulk_df);
    row_sums = vec(sum(mat, dims=2));  # Sum along rows;

    pseudobulk_df = insertcols!(pseudobulk_df, 1, :RowNames => features)
    pseudobulk_df = pseudobulk_df[row_sums .> 0, :]

    # Save the pseudobulk DataFrame
    current_time = Dates.format(Dates.now(), "yyyymmdd_HHMMSS")
    output_path = joinpath(save_dir, "PseudobulkData_$(ident_var)_$(current_time).tsv")
    # open(output_path, "w") do io
    #     write(io, pseudobulk_df)
    # end
    CSV.write(output_path, string.(pseudobulk_df); delim = '\t')

    println("Pseudobulk DataFrame saved to: ", output_path)
    return pseudobulk_df
end


# function main(args)
#     # Parse the command line arguments
#     @argparse args begin
#         "--dataset", help="Path to the .h5ad file", required = true, arg_type = String
#         "--ident_var", help="Identifier variable for pseudobulk creation", required=true,
#         arg_type=String
#         "--save_dir", help="Directory to save the output"
#     end

#     adata = readh5ad(args.dataset) do file
#         read(file)
#     end

#     createPseudobulk(adata, args.ident_var, args.save_dir)
# end

# # Allows running the script directly or import it without executing main
# if abspath(PROGRAM_FILE) == abspath(@__FILE__)
#     main(ARGS)
# end


# Testing:

# function main(args)
#     # Load the data from the provided path
#     adata = readh5ad(args["dataset"], "r") do file
#         read(file)
#     end

#     # Call the createPseudobulk function
#     createPseudobulk(adata, args["ident_var"], args["save_dir"])
# end

# args = Dict(
#     "dataset" => "/data/miraldiNB/Katko/Projects/Asthma/Asthma_05_17_2024.h5ad",
#     "ident_var" => "celltype2_offspring_rep_comb",
#     "save_dir" => "/data/miraldiNB/Katko/Projects/Asthma/Analysis_New/Pseudobulk_Analysis/Asthma_05_17"
# )

# main(args)