-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmatch_protgroups.jl
More file actions
73 lines (66 loc) · 3.17 KB
/
match_protgroups.jl
File metadata and controls
73 lines (66 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
proj_info = (id = "cov2",
match_ver = "20200417")
using Pkg
Pkg.activate(@__DIR__)
using Revise
using DataFrames, CSV, Statistics, RData
const misc_scripts_path = joinpath(base_scripts_path, "misc_jl");
const analysis_path = joinpath(base_analysis_path, proj_info.id)
const data_path = joinpath(analysis_path, "data")
const networks_path = joinpath(analysis_path, "networks")
const results_path = joinpath(analysis_path, "results")
const scratch_path = joinpath(analysis_path, "scratch")
const plots_path = joinpath(analysis_path, "plots")
Revise.includet(joinpath(base_scripts_path, "misc_jl", "protgroup_crossmatch.jl"))
rdataset_names = [
"APMS" => "$(proj_info.id)_msglm_data_mq_apms_20200329_20200329",
"OeProteome" => "$(proj_info.id)_msdata_full_spectronaut_qc_20200331_20200331"
]
protgroups_dfs = [k => begin
rdata = load(joinpath(scratch_path, "$v.RData"), convert=true)
if haskey(rdata, "msdata") && haskey(rdata["msdata"], "protgroups")
rdata["msdata"]["protgroups"]
elseif haskey(rdata, "msdata_full") && haskey(rdata["msdata_full"], "protgroups")
rdata["msdata_full"]["protgroups"]
else
@warn "No protgroups dataframe found for $k"
nothing
end
end for (k, v) in rdataset_names]
proteins_df = load(joinpath(scratch_path, "$(proj_info.id)_msdata_full_spectronaut_qc_20200331_20200331.RData"), convert=true)["msdata_full"]["proteins"]
for noiso_group_df in groupby(proteins_df, :protein_ac_noiso)
pe = noiso_group_df.protein_existence
if any(ismissing, pe) && any(!ismissing, pe)
noiso_group_df.protein_existence .= coalesce.(pe, minimum(skipmissing(pe)))
elseif any(ismissing, pe) && any(noiso_group_df.is_viral)
noiso_group_df.protein_existence .= 1
end
end
# calculate AC ranks - the more canonical proptein is, the better
protein_ac_ranks = Dict(r.protein_ac => ProtgroupXMatch.rank_uniprot(r) for r in eachrow(proteins_df))
# save the matches
pg_matches_df = ProtgroupXMatch.match_protgroups(protgroups_dfs, protein_ac_ranks);
for (dsname, pg_df) in protgroups_dfs
pg_col = Symbol("protgroup_id_", dsname)
rowix_col = Symbol("rowix_", dsname)
pg_matches_df[!, pg_col] = missings(nonmissingtype(eltype(pg_df.protgroup_id)), nrow(pg_matches_df))
for (i, rowix) in enumerate(pg_matches_df[!, rowix_col])
if !ismissing(rowix)
pg_matches_df[i, pg_col] = pg_df[rowix, :protgroup_id]
end
end
select!(pg_matches_df, Not(rowix_col))
end
nonunique_matches_dfs = [dsname => begin
pg_col = Symbol("protgroup_id_", dsname)
by(filter(r -> !ismissing(r[pg_col]), pg_matches_df), pg_col) do df
return nrow(df) > 1 ? df : df[1:0, :]
end
end for (dsname, _) in protgroups_dfs]
nonunique_matches_dfs[1][2].protgroup_id_OeProteome
CSV.write(joinpath(analysis_path, "$(proj_info.id)_protgroup_matches_$(proj_info.match_ver).txt"),
pg_matches_df, delim='\t', missingstring="");
#output_filepath = joinpath(analysis_path, "$(project_id)_all_protgroups_matches_$(analysis_version).RData");
#@rput(output_filepath, output_filepath)
#@rput(combined_pgs_df, combined_pgs_df)
#R"save(combined_pgs_df, file=output_filepath)";