-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmsa.R
More file actions
132 lines (102 loc) · 4.52 KB
/
msa.R
File metadata and controls
132 lines (102 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Description: Uses msa to perform multiple sequence alignment on queries obtained from
# a csv file. Writes aligned sequences to fasta files to generate hmms
# Rscript msa.R <input.csv>
# Load necessary packages
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(version = "3.19")
BiocManager::install("Biostrings")
library(Biostrings)
BiocManager::install("msa")
library(msa)
# Get command line arguments
args <- commandArgs(trailingOnly = TRUE)
# Check if the user provided the correct number of arguments (1 or 2)
if (length(args) < 1 || length(args) > 2) {
stop("Please provide one mandatory argument: <path/to/input.csv>")
}
# Assign arguments to variables
csv_path <- args[1]
# Prompt user to provide chain and sequence type
cat("Please provide the chain type (Enter 'heavy', 'light', 'kappa' or 'lambda'):")
chain <- tolower(trimws(readLines("stdin", n = 1)))
cat("Do you want to translate sequences before alignment? (Enter 'Y' or 'N'): ")
type <- tolower(trimws(readLines("stdin", n = 1)))
# Validate the chain input
if (!(chain %in% c("heavy", "light", "kappa", "lambda"))) {
stop("Invalid chain type. Please enter 'heavy', 'light', 'kappa' or 'lambda'.")
}
# Validate the type input
if (!(type %in% c("dna", "aa"))) {
stop("Invalid response. Please enter 'Y' or 'N'.")
}
# Assign columns based on chain
if (chain == "heavy") {
v_call_col <- "v_call_heavy"
d_call_col <- "d_call_heavy"
j_call_col <- "j_call_heavy"
} else {
v_call_col <- "v_call_light"
d_call_col <- "d_call_light"
j_call_col <- "j_call_light"
}
# Print column names
cat("V Call Column:", v_call_col, "\n")
cat("D Call Column:", d_call_col, "\n")
cat("J Call Column:", j_call_col, "\n")
# Function to process sequences from CSV files
align_sequences <- function(csv_path, chain, type,
v_call_col, d_call_col, j_call_col) {
# Read the CSV file
sequence_csv <- read.csv(csv_path, stringsAsFactors = FALSE)
if (chain == "lambda") {
# Filter lambda sequences for IGL
filtered_csv <- subset(sequence_csv,
grepl("IGL", sequence_csv[[v_call_col]]) |
grepl("IGL", sequence_csv[[d_call_col]]) |
grepl("IGL", sequence_csv[[j_call_col]]))
# Convert lambda sequences to DNAStringSet object
DNA_sequence <- DNAStringSet(filtered_csv$sequence_light)
names(DNA_sequence) <- filtered_csv$sequence_id_light
} else if (chain == "kappa") {
# Filter kappa sequences for IGK
filtered_csv <- subset(sequence_csv,
grepl("IGK", sequence_csv[[v_call_col]]) |
grepl("IGK", sequence_csv[[d_call_col]]) |
grepl("IGK", sequence_csv[[j_call_col]]))
# Convert kappa sequences to DNAStringSet object
DNA_sequence <- DNAStringSet(filtered_csv$sequence_light)
names(DNA_sequence) <- filtered_csv$sequence_id_light
} else if (chain == "heavy") {
# Filter heavy sequences for IGHV3
filtered_csv <- subset(sequence_csv,
grepl("IGHV3", sequence_csv[[v_call_col]]) |
grepl("IGHV3", sequence_csv[[d_call_col]]) |
grepl("IGHV3", sequence_csv[[j_call_col]]))
# Convert heavy sequences to DNAStringSet object
DNA_sequence <- DNAStringSet(filtered_csv$sequence_heavy)
names(DNA_sequence) <- filtered_csv$sequence_id_heavy
} else {
# Convert all light sequences to DNAStringSet object
DNA_sequence <- DNAStringSet(sequence_csv$sequence_light)
names(DNA_sequence) <- sequence_csv$sequence_id_light
}
if (type == "y") {
# Translate DNA seq to AA
translated_seq <- Biostrings::translate(DNA_sequence, if.fuzzy.codon = "X")
# Perform MSA on translated amino acid sequences
AA_alignment <- msa(translated_seq)
# Convert alignment to AAStringSet
AA_aligned_set <- as(AA_alignment, "AAStringSet")
# Save aligned sequences to FASTA file
writeXStringSet(AA_aligned_set, filepath = file.path("~/Desktop/", paste0("aligned_", chain, "_AA.fasta")))
} else if (type == "n") {
# Perform MSA on DNA sequences
DNA_alignment <- msa(DNA_sequence)
# Convert alignment to DNAStringSet
DNA_aligned_set <- as(DNA_alignment, "DNAStringSet")
# Save aligned sequences to FASTA file
writeXStringSet(DNA_aligned_set, filepath = file.path("~/Desktop/", paste0("aligned_", chain, "_DNA.fasta")))
}
print("Output saved to desktop!")
}