HMMProfiler/msa.R at main · alexaburchak/HMMProfiler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Description: Uses msa to perform multiple sequence alignment on queries obtained from
# a csv file. Writes aligned sequences to fasta files to generate hmms

# Rscript msa.R <input.csv>

# Load necessary packages
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install(version = "3.19")

BiocManager::install("Biostrings")
library(Biostrings)

BiocManager::install("msa")
library(msa)

# Get command line arguments
args <- commandArgs(trailingOnly = TRUE)

# Check if the user provided the correct number of arguments (1 or 2)
if (length(args) < 1 || length(args) > 2) {
  stop("Please provide one mandatory argument: <path/to/input.csv>")
}

# Assign arguments to variables
csv_path <- args[1]

# Prompt user to provide chain and sequence type
cat("Please provide the chain type (Enter 'heavy', 'light', 'kappa' or 'lambda'):")
chain <- tolower(trimws(readLines("stdin", n = 1)))

cat("Do you want to translate sequences before alignment? (Enter 'Y' or 'N'): ")
type <- tolower(trimws(readLines("stdin", n = 1)))

# Validate the chain input
if (!(chain %in% c("heavy", "light", "kappa", "lambda"))) {
  stop("Invalid chain type. Please enter 'heavy', 'light', 'kappa' or 'lambda'.")
}

# Validate the type input
if (!(type %in% c("dna", "aa"))) {
  stop("Invalid response. Please enter 'Y' or 'N'.")
}

# Assign columns based on chain
if (chain == "heavy") {
  v_call_col <- "v_call_heavy"
  d_call_col <- "d_call_heavy"
  j_call_col <- "j_call_heavy"
} else {
  v_call_col <- "v_call_light"
  d_call_col <- "d_call_light"
  j_call_col <- "j_call_light"
}

# Print column names
cat("V Call Column:", v_call_col, "\n")
cat("D Call Column:", d_call_col, "\n")
cat("J Call Column:", j_call_col, "\n")


# Function to process sequences from CSV files
align_sequences <- function(csv_path, chain, type,
                              v_call_col, d_call_col, j_call_col) {

  # Read the CSV file
  sequence_csv <- read.csv(csv_path, stringsAsFactors = FALSE)

  if (chain == "lambda") {
    # Filter lambda sequences for IGL
    filtered_csv <- subset(sequence_csv,
                           grepl("IGL", sequence_csv[[v_call_col]]) |
                             grepl("IGL", sequence_csv[[d_call_col]]) |
                             grepl("IGL", sequence_csv[[j_call_col]]))

    # Convert lambda sequences to DNAStringSet object
    DNA_sequence <- DNAStringSet(filtered_csv$sequence_light)
    names(DNA_sequence) <- filtered_csv$sequence_id_light

  } else if (chain == "kappa") {
    # Filter kappa sequences for IGK
    filtered_csv <- subset(sequence_csv,
                           grepl("IGK", sequence_csv[[v_call_col]]) |
                             grepl("IGK", sequence_csv[[d_call_col]]) |
                             grepl("IGK", sequence_csv[[j_call_col]]))

    # Convert kappa sequences to DNAStringSet object
    DNA_sequence <- DNAStringSet(filtered_csv$sequence_light)
    names(DNA_sequence) <- filtered_csv$sequence_id_light

  } else if (chain == "heavy") {
    # Filter heavy sequences for IGHV3
    filtered_csv <- subset(sequence_csv,
                           grepl("IGHV3", sequence_csv[[v_call_col]]) |
                             grepl("IGHV3", sequence_csv[[d_call_col]]) |
                             grepl("IGHV3", sequence_csv[[j_call_col]]))

    # Convert heavy sequences to DNAStringSet object
    DNA_sequence <- DNAStringSet(filtered_csv$sequence_heavy)
    names(DNA_sequence) <- filtered_csv$sequence_id_heavy

  } else {
    # Convert all light sequences to DNAStringSet object
    DNA_sequence <- DNAStringSet(sequence_csv$sequence_light)
    names(DNA_sequence) <- sequence_csv$sequence_id_light
  }

  if (type == "y") {
    # Translate DNA seq to AA
    translated_seq <- Biostrings::translate(DNA_sequence, if.fuzzy.codon = "X")

    # Perform MSA on translated amino acid sequences
    AA_alignment <- msa(translated_seq)

    # Convert alignment to AAStringSet
    AA_aligned_set <- as(AA_alignment, "AAStringSet")

    # Save aligned sequences to FASTA file
    writeXStringSet(AA_aligned_set, filepath = file.path("~/Desktop/", paste0("aligned_", chain, "_AA.fasta")))

  } else if (type == "n") {
    # Perform MSA on DNA sequences
    DNA_alignment <- msa(DNA_sequence)

    # Convert alignment to DNAStringSet
    DNA_aligned_set <- as(DNA_alignment, "DNAStringSet")

    # Save aligned sequences to FASTA file
    writeXStringSet(DNA_aligned_set, filepath = file.path("~/Desktop/", paste0("aligned_", chain, "_DNA.fasta")))
  }
  print("Output saved to desktop!")
}