This repository was archived by the owner on Nov 7, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_TracerFastQ.m
More file actions
247 lines (236 loc) · 11.8 KB
/
parse_TracerFastQ.m
File metadata and controls
247 lines (236 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
function parse_TracerFastQ(filename, libname, varargin)
% Usage: parse_TracerFastQ(filename, libname, varargin)
%
% This function parses demultiplexed, sorted FASTQ files (generated by
% inDrops.py) for TracerSeq barcoding experiments. The function filters
% for abundant cell barcodes and UMIs, and then determines a consensus
% sequence for each unique TracerSeq barcode detected in each cell.
% Consensus sequences are then writted to a csv file & processed further
% by the function "parse_TracerClones.m".
%
% Matlab must be run in the same working directory as the input csv files
% The Matlab PATH must include the following functions:
% get_barcode_edits.m
% strdist.m
% subtitle.m
%
%
% REQUIRED INPUTS:
% filename Full path to an inDrops.py sorted FASTQ file
% libname String identifier for this library
%
% OPTIONAL INPUT NAME/VALUE PAIRS:
% 'thresh_cell'
% Minimum number of reads required to keep a cell barcode
%
% 'thresh_UMI'
% Minimum number of reads required within a cell to keep
% a particular UMI
%
% OUPUTS:
% CellBC_WeightedHist.png
% Weighted histogram of reads per cell barcode and cutoff;
% Use to inspect 'thresh_cell'.
%
% UMI_WeightedHist.png
% Weighted histogram of reads per UMI and cutoff;
% Use to inspect 'thresh_UMI'.
%
% tracerSeqs.csv
% Table of identified barcodes, each TracerSeq mRNA is a row
% column1: inDrops cell barcode
% column2: TracerSeq barcode sequence
%% PARAMETER SETTINGS
% Set defaults
def.thresh_cell = 100;
def.thresh_UMI = 100;
% Create parser object
parserObj = inputParser;
parserObj.FunctionName = 'parse_TracerFastQ';
parserObj.StructExpand = false;
parserObj.addOptional('thresh_cell',def.thresh_cell);
parserObj.addOptional('thresh_UMI',def.thresh_UMI);
% Parse input options
parserObj.parse(varargin{:});
settings = parserObj.Results;
%% Set up paths
if ~exist('plots', 'dir'); mkdir 'plots'; end
%% Load FASTQ file
FQT = struct2table(fastqread(filename));
FQT.Properties.VariableNames = {'Header', 'TracerSeq', 'Quality'};
disp('loaded FASTQ')
%% Extract cell barcode, UMI, and Tracer sequences
% cell arrays are MUCH faster than tables
% extract cell barcode by itself
T.CB = (regexp(FQT.Header,'^(.*?)(?=:)','match'));
T.CB = [T.CB{:}]';
% extract the UMI by itself
T.UMI = regexp(FQT.Header,'(?<=:)(.*)(?=:)', 'match');
T.UMI = [T.UMI{:}]';
% extract the TracerSeqs
T.TracerSeq = FQT.TracerSeq;
% dump the original table
clear FQT
disp('converted to cell arrays')
%% Identify cell barcodes with 'abundant' numbers of reads
[cb_seq, ~ ,cellbcode_ic] = unique(T.CB);
cellbcodes = 1:length(cb_seq);
cellbcodes_nReads = histc(cellbcode_ic,cellbcodes);
abundant_cellbcodes = cellbcodes_nReads > settings.thresh_cell;
clear T.CB;
disp('filtered abundant cell barcodes')
%% Plot weighted cell barcode abundance histogram
% plot weighted histogram of raw reads per cell to verify thresh_cell
figure('position',[300 300 400 400])
[counts centers] = hist(log10(cellbcodes_nReads),50);
y = counts.*(10.^centers)/sum(counts.*(10.^centers));
plot(centers,y*100,'linewidth',2)
nAbundant_cells = sum(abundant_cellbcodes);
nReads_from_abund_cells = sum(cellbcodes_nReads(cellbcodes_nReads>settings.thresh_cell));
fracReads_from_abund_cells = nReads_from_abund_cells/sum(cellbcodes_nReads);
mean_abundant = mean(cellbcodes_nReads(cellbcodes_nReads>settings.thresh_cell));
cv_abundant = std(cellbcodes_nReads(cellbcodes_nReads>settings.thresh_cell))/mean_abundant;
line([log10(settings.thresh_cell) log10(settings.thresh_cell)],[0 max(y*100)],'color','r','linewidth',1)
set(gca,'ylim',[0 max(y*100)])
set(gca,'fontsize',12)
xlabel('Log10(# Reads per Cell Barcode)')
ylabel('Fraction of Total Reads (%)')
st = subtitle({['Total # reads examined = ', num2str(sum(cellbcodes_nReads))],...
['# unique cell barcodes = ' num2str(sum(cellbcodes_nReads>0))],...
['# cell barcodes with > ' num2str(settings.thresh_cell) ' reads = ' num2str(nAbundant_cells)]...
['% reads from cells = ' num2str(100*fracReads_from_abund_cells,2) '%'],...
['# reads from cells = ' num2str(nReads_from_abund_cells)],...
['Avg. reads/cell = ' num2str(round(mean_abundant))],...
['CV reads/cell = ' num2str(round(100*cv_abundant)) '%' ],...
['Max reads/cell = ' num2str(max(cellbcodes_nReads))]},'TopLeft',[0.02 0.01]);
set(st,'fontsize',12)
title([libname ': Cell Barcode Abundance'],'Interpreter', 'none')
saveas(gcf, ['plots/' libname '_CellBC_WeightedHist.png'])
%% Generate a list of 'valid' UMIs for each cell
unique_UMIs_nReads_all_cells = [];
% index and loop through each cell one at a time
find_abundant_cellbcodes = find(abundant_cellbcodes);
for j = 1:length(find_abundant_cellbcodes)
% get the original FQT row indices for this cell
this_cells_FQT_inds = cellbcode_ic == find_abundant_cellbcodes(j);
% get the list of all UMIs for this cell
this_cells_UMIs = T.UMI(this_cells_FQT_inds);
% determine read counts for each UMI detected in this cell
[unique_UMIs_this_cell, ~, unique_UMIs_ic_this_cell] = unique(this_cells_UMIs);
unique_UMIs_ind_this_cell = 1:length(unique_UMIs_this_cell);
unique_UMIs_nReads_this_cell = histc(unique_UMIs_ic_this_cell,unique_UMIs_ind_this_cell);
% keep track of the read per UMI counts across all cells
unique_UMIs_nReads_all_cells = [unique_UMIs_nReads_all_cells; unique_UMIs_nReads_this_cell];
% if this cell has any UMIs above the read count threshold,
% then identify them and perform barcode validation
if sum(unique_UMIs_nReads_this_cell > settings.thresh_UMI) > 0
low_counts = unique_UMIs_nReads_this_cell < settings.thresh_UMI;
unique_UMIs_this_cell(low_counts) = [];
unique_UMIs_nReads_this_cell(low_counts) = [];
% ignore UMIs that are likely to be 'edits' of other UMIs
edit_abund_thresh = 2;
edit_dist_thresh = 1;
valid = get_barcode_edits(unique_UMIs_this_cell, unique_UMIs_nReads_this_cell, edit_abund_thresh, edit_dist_thresh, 'levenshtein');
unique_UMIs_this_cell(~valid) = [];
unique_UMIs_nReads_this_cell(~valid) = [];
valid_UMIs{j} = unique_UMIs_this_cell;
nValid_UMIs(j) = length(unique_UMIs_this_cell);
else % otherwise skip this cell
%disp(['skipped cell ' num2str(j)])
nValid_UMIs(j) = 0;
valid_UMIs{j} = [];
end
end
disp('identified valid UMIs')
%% Plot oversequencing histogram: # reads per original transcript
% plot weighted histogram of raw reads per cell to verify thresh_UMI
figure('position',[300 300 400 400])
[counts, centers] = hist(log10(unique_UMIs_nReads_all_cells),50);
y = counts.*(10.^centers)/sum(counts.*(10.^centers));
plot(centers,y*100,'linewidth',2)
nAbundant_transcripts = sum(unique_UMIs_nReads_all_cells > settings.thresh_UMI);
nReads_from_abund_transcripts = sum(unique_UMIs_nReads_all_cells(unique_UMIs_nReads_all_cells>settings.thresh_UMI));
fracReads_from_abund_transcripts = nReads_from_abund_transcripts/sum(unique_UMIs_nReads_all_cells);
mean_abundant = mean(unique_UMIs_nReads_all_cells(unique_UMIs_nReads_all_cells>settings.thresh_UMI));
cv_abundant = std(unique_UMIs_nReads_all_cells(unique_UMIs_nReads_all_cells>settings.thresh_UMI))/mean_abundant;
line([log10(settings.thresh_UMI) log10(settings.thresh_UMI)],[0 max(y*100)],'color','r','linewidth',1)
set(gca,'ylim',[0 max(y*100)])
set(gca,'fontsize',12)
xlabel('# Reads per Transcript (Log10)')
ylabel('Fraction of Total Reads (%)')
st = subtitle({['Total # reads examined = ', num2str(sum(unique_UMIs_nReads_all_cells))],...
['# unique transcripts = ' num2str(sum(unique_UMIs_nReads_all_cells>0))],...
['# transcripts with > ' num2str(settings.thresh_UMI) ' reads = ' num2str(nAbundant_transcripts)]...
['% reads from abund. transcripts = ' num2str(100*fracReads_from_abund_transcripts,2) '%'],...
['# reads from abund. transcripts = ' num2str(nReads_from_abund_transcripts)],...
['Avg. reads/transcript = ' num2str(round(mean_abundant))],...
['CV reads/transcript = ' num2str(round(100*cv_abundant)) '%' ],...
['Max reads/transcript = ' num2str(max(unique_UMIs_nReads_all_cells))]},'TopLeft',[0.02 0.01]);
set(st,'fontsize',12)
title([libname ': Transcript oversequencing'],'Interpreter', 'none')
saveas(gcf, ['plots/' libname '_UMI_WeightedHist.png'])
%% Generate consensus sequences for each unique TRACER mRNA
% generate container for final consensus sequences
consensus_seqs = cell(1,2);
% also report the original cell barcodes
cell_names = cb_seq(abundant_cellbcodes)';
% loop through each cell
for j = 1:length(find_abundant_cellbcodes)
% only proceed if this cell had at least one valid mRNA
if nValid_UMIs(j) > 0
% get the original FQT row indices for this cell
this_cells_FQT_inds = cellbcode_ic == find_abundant_cellbcodes(j);
% now loop through each validated UMI for this cell
for k = 1:length(valid_UMIs{j})
% display progress
%clc; disp('Generating TRACER consensus sequences...'); disp(['Cell: ', num2str(j)]); disp(['mRNA: ', num2str(k)]);
% get the original FQT rows corresponding to this UMI (logical)
this_UMI_FQT_logical = strcmp(valid_UMIs{j}(k),T.UMI);
% take the intersection of row indices for this cell and this UMI
this_mRNA_FQT_inds = this_cells_FQT_inds & this_UMI_FQT_logical;
% get the original Tracer sequences for this mRNA
this_mRNA_Tracer_seqs = T.TracerSeq(this_mRNA_FQT_inds);
% disregard reads that don't contain the flanking sequences;
% these reads aren't going to contribute productively to a
% multi-sequence alignment
valid_flanks = contains(this_mRNA_Tracer_seqs,'TCTACAAATAAG') & contains(this_mRNA_Tracer_seqs,'TCATACGTATCC');
this_mRNA_Tracer_seqs = this_mRNA_Tracer_seqs(valid_flanks);
% sort reads by how close they are to the median read length
% use up to 50 reads for constructing a multialignment & consensus
individual_read_lengths = cellfun(@length, this_mRNA_Tracer_seqs);
distance_to_median = abs(individual_read_lengths-median(individual_read_lengths));
[~,close_to_median] = sort(distance_to_median, 'ascend');
if length(this_mRNA_Tracer_seqs) > 50
this_mRNA_Tracer_seqs = this_mRNA_Tracer_seqs(close_to_median(1:50));
end
% get the consensus sequence of the TracerSeq barcode
if length(this_mRNA_Tracer_seqs) >= 3 % can't align fewer than 3 sequences
% expose all gaps as '-', then use regexprep to exclude them from the final consensus
untrimmed_consensus = regexprep(seqconsensus(multialign(this_mRNA_Tracer_seqs, 'ScoringMatrix', 'NUC44'), 'Alphabet', 'NT', 'ScoringMatrix', 'NUC44', 'Gaps', 'all'),'-','');
% trim flanking sequences
trimmed_consensus = regexp(untrimmed_consensus,'(?<=TCTACAAATAAG)(.*)(?=TCATACGTATCC)', 'match');
else
untrimmed_consensus = []; trimmed_consensus = [];
end
% now for this k mRNA in this j cell, we have a consensus sequence for
% the TRACER barcode
if ~isempty(trimmed_consensus)
results_this_cell{k,1} = cell_names{j};
results_this_cell{k,2} = trimmed_consensus{:};
else
results_this_cell{k,1} = cell_names{j};
results_this_cell{k,2} = '';
end
end
% append this cell to the final results table
consensus_seqs = [consensus_seqs; results_this_cell];
clear results_this_cell
end
end
disp('generated Tracer consensus seqs')
%% Save results to file
% remove short/empty TracerSeq barcodes
consensus_seqs((cellfun(@length,consensus_seqs(:,2))<15),:) = [];
% write csv
writetable(cell2table(consensus_seqs), [libname '_tracerSeqs.csv'],'WriteVariableNames',0);
disp('wrote files')