TracerSeq/parse_TracerFastQ.m at master · wagnerde/TracerSeq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
function parse_TracerFastQ(filename, libname, varargin)
% Usage: parse_TracerFastQ(filename, libname, varargin)
%
% This function parses demultiplexed, sorted FASTQ files (generated by
% inDrops.py) for TracerSeq barcoding experiments. The function filters
% for abundant cell barcodes and UMIs, and then determines a consensus
% sequence for each unique TracerSeq barcode detected in each cell.
% Consensus sequences are then writted to a csv file & processed further
% by the function "parse_TracerClones.m".
%
% Matlab must be run in the same working directory as the input csv files
% The Matlab PATH must include the following functions:
%   get_barcode_edits.m
%   strdist.m
%   subtitle.m
%
%
% REQUIRED INPUTS:
% filename      Full path to an inDrops.py sorted FASTQ file
% libname       String identifier for this library
%
% OPTIONAL INPUT NAME/VALUE PAIRS:
% 'thresh_cell'
%           Minimum number of reads required to keep a cell barcode
%
% 'thresh_UMI'
%           Minimum number of reads required within a cell to keep
%           a particular UMI
%
% OUPUTS:
% CellBC_WeightedHist.png
%           Weighted histogram of reads per cell barcode and cutoff;
%           Use to inspect 'thresh_cell'.
%
% UMI_WeightedHist.png
%           Weighted histogram of reads per UMI and cutoff;
%           Use to inspect 'thresh_UMI'.
%
% tracerSeqs.csv
%           Table of identified barcodes, each TracerSeq mRNA is a row
%           column1: inDrops cell barcode
%           column2: TracerSeq barcode sequence


%% PARAMETER SETTINGS
% Set defaults
def.thresh_cell = 100;
def.thresh_UMI = 100;
% Create parser object
parserObj = inputParser;
parserObj.FunctionName = 'parse_TracerFastQ';
parserObj.StructExpand = false;
parserObj.addOptional('thresh_cell',def.thresh_cell);
parserObj.addOptional('thresh_UMI',def.thresh_UMI);
% Parse input options
parserObj.parse(varargin{:});
settings = parserObj.Results;

%% Set up paths
if ~exist('plots', 'dir'); mkdir 'plots'; end

%% Load FASTQ file
FQT = struct2table(fastqread(filename));
FQT.Properties.VariableNames = {'Header', 'TracerSeq', 'Quality'};
disp('loaded FASTQ')

%% Extract cell barcode, UMI, and Tracer sequences
% cell arrays are MUCH faster than tables
% extract cell barcode by itself
T.CB = (regexp(FQT.Header,'^(.*?)(?=:)','match'));
T.CB = [T.CB{:}]';
% extract the UMI by itself
T.UMI = regexp(FQT.Header,'(?<=:)(.*)(?=:)', 'match');
T.UMI = [T.UMI{:}]';
% extract the TracerSeqs
T.TracerSeq = FQT.TracerSeq;
% dump the original table
clear FQT
disp('converted to cell arrays')

%% Identify cell barcodes with 'abundant' numbers of reads
[cb_seq, ~ ,cellbcode_ic] = unique(T.CB);
cellbcodes = 1:length(cb_seq);
cellbcodes_nReads = histc(cellbcode_ic,cellbcodes);
abundant_cellbcodes = cellbcodes_nReads > settings.thresh_cell;
clear T.CB;
disp('filtered abundant cell barcodes')

%% Plot weighted cell barcode abundance histogram
% plot weighted histogram of raw reads per cell to verify thresh_cell
figure('position',[300   300   400   400])
[counts centers] = hist(log10(cellbcodes_nReads),50);
y = counts.*(10.^centers)/sum(counts.*(10.^centers));
plot(centers,y*100,'linewidth',2)
nAbundant_cells = sum(abundant_cellbcodes);
nReads_from_abund_cells = sum(cellbcodes_nReads(cellbcodes_nReads>settings.thresh_cell));
fracReads_from_abund_cells = nReads_from_abund_cells/sum(cellbcodes_nReads);
mean_abundant = mean(cellbcodes_nReads(cellbcodes_nReads>settings.thresh_cell));
cv_abundant = std(cellbcodes_nReads(cellbcodes_nReads>settings.thresh_cell))/mean_abundant;
line([log10(settings.thresh_cell) log10(settings.thresh_cell)],[0 max(y*100)],'color','r','linewidth',1)
set(gca,'ylim',[0 max(y*100)])
set(gca,'fontsize',12)
xlabel('Log10(# Reads per Cell Barcode)')
ylabel('Fraction of Total Reads (%)')
st = subtitle({['Total # reads examined = ', num2str(sum(cellbcodes_nReads))],...
    ['# unique cell barcodes = ' num2str(sum(cellbcodes_nReads>0))],...
    ['# cell barcodes with > ' num2str(settings.thresh_cell) ' reads = ' num2str(nAbundant_cells)]...
    ['% reads from cells = ' num2str(100*fracReads_from_abund_cells,2) '%'],...
    ['# reads from cells = ' num2str(nReads_from_abund_cells)],...
    ['Avg. reads/cell = ' num2str(round(mean_abundant))],...
    ['CV reads/cell = ' num2str(round(100*cv_abundant)) '%' ],...
    ['Max reads/cell = ' num2str(max(cellbcodes_nReads))]},'TopLeft',[0.02 0.01]);
set(st,'fontsize',12)
title([libname ': Cell Barcode Abundance'],'Interpreter', 'none')
saveas(gcf, ['plots/' libname '_CellBC_WeightedHist.png'])

%% Generate a list of 'valid' UMIs for each cell
unique_UMIs_nReads_all_cells = [];
% index and loop through each cell one at a time
find_abundant_cellbcodes = find(abundant_cellbcodes);
for j = 1:length(find_abundant_cellbcodes)
    % get the original FQT row indices for this cell
    this_cells_FQT_inds = cellbcode_ic == find_abundant_cellbcodes(j);
    % get the list of all UMIs for this cell
    this_cells_UMIs = T.UMI(this_cells_FQT_inds);
    % determine read counts for each UMI detected in this cell
    [unique_UMIs_this_cell, ~, unique_UMIs_ic_this_cell] = unique(this_cells_UMIs);
    unique_UMIs_ind_this_cell = 1:length(unique_UMIs_this_cell);
    unique_UMIs_nReads_this_cell = histc(unique_UMIs_ic_this_cell,unique_UMIs_ind_this_cell);
    % keep track of the read per UMI counts across all cells
    unique_UMIs_nReads_all_cells = [unique_UMIs_nReads_all_cells; unique_UMIs_nReads_this_cell];
    % if this cell has any UMIs above the read count threshold,
    % then identify them and perform barcode validation
    if sum(unique_UMIs_nReads_this_cell > settings.thresh_UMI) > 0
        low_counts = unique_UMIs_nReads_this_cell < settings.thresh_UMI;
        unique_UMIs_this_cell(low_counts) = [];
        unique_UMIs_nReads_this_cell(low_counts) = [];
        % ignore UMIs that are likely to be 'edits' of other UMIs
        edit_abund_thresh = 2;
        edit_dist_thresh = 1;
        valid = get_barcode_edits(unique_UMIs_this_cell, unique_UMIs_nReads_this_cell, edit_abund_thresh, edit_dist_thresh, 'levenshtein');
        unique_UMIs_this_cell(~valid) = [];
        unique_UMIs_nReads_this_cell(~valid) = [];
        valid_UMIs{j} = unique_UMIs_this_cell;
        nValid_UMIs(j) = length(unique_UMIs_this_cell);
    else % otherwise skip this cell
        %disp(['skipped cell ' num2str(j)])
        nValid_UMIs(j) = 0;
        valid_UMIs{j} = [];
    end
end
disp('identified valid UMIs')

%% Plot oversequencing histogram: # reads per original transcript
% plot weighted histogram of raw reads per cell to verify thresh_UMI
figure('position',[300   300   400   400])
[counts, centers] = hist(log10(unique_UMIs_nReads_all_cells),50);
y = counts.*(10.^centers)/sum(counts.*(10.^centers));
plot(centers,y*100,'linewidth',2)
nAbundant_transcripts = sum(unique_UMIs_nReads_all_cells > settings.thresh_UMI);
nReads_from_abund_transcripts = sum(unique_UMIs_nReads_all_cells(unique_UMIs_nReads_all_cells>settings.thresh_UMI));
fracReads_from_abund_transcripts = nReads_from_abund_transcripts/sum(unique_UMIs_nReads_all_cells);
mean_abundant = mean(unique_UMIs_nReads_all_cells(unique_UMIs_nReads_all_cells>settings.thresh_UMI));
cv_abundant = std(unique_UMIs_nReads_all_cells(unique_UMIs_nReads_all_cells>settings.thresh_UMI))/mean_abundant;
line([log10(settings.thresh_UMI) log10(settings.thresh_UMI)],[0 max(y*100)],'color','r','linewidth',1)
set(gca,'ylim',[0 max(y*100)])
set(gca,'fontsize',12)
xlabel('# Reads per Transcript (Log10)')
ylabel('Fraction of Total Reads (%)')
st = subtitle({['Total # reads examined = ', num2str(sum(unique_UMIs_nReads_all_cells))],...
['# unique transcripts = ' num2str(sum(unique_UMIs_nReads_all_cells>0))],...
['# transcripts with > ' num2str(settings.thresh_UMI) ' reads = ' num2str(nAbundant_transcripts)]...
['% reads from abund. transcripts = ' num2str(100*fracReads_from_abund_transcripts,2) '%'],...
['# reads from abund. transcripts = ' num2str(nReads_from_abund_transcripts)],...
['Avg. reads/transcript = ' num2str(round(mean_abundant))],...
['CV reads/transcript = ' num2str(round(100*cv_abundant)) '%' ],...
['Max reads/transcript = ' num2str(max(unique_UMIs_nReads_all_cells))]},'TopLeft',[0.02 0.01]);
set(st,'fontsize',12)
title([libname ': Transcript oversequencing'],'Interpreter', 'none')
saveas(gcf, ['plots/' libname '_UMI_WeightedHist.png'])

%% Generate consensus sequences for each unique TRACER mRNA
% generate container for final consensus sequences
consensus_seqs = cell(1,2);
% also report the original cell barcodes
cell_names = cb_seq(abundant_cellbcodes)';
% loop through each cell
for j = 1:length(find_abundant_cellbcodes)
    % only proceed if this cell had at least one valid mRNA
    if nValid_UMIs(j) > 0
        % get the original FQT row indices for this cell
        this_cells_FQT_inds = cellbcode_ic == find_abundant_cellbcodes(j);
        % now loop through each validated UMI for this cell
        for k = 1:length(valid_UMIs{j})
            % display progress
            %clc; disp('Generating TRACER consensus sequences...'); disp(['Cell: ', num2str(j)]); disp(['mRNA: ', num2str(k)]);
            % get the original FQT rows corresponding to this UMI (logical)
            this_UMI_FQT_logical = strcmp(valid_UMIs{j}(k),T.UMI);
            % take the intersection of row indices for this cell and this UMI
            this_mRNA_FQT_inds = this_cells_FQT_inds & this_UMI_FQT_logical;
            % get the original Tracer sequences for this mRNA
            this_mRNA_Tracer_seqs = T.TracerSeq(this_mRNA_FQT_inds);
            % disregard reads that don't contain the flanking sequences;
            % these reads aren't going to contribute productively to a
            % multi-sequence alignment
            valid_flanks = contains(this_mRNA_Tracer_seqs,'TCTACAAATAAG') & contains(this_mRNA_Tracer_seqs,'TCATACGTATCC');
            this_mRNA_Tracer_seqs = this_mRNA_Tracer_seqs(valid_flanks);
            % sort reads by how close they are to the median read length
            % use up to 50 reads for constructing a multialignment & consensus
            individual_read_lengths = cellfun(@length, this_mRNA_Tracer_seqs);
            distance_to_median = abs(individual_read_lengths-median(individual_read_lengths));
            [~,close_to_median] = sort(distance_to_median, 'ascend');
            if length(this_mRNA_Tracer_seqs) > 50
                this_mRNA_Tracer_seqs = this_mRNA_Tracer_seqs(close_to_median(1:50));
            end
            % get the consensus sequence of the TracerSeq barcode
            if length(this_mRNA_Tracer_seqs) >= 3 % can't align fewer than 3 sequences
                % expose all gaps as '-', then use regexprep to exclude them from the final consensus
                untrimmed_consensus = regexprep(seqconsensus(multialign(this_mRNA_Tracer_seqs, 'ScoringMatrix', 'NUC44'), 'Alphabet', 'NT', 'ScoringMatrix', 'NUC44', 'Gaps', 'all'),'-','');
                % trim flanking sequences
                trimmed_consensus = regexp(untrimmed_consensus,'(?<=TCTACAAATAAG)(.*)(?=TCATACGTATCC)', 'match');
            else
                untrimmed_consensus = []; trimmed_consensus = [];
            end
            % now for this k mRNA in this j cell, we have a consensus sequence for
            % the TRACER barcode
            if ~isempty(trimmed_consensus)
                results_this_cell{k,1} = cell_names{j};
                results_this_cell{k,2} = trimmed_consensus{:};
            else
                results_this_cell{k,1} = cell_names{j};
                results_this_cell{k,2} = '';
            end
        end
        % append this cell to the final results table
        consensus_seqs = [consensus_seqs; results_this_cell];
        clear results_this_cell
    end
end
disp('generated Tracer consensus seqs')

%% Save results to file
% remove short/empty TracerSeq barcodes
consensus_seqs((cellfun(@length,consensus_seqs(:,2))<15),:) = [];
% write csv
writetable(cell2table(consensus_seqs), [libname '_tracerSeqs.csv'],'WriteVariableNames',0);
disp('wrote files')