% Source: hatex/2015_Spring/BISC-577/project4.tex (saketkc/hatex on GitHub)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Structured General Purpose Assignment
% LaTeX Template
%
% This template has been downloaded from:
% http://www.latextemplates.com
%
% Original author:
% Ted Pavlic (http://www.tedpavlic.com)
%
% Note:
% The \lipsum[#] commands throughout this template generate dummy text
% to fill the template out. These commands should all be removed when
% writing assignment content.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%----------------------------------------------------------------------------------------
%	PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------

\documentclass{article}

\usepackage{fancyhdr} % Required for custom headers
\usepackage{lastpage} % Required to determine the last page for the footer
\usepackage{extramarks} % Required for headers and footers
\usepackage{graphicx} % Required to insert images
\usepackage{latexsym}
\usepackage{mathtools}

\usepackage{lipsum} % Used for inserting dummy 'Lorem ipsum' text into the template
%\usepackage[]{algorithm2e}
%\usepackage{algorithmicx}
%\usepackage{algorithm}
%\usepackage{algorithm}
%\usepackage{algorithmic}
%\usepackage{algpseudocode}
%\usepackage{algcompatible}


\usepackage{algorithm}
\usepackage{algorithmic}
%\usepackage{algorithmicx}


%\usepackage{algpseudocode}

%\usepackage{algpseudocode}

%\usepackage[noend]{algpseudocode}
% Relabel the keywords printed by \REQUIRE and \ENSURE as Input/Output.
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
% Two extra keywords, plus the statement macros that print them.
\newcommand{\algorithmicbreak}{\textbf{break}}
\newcommand{\algorithmicgiven}{\textbf{Given:}}
% \BREAK: an ordinary (numbered) statement line reading "break".
\newcommand{\BREAK}{\STATE \algorithmicbreak}
% \GIVEN: an unnumbered "Given:" line.  The `algorithmic' package loaded above
% does not define \STATEx (that macro comes from algcompatible/algorithmicx),
% so the line is emitted as a list item with an empty label instead.
\newcommand{\GIVEN}{\item[] \algorithmicgiven}
%\def\NoNumber#1{{\def\alglinenumber##1{}\State #1}\addtocounter{ALG@line}{-1}}

\usepackage{amsmath}
%\usepackage{multline}

% Page layout lengths
\setlength{\topmargin}{-0.45in}
\setlength{\evensidemargin}{0in}
\setlength{\oddsidemargin}{0in}
\setlength{\textwidth}{6.5in}
\setlength{\textheight}{9.0in}
\setlength{\headsep}{0.25in}

\linespread{1.1} % Stretch the baseline distance by a factor of 1.1

% Running header and footer (fancyhdr)
\pagestyle{fancy}
\fancyhead[L]{\hmwkAuthorName} % Left header: author name
\fancyhead[C]{\hmwkClass\ : \hmwkTitle} % Centre header: class and assignment title
\fancyhead[R]{\firstxmark} % Right header: first extra mark on the page
\fancyfoot[L]{\lastxmark} % Left footer: last extra mark on the page
\fancyfoot[C]{} % Centre footer is left empty
\fancyfoot[R]{Page\ \thepage\ of\ \pageref{LastPage}} % Right footer: "Page X of Y"
\renewcommand{\headrulewidth}{0.4pt} % Thickness of the header rule
\renewcommand{\footrulewidth}{0.4pt} % Thickness of the footer rule

\setlength{\parindent}{0pt} % No indentation at the start of paragraphs

%----------------------------------------------------------------------------------------
%	DOCUMENT STRUCTURE COMMANDS
%	Skip this unless you know what you're doing
%----------------------------------------------------------------------------------------

% \enterProblemHeader{<name>}: sets the extra marks used when a page split
% occurs within a problem environment.  Two \extramarks pairs are issued: the
% first has <name> as its first mark, the second has "<name> (continued)";
% both use "<name> continued on next page..." as the second mark.
% The % after the opening brace keeps the line end from inserting a space
% token when the macro is expanded in the middle of a paragraph.
\newcommand{\enterProblemHeader}[1]{%
  \nobreak\extramarks{#1}{#1 continued on next page\ldots}\nobreak
  \nobreak\extramarks{#1 (continued)}{#1 continued on next page\ldots}\nobreak
}

% \exitProblemHeader{<name>}: sets the extra marks used when a page split
% occurs between problem environments.  The first \extramarks pair still
% announces a continuation; the second resets the marks to <name> with an
% empty second mark.
% The % after the opening brace keeps the line end from inserting a space
% token when the macro is expanded in the middle of a paragraph.
\newcommand{\exitProblemHeader}[1]{%
  \nobreak\extramarks{#1 (continued)}{#1 continued on next page\ldots}\nobreak
  \nobreak\extramarks{#1}{}\nobreak
}

\setcounter{secnumdepth}{0} % Removes default section numbers
\newcounter{homeworkProblemCounter} % Creates a counter to keep track of the number of problems

% Holds the name of the current problem; redefined by each homeworkProblem.
\newcommand{\homeworkProblemName}{}
% homeworkProblem environment.
%   Optional argument: custom problem name (default "Problem <counter>").
%   Begin: steps the counter, stores the name, opens a \section with it and
%          sets the in-problem extra marks.
%   End:   sets the after-problem extra marks.
% Every line inside the definition ends in % so that neither the line ends nor
% the blanks before the trailing comments become space tokens in the output.
\newenvironment{homeworkProblem}[1][Problem \arabic{homeworkProblemCounter}]{%
  \stepcounter{homeworkProblemCounter}% Increase counter for number of problems
  \renewcommand{\homeworkProblemName}{#1}% Assign \homeworkProblemName the name of the problem
  \section{\homeworkProblemName}% Make a section in the document with the problem name
  \enterProblemHeader{\homeworkProblemName}% Header and footer within the environment
}{%
  \exitProblemHeader{\homeworkProblemName}% Header and footer after the environment
}

% \problemAnswer{<content>}: typesets <content> in a minipage of width
% 0.98\columnwidth, centred inside a frame that is \columnwidth wide.
% The % at the line ends stops stray space tokens from being placed before
% and after the framed box.
\newcommand{\problemAnswer}[1]{%
  \noindent\framebox[\columnwidth][c]{\begin{minipage}{0.98\columnwidth}#1\end{minipage}}%
}

% Holds the name of the current section; redefined by each homeworkSection.
\newcommand{\homeworkSectionName}{}
% homeworkSection environment, for sections within homework problems.
%   Argument: the name of the section.
%   Begin: stores the name, opens a \subsection with it and sets the extra
%          marks to "<problem name> [<section name>]".
%   End:   sets the extra marks back to the bare problem name.
% Every line inside the definition ends in % so that neither the line ends nor
% the blanks before the trailing comments become space tokens in the output.
\newenvironment{homeworkSection}[1]{%
  \renewcommand{\homeworkSectionName}{#1}% Assign \homeworkSectionName the name of the section
  \subsection{\homeworkSectionName}% Make a subsection with the custom name
  \enterProblemHeader{\homeworkProblemName\ [\homeworkSectionName]}% Header and footer within the environment
}{%
  \enterProblemHeader{\homeworkProblemName}% Header and footer after the environment
}

%----------------------------------------------------------------------------------------
%	NAME AND CLASS SECTION
%----------------------------------------------------------------------------------------
% Paired delimiters (mathtools): \ceil{x} and \floor{x}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
% Assignment metadata, used by the title block and the running header
\newcommand{\hmwkTitle}{Project\ \# 4 } % Assignment title
\newcommand{\hmwkDueDate}{Tuesday,\ May\ 05,\ 2015} % Due date
\newcommand{\hmwkClass}{BISC-577} % Course/class
\newcommand{\hmwkClassTime}{11:00am} % Class/lecture time
\newcommand{\hmwkAuthorName}{Saket Choudhary} % Your name
\newcommand{\hmwkAuthorID}{2170058637} % Author's ID number
%----------------------------------------------------------------------------------------
%	TITLE PAGE
%----------------------------------------------------------------------------------------

% Title: class and assignment title in bold, followed by the due date.
% The \vspace commands add 2in above and 3in below the title text.
\title{
\vspace{2in}
\textmd{\textbf{\hmwkClass:\ \hmwkTitle}}\\
\normalsize\vspace{0.1in}\small{Due\ on\ \hmwkDueDate}\\
%\vspace{0.1in}\large{\textit{\hmwkClassTime}}
\vspace{3in}
}

% Author: name on the first line, ID on the second, both in bold.
\author{\textbf{\hmwkAuthorName} \\
	\textbf{\hmwkAuthorID}
	}
\date{} % Insert date here if you want it to appear below your name

%----------------------------------------------------------------------------------------

\begin{document}

\maketitle

%----------------------------------------------------------------------------------------
%	TABLE OF CONTENTS
%----------------------------------------------------------------------------------------

%\setcounter{tocdepth}{1} % Uncomment this line if you don't want subsections listed in the ToC

\newpage
\tableofcontents
\newpage


\begin{homeworkSection}{Question \# 1} % Section within problem

\problemAnswer{

 \textbf{(A)}: mRNAs are a family of RNAs that, upon translation, result in a sequence of amino acids as specified by the corresponding codons, as a result of gene expression. \\
 \textbf{(B)}: Transfer RNAs (tRNAs) serve as carriers of the amino acids, transporting them to the ribosomes. Amino-acid-codon matching happens via the presence of an anticodon and is specific.\\
 \textbf{(C)}: Introns are `intragenic' regions that do not code for proteins and hence are absent from the mature RNA, as they are removed via splicing. Exons, on the other hand, are the `coding' regions of DNA. Mature RNA consists primarily of exons.\\
 \textbf{(D)}: Alternative splicing which involves removal of non-coding regions also gives rise to the possibility of multiple proteins being translated from the same gene depending on which exons are included and which ones are excluded. RNA silencing is another such process that increases RNA variability. \\
 \textbf{(E)}: The coding region of an RNA consists of the exons that code for a protein. The 5' UTRs and 3' UTRs, which are also part of exons, lie upstream of the initiation codon and downstream of the termination codon, respectively, and both act as post-transcriptional regulators. UTRs are not translated into proteins.\\


}

\end{homeworkSection}

\begin{homeworkSection}{Question \# 2}
	\problemAnswer{
		Gene ID: 3569 [IL6 interleukin 6]\\
		HUGO: HGNC:HGNC:6018 \\

		B)
		\texttt{http://www.ncbi.nlm.nih.gov/gene/3569} (section ``Genomic regions, transcripts, and products'')

\textbf{Exon count: $6$}
	\textbf{From RefSeq:}
			\begin{tabular}{|c|c|c|}
				\hline Transcript  & Length(nucleotides bp)  & No. of exons \\
				\hline $XM\_011515390.1$ & 2555  & - \\
				\hline $XM\_005249745.3$ &  1969 & - \\
				\hline $XM\_011515391.1$ & 978 & -\\
				\hline
			\end{tabular}

	I was not able to locate the number of exons on NCBI. So the number of exons is not indicated.

\textbf{From Ensembl:}

			\begin{tabular}{|c|c|c|}
				\hline Transcript  & Length(nucleotides bp)  & No. of exons \\
				\hline $XM\_005249745.2$ & 1412  & 3 \\
				\hline $NM\_000600.3$ & 1184 & 5 \\
				\hline
			\end{tabular}


	\textbf{(C)} None of the transcripts have the same number of exons as the original gene(6). This is expected, since the mature mRNA
	is a result of alternative splicing resulting in few exons being assembled while the introns are chunked off.\\

	\textbf{(D)} Ensembl. The transcripts do not match. The $XM\_*$ accessions come from NCBI's automated eukaryotic genome annotation pipeline and are `predicted' transcripts, while the $NM\_*$ ones are curated. They likely differ because the $XM\_*$ predicted transcripts are refreshed periodically and the change might not be reflected on Ensembl at the same time.

    }
\end{homeworkSection}


\begin{homeworkSection}{Question \# 3}
	\problemAnswer{

		\textbf{A} Splicing leads to removal of introns resulting in joining of exons. This process
		can suffer a lot of variation, and hence mapping to a single reference, as in the case of DNA, is often not possible.

        \textbf{B} Project Accession: \texttt{http://www.ncbi.nlm.nih.gov/bioproject/PRJNA257207}
        There are two types of samples, investigating the reorganization of the nuclear architecture of human fibroblasts and
        MSCs. One type of sample comes from an early passage of replicative senescence, while the other set
        is in late passage.
        %the efficiency of trastuzumab efficiency: cancer cell line,
        %trastuzumab resistant version of the cell line and cell line under trastuzumab treatment.
       There are three biological replicates for each of the two conditions but no technical replicates (a single run in each experiment).\\

        \textbf{C}  SRR1533801.fastq: 34507899 \\
           SRR1533804.fastq: 31246550 \\

        \textbf{D} Tophat is essentially an aligner that internally uses bowtie2. Reads from RNA-seq experiment
        will involve spliced regions. Hence a single read could have originally originated from two regions(exons) that are far apart
        on the genome(the reference sequence). Tophat first aligns the reads to the references, some of the reads will remain unmapped, possibly
        due to the alternative splicing(other reason might be contamination, mutations etc) in which case Tophat takes these
        unmapped reads and then infers the splice site regions. Bowtie2 cannot handle aligning reads by splitting (allowing very large gaps).\\

        \textbf{E}
        \texttt{sort -k5,5 junctions.bed | tail} \\
        SP1: "chr2    216243994       216245583       JUNC00073269    9995    -       216243994       216245583       255,0,0 2       46,50   0,1539"\\
        SP2: "chr19   55897756        55897987        JUNC00054766    999     +       55897756        55897987        255,0,0 2       50,50   0,181"\\
        SRR1533801: \\
        Number of splice sites: 125761\\
        Splice junction with max reads: SP1(above)\\
        Number of reads at SP1: 9995\\

        SRR1533804: \\
        Number of splice sites: 95823\\
        Splice junction with max reads: SP2(above)\\
        Number of reads at SP2: 999\\


        \textbf{F}
        \texttt{awk '\{printf "\%s\%d\%d\%d\%d",\$1,\$3-\$2,\$5,\$2,\$3\}' junctions.bed | sort -k2,2 | tail}\\
            SRR1533801 longest junction site(SP1): "chr4    9999    7       186231930       186241929"\\
            SRR1533805 longest junction site(SP2): "chr5    9999    40      168139310       168149309"\\
        Columns: [chr] [length] [number of reads] [start] [end]

    }
\end{homeworkSection}


\begin{homeworkSection}{Question \# 4}
    \problemAnswer{
        \textbf{A} Cufflinks takes an alignment file, assembles the transcript and estimates their abundance testing for
        differential expression.

        \textbf{B} "chr7    unknown stop\_codon      22771190        22771192        .       +       .       gene\_id IL6; gene\_name 'IL6'; p\_id 'P4693'; transcript\_id 'NM\_000600'; tss\_id 'TSS3170';"

Command used: \texttt{grep -r 'IL6' genes.gtf | grep NM}\\
        The 'genes.gtf' was downloaded from tophat's website and came bundled with other indices(iGenome bundle)

        \textbf{C} FPKM (RPKM for single-end reads) is a normalized count that measures the abundance of transcripts.
        Normalization is essential to adjust for the number of sequenced and mapped reads.

        SRR1533804.fastq gene: "CUFF.8926       -       -       CUFF.8926       -       -       chr14:24702148-24702409 -       -       9.99986 4.20454 15.7952 OK"\\
        SRR1533804.fastq transcript: "CUFF.8926.1     -       -       CUFF.8926       -       -       chr14:24702148-24702409 261     9.13859 9.99986 4.20454 15.7952 OK"\\

        The 10\textsuperscript{th} column indicates the abundance, which is 9.99986 for both the gene and the transcript of SRR1533804.


    }

\end{homeworkSection}

\begin{homeworkSection}{Question \# 5}
    \problemAnswer{

        \textbf{A} "cuffdiff transcripts.gtf SRR1533801.bam SRR1533804.bam"\\

        \textbf{B}
       "SERPINA9        SERPINA9        -       chr14:94929057-94942670 q1      q2      OK      0       0.555802        inf     -nan    0.00015 0.0408021   y"
       The logFC turns out to be inf, probably indicating a novel transcript not found in the original gtf file.

        Command: "sort -k10,10 genediff | head"\\

    \textbf{C} Given a control and a treatment experiment, the differences can arise due to multiple attributes.
    As the number of attributes increases, the probability of a difference between control and experiment groups
    will tend to increase. Correcting for multiple testing adjusts this probability for those multiple attributes
    to reflect the corrected probability.

    }
\end{homeworkSection}

\begin{homeworkSection}{Question \# 6}
    \problemAnswer{

        \textbf{A} I did not get any output for differential splicing; splicingdiff was empty. I think I did the `cuffdiff' part incorrectly. I tried merging
        the gtf for control and treatment, but it seemed to fail with an invalid transcript id error.
        \\
\textbf{B} cuffdiff finds significant changes in transcript levels. Given two samples and the number of reads
mapping to each transcript, cuffdiff performs a hypothesis test of how likely the change is due to the difference
in two groups rather than just by chance.
}
\end{homeworkSection}


\end{document}
p