function hpaData=parseHPA(fileName,version)
% parseHPA
%   Parses a database dump of the Human Protein Atlas (HPA)
%
%   fileName            comma- or tab-separated database dump of HPA. For
%                       details regarding the format, see
%                       http://www.proteinatlas.org/about/download.
%   version             version of HPA (opt, default 17). The value 17
%                       selects the tab-separated layout with the columns
%                       Gene, Gene name, Tissue, Cell type, Level and
%                       Reliability. Any other value selects the older
%                       comma-separated layout with the columns Gene,
%                       Tissue, Cell type, Level, Expression type and
%                       Reliability. An empty value is treated as omitted
%
%   hpaData
%       genes               cell array with the unique values of the "Gene"
%                           column. In version 17 this is the Ensembl
%                           identifier, see geneNames below for the names
%       geneNames           cell array with the gene names, indexed the
%                           same way as genes. Only exists in version 17
%       tissues             cell array with the tissue names. The list may
%                           not be unique, as there can be multiple cell
%                           types per tissue
%       celltypes           cell array with the cell type names for each
%                           tissue
%       levels              cell array with the unique values of the
%                           "Level" column
%       types               cell array with the unique values of the
%                           "Expression type" column. Doesn't exist in
%                           version 17
%       reliabilities       cell array with the unique values of the
%                           "Reliability" column
%
%       gene2Level          gene-to-level mapping in sparse matrix form.
%                           The value for element i,j is the index in
%                           hpaData.levels of gene i in cell type j
%       gene2Type           gene-to-evidence type mapping in sparse matrix
%                           form. The value for element i,j is the index in
%                           hpaData.types of gene i in cell type j. Doesn't
%                           exist in version 17
%       gene2Reliability    gene-to-reliability level mapping in sparse
%                           matrix form. The value for element i,j is the
%                           index in hpaData.reliabilities of gene i in
%                           cell type j
%
%   Usage: hpaData=parseHPA(fileName,version)
%
%   Rasmus Agren, 2014-01-08
%   Johan Gustafsson, 2017-10-10

% NOTE: the parameter name "version" hides MATLAB's built-in version
% function inside this file. It is kept so that the interface is unchanged.
% Change the default and add code for more versions when the current HPA
% version is increased and the format is changed.
if nargin<2 || isempty(version)
    version=17;
end

if version==17
    headers={'Gene' 'Gene name' 'Tissue' 'Cell type' 'Level' 'Reliability'};
    hpa=readHPAColumns(fileName,'\t',headers);

    % Get the unique values of each data type. P holds the row of the first
    % occurrence of each gene, so geneNames gets the same indexing as genes
    [hpaData.genes,P,I]=unique(hpa{1});
    hpaData.geneNames=hpa{2}(P);
    % Tissue and cell type are joined so that each pair becomes one column
    [~,J,K]=unique(strcat(hpa{3},'€',hpa{4}));
    hpaData.tissues=hpa{3}(J);
    hpaData.celltypes=hpa{4}(J);
    [hpaData.levels,~,L]=unique(hpa{5});
    [hpaData.reliabilities,~,N]=unique(hpa{6});

    % Map the data to sparse matrices instead.
    % NOTE(review): sparse adds up the values of repeated (i,j) pairs, so
    % this assumes that each gene/cell type pair occurs once in the file -
    % confirm against the HPA dumps in use
    nGenes=numel(hpaData.genes);
    nCellTypes=numel(hpaData.tissues);
    hpaData.gene2Level=sparse(I,K,L,nGenes,nCellTypes);
    hpaData.gene2Reliability=sparse(I,K,N,nGenes,nCellTypes);
else
    headers={'Gene' 'Tissue' 'Cell type' 'Level' 'Expression type' 'Reliability'};
    hpa=readHPAColumns(fileName,',',headers);

    % Get the unique values of each data type
    [hpaData.genes,~,I]=unique(hpa{1});
    % Tissue and cell type are joined so that each pair becomes one column
    [~,J,K]=unique(strcat(hpa{2},'€',hpa{3}));
    hpaData.tissues=hpa{2}(J);
    hpaData.celltypes=hpa{3}(J);
    [hpaData.levels,~,L]=unique(hpa{4});
    [hpaData.types,~,M]=unique(hpa{5});
    [hpaData.reliabilities,~,N]=unique(hpa{6});

    % Map the data to sparse matrices instead (see the note about repeated
    % pairs in the version 17 branch)
    nGenes=numel(hpaData.genes);
    nCellTypes=numel(hpaData.tissues);
    hpaData.gene2Level=sparse(I,K,L,nGenes,nCellTypes);
    hpaData.gene2Type=sparse(I,K,M,nGenes,nCellTypes);
    hpaData.gene2Reliability=sparse(I,K,N,nGenes,nCellTypes);
end

function columns=readHPAColumns(fileName,delimiter,headers)
% readHPAColumns
%   Reads the six columns of a HPA dump, checks each header against the
%   expected one and returns the columns with the header row removed.
%
%   fileName    path to the file to read
%   delimiter   field delimiter passed on to textscan
%   headers     cell array with the expected header of each column
%
%   columns     1x6 cell array as returned by textscan, without headers

fid=fopen(fileName,'r');
if fid==-1
    % Report this the same way as a header mismatch, rather than letting
    % textscan fail on an invalid file identifier
    EM=['Could not open the file "' fileName '"'];
    dispEM(EM);
end
% Make sure that the file is closed also if textscan or the header check
% throws an error
closeFile=onCleanup(@() fclose(fid));
columns=textscan(fid,'%q %q %q %q %q %q','Delimiter',delimiter);

% Go through and see if the headers match what was expected
for i=1:numel(headers)
    hasRows=~isempty(columns{i});
    % An empty column (e.g. from an empty file) counts as a missing header
    if ~hasRows || ~strcmpi(headers{i},columns{i}{1})
        EM=['Could not find the header "' headers{i} '". Make sure that the input file matches the format specified at http://www.proteinatlas.org/about/download'];
        dispEM(EM);
    end
    % Remove the header line here
    if hasRows
        columns{i}(1)=[];
    end
end
0 commit comments