function hpaData=parseHPA(fileName,version)
% parseHPA
%   Parses a database dump of the Human Protein Atlas (HPA)
%
%   fileName            comma- or tab-separated database dump of HPA. For
%                       details regarding the format, see
%                       http://www.proteinatlas.org/about/download.
%   version             version of HPA [optional, default=17]. The value 17
%                       selects the tab-separated layout with the columns
%                       Gene, Gene name, Tissue, Cell type, Level and
%                       Reliability. Any other value selects the older
%                       comma-separated layout with the columns Gene,
%                       Tissue, Cell type, Level, Expression type and
%                       Reliability
%
%   hpaData
%       genes               cell array with the unique gene identifiers. In
%                           version 17 this is the Ensembl identifier, see
%                           geneNames below for the names in version 17
%       geneNames           cell array with the gene names, indexed the
%                           same way as genes. Only exists in version 17
%       tissues             cell array with the tissue names. The list may
%                           not be unique, as there can be multiple cell
%                           types per tissue
%       celltypes           cell array with the cell type names for each
%                           tissue
%       levels              cell array with the unique expression levels
%       types               cell array with the unique evidence types.
%                           Doesn't exist in version 17
%       reliabilities       cell array with the unique reliability levels
%
%       gene2Level          gene-to-expression level mapping in sparse
%                           matrix form. The value for element i,j is the
%                           index in hpaData.levels of gene i in cell type j
%       gene2Type           gene-to-evidence type mapping in sparse matrix
%                           form. The value for element i,j is the index in
%                           hpaData.types of gene i in cell type j. Doesn't
%                           exist in version 17
%       gene2Reliability    gene-to-reliability level mapping in sparse
%                           matrix form. The value for element i,j is the
%                           index in hpaData.reliabilities of gene i in
%                           cell type j
%
%   Usage: hpaData=parseHPA(fileName,version)
%
%   Johan Gustafsson, 2017-10-10

if nargin<2
    % Change this and add code for more versions when the current HPA
    % version is increased and the format is changed
    version=17;
end

if version==17
    headers={'Gene' 'Gene name' 'Tissue' 'Cell type' 'Level' 'Reliability'};
    hpa=readHPAColumns(fileName,'\t',headers);

    % Get the unique values of each data type
    [hpaData.genes,P,I]=unique(hpa{1});
    % Pick one gene name per unique gene so that geneNames is indexed the
    % same way as genes
    hpaData.geneNames=hpa{2}(P);
    % A tissue/cell type pair is identified by the two names joined with a
    % separator character
    [~,J,K]=unique(strcat(hpa{3},'€',hpa{4}));
    hpaData.tissues=hpa{3}(J);
    hpaData.celltypes=hpa{4}(J);
    [hpaData.levels,~,L]=unique(hpa{5});
    [hpaData.reliabilities,~,N]=unique(hpa{6});

    % Map the data to be sparse matrices instead.
    % NOTE: sparse adds together the values of repeated (i,j) pairs, so
    % the stored indexes are only meaningful if each gene/cell type pair
    % occurs at most once in the file.
    nGenes=numel(hpaData.genes);
    nCellTypes=numel(hpaData.tissues);
    hpaData.gene2Level=sparse(I,K,L,nGenes,nCellTypes);
    hpaData.gene2Reliability=sparse(I,K,N,nGenes,nCellTypes);
else
    headers={'Gene' 'Tissue' 'Cell type' 'Level' 'Expression type' 'Reliability'};
    hpa=readHPAColumns(fileName,',',headers);

    % Get the unique values of each data type
    [hpaData.genes,~,I]=unique(hpa{1});
    [~,J,K]=unique(strcat(hpa{2},'€',hpa{3}));
    hpaData.tissues=hpa{2}(J);
    hpaData.celltypes=hpa{3}(J);
    [hpaData.levels,~,L]=unique(hpa{4});
    [hpaData.types,~,M]=unique(hpa{5});
    [hpaData.reliabilities,~,N]=unique(hpa{6});

    % Map the data to be sparse matrices instead (see the note on repeated
    % pairs in the version 17 branch)
    nGenes=numel(hpaData.genes);
    nCellTypes=numel(hpaData.tissues);
    hpaData.gene2Level=sparse(I,K,L,nGenes,nCellTypes);
    hpaData.gene2Type=sparse(I,K,M,nGenes,nCellTypes);
    hpaData.gene2Reliability=sparse(I,K,N,nGenes,nCellTypes);
end

function hpa=readHPAColumns(fileName,delimiter,headers)
% readHPAColumns
%   Reads the six columns of an HPA dump, checks the header row against
%   the expected headers and returns the columns without the header row.
%
%   fileName    path to the file to read
%   delimiter   column delimiter passed on to textscan
%   headers     cell array with the expected header of each column
%
%   hpa         cell array with one cell array of strings per column

fid=fopen(fileName,'r');
if fid==-1
    error('parseHPA:couldNotOpenFile','Could not open the file "%s"',fileName);
end
% Make sure that the file is closed also if textscan throws
closeFile=onCleanup(@() fclose(fid));
hpa=textscan(fid,'%q %q %q %q %q %q','Delimiter',delimiter);

% Go through and see if the headers match what was expected
for i=1:numel(headers)
    % An empty column (e.g. an empty file) is reported as a missing header
    % rather than as an indexing error
    if isempty(hpa{i}) || ~strcmpi(headers{i},hpa{i}{1})
        EM=['Could not find the header "' headers{i} '". Make sure that the input file matches the format specified at http://www.proteinatlas.org/about/download'];
        dispEM(EM);
    end
    % Remove the header line here
    if ~isempty(hpa{i})
        hpa{i}(1)=[];
    end
end