Skip to content

Commit 6bfea32

Browse files
author
Simonas Marcišauskas
authored
Merge pull request #29 from johan-gson/master
Adapted the HPA code to be able to import the tsv files published for…
2 parents 4960794 + a008222 commit 6bfea32

File tree

1 file changed

+71
-31
lines changed

1 file changed

+71
-31
lines changed

hpa/parseHPA.m

Lines changed: 71 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
1-
function hpaData=parseHPA(fileName)
1+
function hpaData=parseHPA(fileName, version)
22
% parseHPA
33
% Parses a database dump of the Human Protein Atlas (HPA)
44
%
5-
% fileName comma-separated database dump of HPA. For details
5+
% fileName comma- or tab-separated database dump of HPA. For details
66
% regarding the format, see
77
% http://www.proteinatlas.org/about/download.
8+
% version version of HPA [optional, default=17]
89
%
910
% hpaData
10-
% genes cell array with the unique gene names
11+
% genes cell array with the unique gene names. In
12+
% version 17 this is the Ensembl gene ID, see
13+
% geneNames below for the names in ver 17
14+
% geneNames cell array with the gene names, indexed the
15+
% same way as genes.
1116
% tissues cell array with the tissue names. The list may not be
1217
% unique, as there can be multiple cell types per tissue
1318
% celltypes cell array with the cell type names for each tissue
@@ -20,7 +25,8 @@
2025
% hpaData.levels of gene i in cell type j
2126
% gene2Type gene-to-evidence type mapping in sparse matrix form.
2227
% The value for element i,j is the index in
23-
% hpaData.types of gene i in cell type j
28+
% hpaData.types of gene i in cell type j. Doesn't
29+
% exist in version 17.
2430
% gene2Reliability gene-to-reliability level mapping in sparse matrix form.
2531
% The value for element i,j is the index in
2632
% hpaData.reliabilities of gene i in cell type j
@@ -29,33 +35,67 @@
2935
% Usage: hpaData=parseHPA(fileName, version)
3036
%
3137
% Rasmus Agren, 2014-01-08
32-
%
38+
% Johan Gustafsson, 2017-10-10
39+
40+
if nargin<2
41+
version=17; %Change this and add code for more versions when the current HPA version is increased and the format is changed
42+
end;
43+
44+
if (version == 17)
45+
fid=fopen(fileName,'r');
46+
hpa=textscan(fid,'%q %q %q %q %q %q','Delimiter','\t');
47+
fclose(fid);
48+
49+
%Go through and see if the headers match what was expected
50+
headers={'Gene' 'Gene name' 'Tissue' 'Cell type' 'Level' 'Reliability'};
51+
for i=1:numel(headers)
52+
if ~strcmpi(headers(i),hpa{i}(1))
53+
EM=['Could not find the header "' headers{i} '". Make sure that the input file matches the format specified at http://www.proteinatlas.org/about/download'];
54+
dispEM(EM);
55+
end
56+
%Remove the header line here
57+
hpa{i}(1)=[];
58+
end
59+
60+
%Get the unique values of each data type
61+
[hpaData.genes, P, I]=unique(hpa{1});
62+
hpaData.geneNames=hpa{2}(P); %make this vector use the index as genes
63+
[~, J, K]=unique(strcat(hpa{3},'',hpa{4}));
64+
hpaData.tissues=hpa{3}(J);
65+
hpaData.celltypes=hpa{4}(J);
66+
[hpaData.levels, ~, L]=unique(hpa{5});
67+
[hpaData.reliabilities, ~, N]=unique(hpa{6});
3368

34-
fid=fopen(fileName,'r');
35-
hpa=textscan(fid,'%q %q %q %q %q %q','Delimiter',',');
36-
fclose(fid);
69+
%Map the data to be sparse matrices instead
70+
hpaData.gene2Level=sparse(I,K,L,numel(hpaData.genes),numel(hpaData.tissues));
71+
hpaData.gene2Reliability=sparse(I,K,N,numel(hpaData.genes),numel(hpaData.tissues));
72+
else
73+
fid=fopen(fileName,'r');
74+
hpa=textscan(fid,'%q %q %q %q %q %q','Delimiter',',');
75+
fclose(fid);
3776

38-
%Go through and see if the headers match what was expected
39-
headers={'Gene' 'Tissue' 'Cell type' 'Level' 'Expression type' 'Reliability'};
40-
for i=1:numel(headers)
41-
if ~strcmpi(headers(i),hpa{i}(1))
42-
EM=['Could not find the header "' headers{i} '". Make sure that the input file matches the format specified at http://www.proteinatlas.org/about/download'];
43-
dispEM(EM);
77+
%Go through and see if the headers match what was expected
78+
headers={'Gene' 'Tissue' 'Cell type' 'Level' 'Expression type' 'Reliability'};
79+
for i=1:numel(headers)
80+
if ~strcmpi(headers(i),hpa{i}(1))
81+
EM=['Could not find the header "' headers{i} '". Make sure that the input file matches the format specified at http://www.proteinatlas.org/about/download'];
82+
dispEM(EM);
83+
end
84+
%Remove the header line here
85+
hpa{i}(1)=[];
4486
end
45-
%Remove the header line here
46-
hpa{i}(1)=[];
47-
end
48-
49-
%Get the unique values of each data type
50-
[hpaData.genes, ~, I]=unique(hpa{1});
51-
[~, J, K]=unique(strcat(hpa{2},'',hpa{3}));
52-
hpaData.tissues=hpa{2}(J);
53-
hpaData.celltypes=hpa{3}(J);
54-
[hpaData.levels, ~, L]=unique(hpa{4});
55-
[hpaData.types, ~, M]=unique(hpa{5});
56-
[hpaData.reliabilities, ~, N]=unique(hpa{6});
57-
58-
%Map the data to be sparse matrises instead
59-
hpaData.gene2Level=sparse(I,K,L,numel(hpaData.genes),numel(hpaData.tissues));
60-
hpaData.gene2Type=sparse(I,K,M,numel(hpaData.genes),numel(hpaData.tissues));
61-
hpaData.gene2Reliability=sparse(I,K,N,numel(hpaData.genes),numel(hpaData.tissues));
87+
88+
%Get the unique values of each data type
89+
[hpaData.genes, ~, I]=unique(hpa{1});
90+
[~, J, K]=unique(strcat(hpa{2},'',hpa{3}));
91+
hpaData.tissues=hpa{2}(J);
92+
hpaData.celltypes=hpa{3}(J);
93+
[hpaData.levels, ~, L]=unique(hpa{4});
94+
[hpaData.types, ~, M]=unique(hpa{5});
95+
[hpaData.reliabilities, ~, N]=unique(hpa{6});
96+
97+
%Map the data to be sparse matrices instead
98+
hpaData.gene2Level=sparse(I,K,L,numel(hpaData.genes),numel(hpaData.tissues));
99+
hpaData.gene2Type=sparse(I,K,M,numel(hpaData.genes),numel(hpaData.tissues));
100+
hpaData.gene2Reliability=sparse(I,K,N,numel(hpaData.genes),numel(hpaData.tissues));
101+
end

0 commit comments

Comments
 (0)