Skip to content

Commit 0ce63eb

Browse files
committed
merge snp2h5 changes from development branch
1 parent 3999794 commit 0ce63eb

File tree

8 files changed

+68
-58
lines changed

8 files changed

+68
-58
lines changed

snp2h5/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Edit the following variables as needed
2-
HDF_INSTALL=$(HOME)/anaconda
2+
HDF_INSTALL=$(HOME)/anaconda2
33
EXTLIB=-L$(HDF_INSTALL)/lib
44
CC=gcc
55
LIB=-lz -lm -lhdf5 -lhdf5_hl

snp2h5/err.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ void my_err(const char *format, ...) {
2020
}
2121

2222
va_end(args);
23-
abort();
23+
exit(2);
2424
}
2525

2626

snp2h5/snp.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
#define __SNP_H__
33

44

5-
#define SNP_MAX_ALLELE 32
6-
#define SNP_MAX_CHROM 32
5+
/* maximum sequence length of alleles (longer are truncated in snp_tab) */
6+
#define SNP_MAX_ALLELE 100
7+
8+
/* maximum length of chromosome name */
9+
/* #define SNP_MAX_CHROM 32 */
10+
/* maximum length of SNP identifier */
711
#define SNP_MAX_NAME 16
812

913

1014
typedef struct {
1115
char name[SNP_MAX_NAME];
12-
char chrom[SNP_MAX_CHROM];
16+
/* char chrom[SNP_MAX_CHROM]; */
1317
long pos;
1418
char allele1[SNP_MAX_ALLELE];
1519
char allele2[SNP_MAX_ALLELE];

snp2h5/snp2h5.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,8 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom,
658658
/* open file, count number of lines */
659659
fprintf(stderr, "reading from file %s\n", imp_files[i]);
660660
gzf = util_must_gzopen(args->input_files[i], "rb");
661-
set_file_info(gzf, args->input_files[i], args, &file_info, NULL, impute_info);
661+
set_file_info(gzf, args->input_files[i], args, &file_info, NULL,
662+
impute_info);
662663

663664
/* initialize output files and memory to hold genotypes,
664665
* haplotypes, etc
@@ -813,7 +814,7 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom,
813814

814815
line_num += 1;
815816

816-
if((line_num % 1000) == 0) {
817+
if((line_num % 10000) == 0) {
817818
fprintf(stderr, ".");
818819
}
819820
}
@@ -946,10 +947,9 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom,
946947

947948
fprintf(stderr, "parsing file and writing to HDF5 files\n");
948949

949-
950950
while(vcf_read_line(gzf, vcf, &snp,
951951
geno_probs, haplotypes) != -1) {
952-
952+
953953
if(geno_probs) {
954954
write_h5matrix_row(gprob_info, row, geno_probs);
955955
}
@@ -975,7 +975,7 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom,
975975
if(snp_tab) {
976976
snp_tab_append_row(snp_tab, &snp);
977977
}
978-
978+
979979
row++;
980980
if((row % 1000) == 0) {
981981
fprintf(stderr, ".");
@@ -1030,6 +1030,8 @@ int main(int argc, char **argv) {
10301030

10311031
/* read chromosomes */
10321032
all_chroms = chrom_read_file(args.chrom_file, &n_chrom);
1033+
1034+
fprintf(stderr, "long alleles will be truncated to %dbp\n", SNP_MAX_ALLELE);
10331035

10341036
/* create new HDF5 file(s) */
10351037
if(args.geno_prob_file) {

snp2h5/snptab.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,22 +35,30 @@ SNPTab *snp_tab_new(hid_t h5file, const char *chrom_name,
3535
/* set datatypes for each field */
3636
tab->name_type = H5Tcopy(H5T_C_S1);
3737
H5Tset_size(tab->name_type, SNP_MAX_NAME);
38+
/* no longer store chromosome as each chromosome
39+
* gets its own table
40+
/* tab->chrom_type = H5Tcopy(H5T_C_S1);
41+
* H5Tset_size(tab->chrom_type, SNP_MAX_CHROM);
42+
*/
3843
tab->allele_type = H5Tcopy(H5T_C_S1);
39-
H5Tset_size(tab->allele_type, SNP_MAX_ALLELE);
44+
H5Tset_size(tab->allele_type, SNP_MAX_ALLELE);
4045
tab->field_type[0] = tab->name_type; /* name */
46+
/* tab->field_type[1] = tab->chrom_type; */ /* chromosome */
4147
tab->field_type[1] = H5T_NATIVE_LONG; /* pos */
4248
tab->field_type[2] = tab->allele_type; /* allele1 */
4349
tab->field_type[3] = tab->allele_type; /* allele2 */
4450

4551
/* sizes of record and each field */
46-
tab->record_size = sizeof(SNPTab);
52+
tab->record_size = sizeof(SNP);
4753
tab->field_size[0] = sizeof(snp_desc.name);
54+
/* tab->field_size[1] = sizeof(snp_desc.chrom); */
4855
tab->field_size[1] = sizeof(snp_desc.pos);
4956
tab->field_size[2] = sizeof(snp_desc.allele1);
5057
tab->field_size[3] = sizeof(snp_desc.allele2);
51-
58+
5259
/* offsets of each field */
5360
tab->field_offset[0] = HOFFSET(SNP, name);
61+
/* tab->field_offset[1] = HOFFSET(SNP, chrom); */
5462
tab->field_offset[1] = HOFFSET(SNP, pos);
5563
tab->field_offset[2] = HOFFSET(SNP, allele1);
5664
tab->field_offset[3] = HOFFSET(SNP, allele2);

snp2h5/snptab.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,17 @@
66
#include "snp.h"
77

88

9-
#define SNPTAB_NFIELDS 5
109
#define SNPTAB_N_FIELDS 4
1110

1211
/* chunk size affects performance a lot
1312
* small chunks = much faster writing of tables, but
1413
* worse compression
1514
*/
15+
1616
#define SNPTAB_CHUNK_SIZE 12
1717

1818

19+
1920
/* SNPTab holds information about SNP table
2021
* and datatypes of each field.
2122
*/
@@ -30,6 +31,7 @@ typedef struct {
3031
size_t field_offset[SNPTAB_N_FIELDS];
3132

3233
hid_t name_type;
34+
hid_t chrom_type;
3335
hid_t allele_type;
3436

3537
int compress;

snp2h5/util.c

Lines changed: 17 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -71,18 +71,16 @@ char *util_read_entire_file(char *filename) {
7171
* before and after the count of newlines.
7272
*/
7373
long util_fcount_lines(FILE *fh) {
74-
char buf[UTIL_FGETS_BUF_SZ];
7574
long line_count;
76-
int len;
75+
char c;
7776

7877
if(fseek(fh, 0L, SEEK_SET) != 0) {
7978
my_err("%s:%d: could not rewind filehandle", __FILE__, __LINE__);
8079
}
8180

8281
line_count = 0;
83-
while(fgets(buf, UTIL_FGETS_BUF_SZ, fh) != NULL) {
84-
len = strlen(buf);
85-
if(buf[len-1] == '\n') {
82+
while((c = getc(fh)) != EOF) {
83+
if(c == '\n') {
8684
line_count++;
8785
}
8886
}
@@ -95,6 +93,9 @@ long util_fcount_lines(FILE *fh) {
9593
}
9694

9795

96+
97+
98+
9899
/**
99100
* write a subset of lines which are flagged as TRUE in the provided
100101
* array (that should have a length corresponding to the number lines
@@ -151,6 +152,7 @@ long util_count_lines(const char *filename) {
151152
return line_count;
152153
}
153154

155+
154156
/**
155157
* Counts the number of '\n' characters in the provided gzipped file.
156158
* The gzFile is rewound to the beginning of the file before and after the count of
@@ -572,40 +574,23 @@ char *util_str_ndup(const char *str, size_t n) {
572574
* This function still gives no indication that src string is
573575
* truncated, however.
574576
*/
575-
size_t util_strncpy(char *dest, const char *src, size_t n) {
576-
size_t i, len;
577-
int src_end;
578577

579-
len = 0;
580-
src_end = FALSE;
581-
582-
for(i = 0; i < n-1; i++) {
583-
if(src_end) {
584-
/* Continue NULL padding to end of dest buf with '\0'.
585-
* This is to have consistent behaviour with strncpy and also
586-
* HDF5 API seems to expect this. WHen HDF5 string written
587-
* with non-null chars past terminating '\0', entire length of
588-
* string buffer is printed by pytables (including unprintable
589-
* binary chars) when string retrieved.
590-
*/
591-
dest[i] = '\0';
592-
} else {
578+
size_t util_strncpy(char *dest, const char *src, size_t n) {
579+
size_t i = 0;
580+
if(n > 0) {
581+
while((i < n-1) && (src[i] != '\0')) {
593582
dest[i] = src[i];
594-
if(src[i] == '\0') {
595-
/* hit end of src string */
596-
src_end = TRUE;
597-
} else {
598-
len += 1;
599-
}
583+
i++;
600584
}
585+
/* padd to end with \0 */
586+
memset(&dest[i], '\0', n-i);
587+
/* dest[i] = '\0'; */
601588
}
602-
603-
dest[n-1] = '\0';
604-
return len;
589+
590+
return i;
605591
}
606592

607593

608-
609594
/**
610595
* Does an in-place uppercase of all characters in a string
611596
*/

snp2h5/vcf.c

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,11 @@ int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp,
350350
if(token == NULL) {
351351
my_err("expected at least %d tokens per line\n", n_fix_header);
352352
}
353-
util_strncpy(snp->chrom, token, sizeof(snp->chrom));
353+
354+
/* we don't bother to store chromosome since we store
355+
* SNPs from each chromosome in their own table
356+
*/
357+
/* util_strncpy(snp->chrom, token, sizeof(snp->chrom)); */
354358

355359

356360
/* pos */
@@ -372,26 +376,31 @@ int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp,
372376
if(token == NULL) {
373377
my_err("expected at least %d tokens per line\n", n_fix_header);
374378
}
375-
vcf_info->ref_len = strlen(token);
376379
ref_len = util_strncpy(snp->allele1, token, sizeof(snp->allele1));
377380

378-
if(ref_len != vcf_info->ref_len) {
379-
my_warn("truncating long allele (%ld bp) to %ld bp\n",
380-
vcf_info->ref_len, ref_len);
381-
}
381+
/* used to warn about truncations, but makes program too
382+
* chatty if there are a lot of them
383+
*/
384+
vcf_info->ref_len = 0;
385+
/* vcf_info->ref_len = strlen(token); */
386+
/* if(ref_len != vcf_info->ref_len) { */
387+
/* my_warn("truncating long allele (%ld bp) to %ld bp\n", */
388+
/* vcf_info->ref_len, ref_len); */
389+
/* } */
382390

383391
/* alt */
384392
token = strsep(&cur, delim);
385393
if(token == NULL) {
386394
my_err("expected at least %d tokens per line\n", n_fix_header);
387395
}
388-
vcf_info->alt_len = strlen(token);
389396
alt_len = util_strncpy(snp->allele2, token, sizeof(snp->allele2));
390-
391-
if(alt_len != vcf_info->alt_len) {
392-
my_warn("truncating long allele (%ld bp) to %ld bp\n",
393-
vcf_info->alt_len, alt_len);
394-
}
397+
398+
vcf_info->alt_len = 0;
399+
/* vcf_info->alt_len = strlen(token); */
400+
/* if(alt_len != vcf_info->alt_len) { */
401+
/* my_warn("truncating long allele (%ld bp) to %ld bp\n", */
402+
/* vcf_info->alt_len, alt_len); */
403+
/* } */
395404

396405
/* qual */
397406
token = strsep(&cur, delim);

0 commit comments

Comments
 (0)