Skip to content

Commit 23d7897

Browse files
Initial version of plink1.9-vcf command
1 parent 5aa8c01 commit 23d7897

File tree

11 files changed

+745
-7
lines changed

11 files changed

+745
-7
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ jobs:
5252
channels: conda-forge,bioconda
5353
- name: Install dependencies
5454
run: |
55-
conda install bcftools
55+
conda install bcftools plink
5656
python -m pip install --upgrade pip
5757
python -m pip install '.[dev]'
5858
# Build the extension module in-place so pytest can find it
@@ -158,7 +158,7 @@ jobs:
158158
channels: conda-forge,bioconda
159159
- name: Install dependencies
160160
run: |
161-
conda install bcftools
161+
conda install bcftools plink
162162
python -m pip install --upgrade pip
163163
python -m pip install '.[dev]'
164164
# Build the extension module in-place so pytest can find it
@@ -189,7 +189,7 @@ jobs:
189189
channels: conda-forge,bioconda
190190
- name: Install dependencies
191191
run: |
192-
conda install bcftools
192+
conda install bcftools plink
193193
python -m pip install --upgrade pip
194194
python -m pip install '.[dev]'
195195
# Build the extension module in-place so pytest can find it

lib/tests.c

Lines changed: 147 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -854,12 +854,153 @@ test_ftoa(void)
854854

855855
for (j = 0; j < sizeof(cases) / sizeof(*cases); j++) {
856856
len = vcz_ftoa(buf, cases[j].val);
857-
/* printf("j = %d %f->%s=='%s'\n", (int) j, cases[j].val, cases[j].expected, buf); */
857+
/* printf("j = %d %f->%s=='%s'\n", (int) j, cases[j].val, cases[j].expected,
858+
* buf); */
858859
CU_ASSERT_EQUAL_FATAL(len, strlen(cases[j].expected));
859860
CU_ASSERT_STRING_EQUAL_FATAL(buf, cases[j].expected);
860861
}
861862
}
862863

864+
static void
865+
test_encode_plink_single_genotype(void)
866+
{
867+
struct test_case {
868+
int8_t genotype[2];
869+
char expected;
870+
};
871+
// clang-format off
872+
struct test_case cases[] = {
873+
{{-1, -1}, VCZ_PLINK_MISSING},
874+
{{-2, -1}, VCZ_PLINK_MISSING},
875+
{{-1, -2}, VCZ_PLINK_MISSING},
876+
{{-2, -2}, VCZ_PLINK_MISSING},
877+
/* Unknown alleles are treated as missing */
878+
{{2, 2}, VCZ_PLINK_MISSING},
879+
{{-1, 2}, VCZ_PLINK_MISSING},
880+
{{2, -1}, VCZ_PLINK_MISSING},
881+
/* Half-calls are homozygous */
882+
{{0, -2}, VCZ_PLINK_HOM_A2},
883+
{{1, -2}, VCZ_PLINK_HOM_A1},
884+
/* Nominal cases */
885+
{{1, 0}, VCZ_PLINK_HET},
886+
{{0, 1}, VCZ_PLINK_HET},
887+
{{0, 0}, VCZ_PLINK_HOM_A2},
888+
{{1, 1}, VCZ_PLINK_HOM_A1},
889+
};
890+
// clang-format on
891+
size_t j;
892+
char buf;
893+
int8_t a12[] = { 1, 0 };
894+
895+
for (j = 0; j < sizeof(cases) / sizeof(*cases); j++) {
896+
vcz_encode_plink(1, 1, cases[j].genotype, a12, &buf);
897+
/* printf("j = %d [%d,%d]->%d==%d'\n", (int) j, cases[j].genotype[0], */
898+
/* cases[j].genotype[1], cases[j].expected, buf); */
899+
CU_ASSERT_EQUAL_FATAL(buf, cases[j].expected);
900+
}
901+
}
902+
903+
static void
904+
test_encode_plink_example(void)
905+
{
906+
907+
const size_t num_variants = 3;
908+
const size_t num_samples = 3;
909+
int j;
910+
// clang-format off
911+
int8_t genotypes[] = {
912+
0, 0, 0, 1, 0, 0,
913+
1, 0, 1, 1, 0, -2,
914+
1, 1, 0, 1, -1,-1,
915+
};
916+
// clang-format on
917+
int8_t a12[] = { 1, 0, 1, 0, 1, 0 };
918+
int8_t expected[] = { 59, 50, 24 };
919+
char buf[3];
920+
921+
vcz_encode_plink(num_variants, num_samples, genotypes, a12, buf);
922+
for (j = 0; j < 3; j++) {
923+
/* printf("%d\n", buf[j]); */
924+
CU_ASSERT_EQUAL_FATAL(buf[j], expected[j]);
925+
}
926+
}
927+
928+
static void
929+
test_encode_plink_single_genotype_vary_a12(void)
930+
{
931+
struct test_case {
932+
int8_t genotype[2];
933+
int8_t a12[2];
934+
char expected;
935+
};
936+
// clang-format off
937+
struct test_case cases[] = {
938+
{{-1, -1}, {0, 1}, VCZ_PLINK_MISSING},
939+
{{0, -2}, {0, 1}, VCZ_PLINK_HOM_A1},
940+
{{1, -2}, {0, 1}, VCZ_PLINK_HOM_A2},
941+
{{0, 0}, {1, 2}, VCZ_PLINK_MISSING},
942+
/* Allele 1 is missing */
943+
{{0, 0}, {-1, 0}, VCZ_PLINK_HOM_A2},
944+
{{1, 0}, {-1, 0}, VCZ_PLINK_MISSING},
945+
{{0, 1}, {-1, 0}, VCZ_PLINK_MISSING},
946+
{{1, 1}, {-1, 0}, VCZ_PLINK_MISSING},
947+
{{0, -1}, {-1, 0}, VCZ_PLINK_MISSING},
948+
{{-1, -1}, {-1, 0}, VCZ_PLINK_MISSING},
949+
/* Nominal cases */
950+
{{2, 3}, {2, 3}, VCZ_PLINK_HET},
951+
{{3, 2}, {2, 3}, VCZ_PLINK_HET},
952+
{{3, 3}, {2, 3}, VCZ_PLINK_HOM_A2},
953+
{{2, 2}, {2, 3}, VCZ_PLINK_HOM_A1},
954+
};
955+
// clang-format on
956+
size_t j;
957+
char buf;
958+
959+
for (j = 0; j < sizeof(cases) / sizeof(*cases); j++) {
960+
vcz_encode_plink(1, 1, cases[j].genotype, cases[j].a12, &buf);
961+
/* printf("j = %d [%d,%d]->%d==%d'\n", (int) j, cases[j].genotype[0], */
962+
/* cases[j].genotype[1], cases[j].expected, buf); */
963+
CU_ASSERT_EQUAL_FATAL(buf, cases[j].expected);
964+
}
965+
}
966+
967+
static void
968+
test_encode_plink_all_zeros_instance(size_t num_variants, size_t num_samples)
969+
{
970+
int8_t *genotypes = calloc(num_variants * num_samples, 2);
971+
int8_t *a12 = calloc(num_variants, 2);
972+
char *buf = malloc(num_variants * num_samples / 4);
973+
size_t j;
974+
975+
CU_ASSERT_FATAL(num_samples % 4 == 0);
976+
CU_ASSERT_FATAL(genotypes != NULL);
977+
CU_ASSERT_FATAL(a12 != NULL);
978+
CU_ASSERT_FATAL(buf != NULL);
979+
980+
for (j = 0; j < num_variants; j++) {
981+
a12[2 * j] = 1;
982+
}
983+
/* printf("variants=%d samples=%d\n", (int) num_variants, (int) num_samples); */
984+
vcz_encode_plink(num_variants, num_samples, genotypes, a12, buf);
985+
for (j = 0; j < num_variants * num_samples / 4; j++) {
986+
/* printf("%d %d\n", (int) j, buf[j]); */
987+
CU_ASSERT_EQUAL_FATAL(buf[j], -1);
988+
}
989+
990+
free(genotypes);
991+
free(a12);
992+
free(buf);
993+
}
994+
995+
static void
996+
test_encode_plink_all_zeros(void)
997+
{
998+
test_encode_plink_all_zeros_instance(1, 4);
999+
test_encode_plink_all_zeros_instance(10, 4);
1000+
test_encode_plink_all_zeros_instance(1, 400);
1001+
test_encode_plink_all_zeros_instance(100, 400);
1002+
}
1003+
8631004
/*=================================================
8641005
Test suite management
8651006
=================================================
@@ -972,6 +1113,11 @@ main(int argc, char **argv)
9721113
{ "test_itoa_pow10", test_itoa_pow10 },
9731114
{ "test_itoa_boundary", test_itoa_boundary },
9741115
{ "test_ftoa", test_ftoa },
1116+
{ "test_encode_plink_single_genotype", test_encode_plink_single_genotype },
1117+
{ "test_encode_plink_single_genotype_vary_a12",
1118+
test_encode_plink_single_genotype_vary_a12 },
1119+
{ "test_encode_plink_example", test_encode_plink_example },
1120+
{ "test_encode_plink_all_zeros", test_encode_plink_all_zeros },
9751121
{ NULL, NULL },
9761122
};
9771123
return test_main(tests, argc, argv);

lib/vcf_encoder.c

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ vcz_itoa(char *restrict buf, int64_t value)
6262
return p;
6363
}
6464

65-
6665
int
6766
vcz_ftoa(char *restrict buf, float value)
6867
{
@@ -998,3 +997,62 @@ vcz_variant_encoder_free(vcz_variant_encoder_t *self)
998997
self->format_fields = NULL;
999998
}
1000999
}
1000+
1001+
int
1002+
vcz_encode_plink(size_t num_variants, size_t num_samples, const int8_t *genotypes,
1003+
const int8_t *a12_allele, char *buf)
1004+
{
1005+
size_t j, k, variant_offset, byte_offset, bit_pos;
1006+
int8_t a, b, allele_1, allele_2, code;
1007+
int mask;
1008+
const size_t bytes_per_variant = (num_samples + 3) / 4;
1009+
1010+
variant_offset = 0;
1011+
1012+
memset(buf, 0, bytes_per_variant * num_variants);
1013+
1014+
for (j = 0; j < num_variants; j++) {
1015+
allele_1 = a12_allele[j * 2];
1016+
allele_2 = a12_allele[j * 2 + 1];
1017+
for (k = 0; k < num_samples; k++) {
1018+
code = VCZ_PLINK_MISSING;
1019+
a = genotypes[j * num_samples * 2 + k * 2];
1020+
b = genotypes[j * num_samples * 2 + k * 2 + 1];
1021+
if (b == -2) {
1022+
/* Treated as a haploid call by plink */
1023+
if (a == allele_1) {
1024+
code = VCZ_PLINK_HOM_A1;
1025+
} else if (a == allele_2) {
1026+
code = VCZ_PLINK_HOM_A2;
1027+
}
1028+
} else {
1029+
if (a == allele_1) {
1030+
if (b == allele_1) {
1031+
code = VCZ_PLINK_HOM_A1;
1032+
} else if (b == allele_2) {
1033+
code = VCZ_PLINK_HET;
1034+
}
1035+
} else if (a == allele_2) {
1036+
if (b == allele_2) {
1037+
code = VCZ_PLINK_HOM_A2;
1038+
} else if (b == allele_1) {
1039+
code = VCZ_PLINK_HET;
1040+
}
1041+
}
1042+
if ((allele_1 == -1)
1043+
&& (code == VCZ_PLINK_HOM_A1 || code == VCZ_PLINK_HET)) {
1044+
code = VCZ_PLINK_MISSING;
1045+
}
1046+
}
1047+
1048+
/* printf("a=%d b=%d a1=%d a2=%d code = %d\n", a, b, allele_1, allele_2,
1049+
* code); */
1050+
byte_offset = variant_offset + k / 4;
1051+
bit_pos = (k % 4) * 2;
1052+
mask = ~(0x3 << bit_pos);
1053+
buf[byte_offset] = (char) (buf[byte_offset] & mask) | (code << bit_pos);
1054+
}
1055+
variant_offset += bytes_per_variant;
1056+
}
1057+
return 0;
1058+
}

lib/vcf_encoder.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,11 @@ int64_t vcz_variant_encoder_encode(
109109

110110
int vcz_itoa(char *buf, int64_t v);
111111
int vcz_ftoa(char *buf, float v);
112+
113+
114+
#define VCZ_PLINK_HOM_A1 0x0 /* 00 */
115+
#define VCZ_PLINK_HOM_A2 0x3 /* 11 */
116+
#define VCZ_PLINK_HET 0x2 /* 10 */
117+
#define VCZ_PLINK_MISSING 0x1 /* 01 */
118+
int vcz_encode_plink(size_t num_variants, size_t num_samples, const int8_t *genotypes,
119+
const int8_t *a12_allele, char *buf);

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ indent-width = 4
7676
[tool.ruff.lint]
7777
select = ["E", "F", "B", "W", "I", "N", "UP", "A", "RUF", "PT"]
7878
#Allow uppercase names for e.g. call_AD
79-
ignore = ["N806", "N802", "A001", "A002"]
79+
ignore = ["N806", "N802", "N803", "A001", "A002"]
8080

8181
fixable = ["ALL"]
8282
unfixable = []

tests/test_cpython_interface.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,3 +453,83 @@ def test_attrs(self, name):
453453
encoder = cls.__new__(cls)
454454
with pytest.raises(SystemError, match="not initialised"):
455455
getattr(encoder, name)
456+
457+
458+
class TestEncodePlink:
459+
def test_bad_num_arguments(self):
460+
with pytest.raises(TypeError):
461+
_vcztools.encode_plink()
462+
with pytest.raises(TypeError):
463+
_vcztools.encode_plink(0)
464+
465+
@pytest.mark.parametrize("bad_type", [[], {}, "string", 4])
466+
def test_bad_types(self, bad_type):
467+
array = np.zeros(5, dtype=np.int8)
468+
with pytest.raises(TypeError):
469+
_vcztools.encode_plink(bad_type, array)
470+
with pytest.raises(TypeError):
471+
_vcztools.encode_plink(array, bad_type)
472+
473+
def test_wrong_genotype_dims(self):
474+
with pytest.raises(ValueError, match="genotypes has wrong dimension"):
475+
_vcztools.encode_plink(
476+
np.zeros((1, 1), dtype=np.int8), np.zeros((1, 2), dtype=np.int8)
477+
)
478+
479+
def test_wrong_a12_allele_dims(self):
480+
with pytest.raises(ValueError, match="a12_allele has wrong dimension"):
481+
_vcztools.encode_plink(
482+
np.zeros((1, 1, 2), dtype=np.int8), np.zeros((1, 2, 3), dtype=np.int8)
483+
)
484+
485+
@pytest.mark.parametrize("bad_dtype", [np.int16, np.int64, np.float64, "S1"])
486+
def test_bad_genotype_dtype(self, bad_dtype):
487+
with pytest.raises(ValueError, match="Wrong dtype for genotypes"):
488+
_vcztools.encode_plink(
489+
np.zeros((1, 1, 2), dtype=bad_dtype), np.zeros((1, 2), dtype=np.int8)
490+
)
491+
492+
@pytest.mark.parametrize("bad_dtype", [np.int16, np.int64, np.float64, "S1"])
493+
def test_bad_a12_allele_dtype(self, bad_dtype):
494+
with pytest.raises(ValueError, match="Wrong dtype for a12_allele"):
495+
_vcztools.encode_plink(
496+
np.zeros((1, 1, 2), dtype=np.int8), np.zeros((1, 2), dtype=bad_dtype)
497+
)
498+
499+
@pytest.mark.parametrize("bad_ploidy", [1, 3])
500+
def test_bad_ploidy(self, bad_ploidy):
501+
with pytest.raises(ValueError, match="Only diploid genotypes"):
502+
_vcztools.encode_plink(
503+
np.zeros((1, 1, bad_ploidy), dtype=np.int8),
504+
np.zeros((1, 2), dtype=np.int8),
505+
)
506+
507+
@pytest.mark.parametrize("num_variants", [1, 3])
508+
def test_num_variants_mismatch(self, num_variants):
509+
with pytest.raises(ValueError, match="same first dimension"):
510+
_vcztools.encode_plink(
511+
np.zeros((2, 1, 2), dtype=np.int8),
512+
np.zeros((num_variants, 2), dtype=np.int8),
513+
)
514+
515+
@pytest.mark.parametrize("num_cols", [1, 3])
516+
def test_bad_a12_allele_cols(self, num_cols):
517+
with pytest.raises(ValueError, match="exactly 2 columns"):
518+
_vcztools.encode_plink(
519+
np.zeros((2, 1, 2), dtype=np.int8),
520+
np.zeros((2, num_cols), dtype=np.int8),
521+
)
522+
523+
def test_example(self):
524+
G = np.array(
525+
[
526+
[[0, 0], [0, 1], [0, 0]],
527+
[[1, 0], [1, 1], [0, -2]],
528+
[[1, 1], [0, 1], [-1, -1]],
529+
],
530+
dtype=np.int8,
531+
)
532+
a12_allele = np.array([[1, 0], [1, 0], [1, 0]], dtype=np.int8)
533+
enc = _vcztools.encode_plink(G, a12_allele)
534+
assert enc.dtype == np.uint8
535+
assert list(enc) == [59, 50, 24]

0 commit comments

Comments
 (0)