Use the pandas-based FAM reader

jeromekelleher · jeromekelleher · commit ac9f1d942ab2 · 2025-05-22T21:29:01.000+01:00
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -14,7 +14,7 @@
 
 FAM_FIELDS = [
     ("family_id", str, "U"),
-    ("member_id", str, "U"),
+    ("individual_id", str, "U"),
     ("paternal_id", str, "U"),
     ("maternal_id", str, "U"),
     ("sex", str, "int8"),
@@ -36,20 +36,16 @@
 
 
 def read_fam(path, sep=None):
-    if sep is None:
-        sep = " "
     # See: https://www.cog-genomics.org/plink/1.9/formats#fam
     names = [f[0] for f in FAM_FIELDS]
-    return pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+    df = pd.read_csv(path, sep=sep, names=names, dtype=FAM_DF_DTYPE)
+    return df
 
 
 def read_bim(path, sep=None):
-    if sep is None:
-        sep = "\t"
     # See: https://www.cog-genomics.org/plink/1.9/formats#bim
     names = [f[0] for f in BIM_FIELDS]
     df = pd.read_csv(str(path), sep=sep, names=names, dtype=BIM_DF_DTYPE)
-    # df["contig"] = df["contig"].where(df["contig"] != "0", None)
     return df
 
 
@@ -78,28 +74,21 @@ def __init__(self, prefix):
             self.prefix + ".fam",
         )
 
-        # Read sample information from .fam file
-        samples = []
-        with open(self.paths.fam_path) as f:
-            for line in f:
-                fields = line.strip().split()
-                if len(fields) >= 2:  # At minimum, we need FID and IID
-                    samples.append(fields[1])
-        self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
-        self.n_samples = len(samples)
-
         self.bim = read_bim(self.paths.bim_path)
-        self.n_variants = self.bim.shape[0]
+        self.fam = read_fam(self.paths.fam_path)
+
+        self._num_records = self.bim.shape[0]
+        self._num_samples = self.fam.shape[0]
 
         # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
-        self.bytes_per_snp = (self.n_samples + 3) // 4
+        self.bytes_per_snp = (self._num_samples + 3) // 4
 
         # Verify BED file has correct magic bytes
         with open(self.paths.bed_path, "rb") as f:
             magic = f.read(3)
             assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
 
-        expected_size = self.n_variants * self.bytes_per_snp + 3  # +3 for magic bytes
+        expected_size = self.num_records * self.bytes_per_snp + 3  # +3 for magic bytes
         actual_size = os.path.getsize(self.paths.bed_path)
         if actual_size < expected_size:
             raise ValueError(
@@ -144,20 +133,20 @@ def path(self):
 
     @property
     def num_records(self):
-        return self.n_variants
+        return self._num_records
+
+    @property
+    def num_samples(self):
+        return self._num_samples
 
     @property
     def samples(self):
-        return [vcz.Sample(id=sample) for sample in self.fam.sid]
+        return [vcz.Sample(id=iid) for iid in self.fam.individual_id]
 
     @property
     def contigs(self):
         return [vcz.Contig(id=str(chrom)) for chrom in self.bim.contig.unique()]
 
-    @property
-    def num_samples(self):
-        return len(self.samples)
-
     def iter_contig(self, start, stop):
         chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
         for chrom in self.bim.contig[start:stop]:
@@ -198,9 +187,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
         samples_padded = self.bytes_per_snp * 4
         genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
 
-        gt = genotypes_reshaped[:, : self.n_samples]
+        gt = genotypes_reshaped[:, : self._num_samples]
 
-        phased = np.zeros((chunk_size, self.n_samples), dtype=bool)
+        phased = np.zeros((chunk_size, self._num_samples), dtype=bool)
 
         for i, (ref, alt) in enumerate(
             zip(ref_field[start:stop], alt_field[start:stop])
@@ -217,7 +206,7 @@ def generate_schema(
         variants_chunk_size=None,
         samples_chunk_size=None,
     ):
-        n = self.fam.sid_count
+        n = self.num_samples
         m = self.num_records
         logging.info(f"Scanned plink with {n} samples and {m} variants")
         dimensions = vcz.standard_dimensions(
diff --git a/tests/test_plink.py b/tests/test_plink.py
@@ -24,6 +24,31 @@ def test_example(self):
         nt.assert_array_equal(df["allele_2"].values, ["GG", "C"])
 
 
+class TestReadFam:
+    def test_example(self):
+        path = "tests/data/plink/example.fam"
+        df = plink.read_fam(path)
+        # FID IID FATHER MOTHER SEX PHENOTYPE
+        # ind0 ind0 0 0 0 -9
+        # ind1 ind1 0 0 0 -9
+        # ind2 ind2 0 0 0 -9
+        # ind3 ind3 0 0 0 -9
+        # ind4 ind4 0 0 0 -9
+        # ind5 ind5 0 0 0 -9
+        # ind6 ind6 0 0 0 -9
+        # ind7 ind7 0 0 0 -9
+        # ind8 ind8 0 0 0 -9
+        # ind9 ind9 0 0 0 -9
+        nt.assert_array_equal(df["family_id"].values, [f"ind{j}" for j in range(10)])
+        nt.assert_array_equal(
+            df["individual_id"].values, [f"ind{j}" for j in range(10)]
+        )
+        nt.assert_array_equal(df["paternal_id"].values, ["0" for j in range(10)])
+        nt.assert_array_equal(df["maternal_id"].values, ["0" for j in range(10)])
+        nt.assert_array_equal(df["sex"].values, ["0" for j in range(10)])
+        nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)])
+
+
 class TestSmallExample:
     @pytest.fixture(scope="class")
     def bed_path(self, tmp_path_factory):