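Update vcf_collect script to be compatible with references of 3.0.1: take transcript_version from the GTF dataframe instead of parsing it from CDS_LEFT_ID/CDS_RIGHT_ID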

rannick · rannick · commit 0c92b712a5d5 · 2025-11-18T16:01:43.000+01:00
diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
@@ -96,21 +96,23 @@ def vcf_collect(
         | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
     ]
 
-    all_df["Left_transcript_version"] = all_df["CDS_LEFT_ID"].astype(str).str.split(".").str[-1]
 
     all_df.replace("", np.nan, inplace=True)
     all_df = all_df.drop_duplicates()
 
-    all_df[["exon_number", "Left_transcript_version"]] = all_df[
-        ["exon_number", "Left_transcript_version"]
+
+    all_df[["exon_number", "transcript_version"]] = all_df[
+        ["exon_number", "transcript_version"]
     ].replace(0, np.nan)
+
     # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
     all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
         lambda x: x.fillna(method="ffill").fillna(method="bfill")
     )
-    all_df["Left_transcript_version"] = all_df.groupby("PosA")[
-        "Left_transcript_version"
+    all_df["transcript_version"] = all_df.groupby("PosA")[
+        "transcript_version"
     ].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
+    all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"})
 
     all_df = all_df.rename(columns={"exon_number": "Left_exon_number"})
     all_df = all_df[
@@ -162,20 +164,20 @@ def vcf_collect(
     all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
     all_df = all_df.replace("", np.nan)
 
-    all_df["Right_transcript_version"] = all_df["CDS_RIGHT_ID"].astype(str).str.split(".").str[-1]
 
 
-    all_df[["exon_number", "Right_transcript_version"]] = all_df[
-        ["exon_number", "Right_transcript_version"]
+    all_df[["exon_number", "transcript_version"]] = all_df[
+        ["exon_number", "transcript_version"]
     ].replace(0, np.nan)
     # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
     all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
         lambda x: x.fillna(method="ffill").fillna(method="bfill")
     )
-    all_df["Right_transcript_version"] = all_df.groupby("PosB")[
-        "Right_transcript_version"
+    all_df["transcript_version"] = all_df.groupby("PosB")[
+        "transcript_version"
     ].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
 
+    all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"})
     all_df = all_df.rename(columns={"exon_number": "Right_exon_number"})
 
     all_df = all_df[
@@ -589,7 +591,7 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
         "orig_coord_info"
     ].str.split(",", expand=True)
     return df[
-        ["Transcript_id", "exon_number", "orig_start", "orig_end"]
+        ["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]
     ]