Skip to content

Commit 0c92b71

Browse files
committed
Update the vcf_collect script to be compatible with the 3.0.1 references
1 parent 9fcbad9 commit 0c92b71

File tree

1 file changed

+13
-11
lines changed

1 file changed

+13
-11
lines changed

bin/vcf_collect.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,21 +96,23 @@ def vcf_collect(
9696
| ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
9797
]
9898

99-
all_df["Left_transcript_version"] = all_df["CDS_LEFT_ID"].astype(str).str.split(".").str[-1]
10099

101100
all_df.replace("", np.nan, inplace=True)
102101
all_df = all_df.drop_duplicates()
103102

104-
all_df[["exon_number", "Left_transcript_version"]] = all_df[
105-
["exon_number", "Left_transcript_version"]
103+
104+
all_df[["exon_number", "transcript_version"]] = all_df[
105+
["exon_number", "transcript_version"]
106106
].replace(0, np.nan)
107+
107108
# Fill missing values within each group (forward-fill, then back-fill) for 'exon_number' and 'transcript_version'
108109
all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
109110
lambda x: x.fillna(method="ffill").fillna(method="bfill")
110111
)
111-
all_df["Left_transcript_version"] = all_df.groupby("PosA")[
112-
"Left_transcript_version"
112+
all_df["transcript_version"] = all_df.groupby("PosA")[
113+
"transcript_version"
113114
].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
115+
all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"})
114116

115117
all_df = all_df.rename(columns={"exon_number": "Left_exon_number"})
116118
all_df = all_df[
@@ -162,20 +164,20 @@ def vcf_collect(
162164
all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
163165
all_df = all_df.replace("", np.nan)
164166

165-
all_df["Right_transcript_version"] = all_df["CDS_RIGHT_ID"].astype(str).str.split(".").str[-1]
166167

167168

168-
all_df[["exon_number", "Right_transcript_version"]] = all_df[
169-
["exon_number", "Right_transcript_version"]
169+
all_df[["exon_number", "transcript_version"]] = all_df[
170+
["exon_number", "transcript_version"]
170171
].replace(0, np.nan)
171172
# Fill missing values within each group (forward-fill, then back-fill) for 'exon_number' and 'transcript_version'
172173
all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
173174
lambda x: x.fillna(method="ffill").fillna(method="bfill")
174175
)
175-
all_df["Right_transcript_version"] = all_df.groupby("PosB")[
176-
"Right_transcript_version"
176+
all_df["transcript_version"] = all_df.groupby("PosB")[
177+
"transcript_version"
177178
].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill"))
178179

180+
all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"})
179181
all_df = all_df.rename(columns={"exon_number": "Right_exon_number"})
180182

181183
all_df = all_df[
@@ -589,7 +591,7 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
589591
"orig_coord_info"
590592
].str.split(",", expand=True)
591593
return df[
592-
["Transcript_id", "exon_number", "orig_start", "orig_end"]
594+
["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]
593595
]
594596

595597

0 commit comments

Comments
 (0)