@@ -96,21 +96,23 @@ def vcf_collect(
9696 | ((all_df ["orig_start" ] == 0 ) & (all_df ["orig_end" ] == 0 ))
9797 ]
9898
99- all_df ["Left_transcript_version" ] = all_df ["CDS_LEFT_ID" ].astype (str ).str .split ("." ).str [- 1 ]
10099
101100 all_df .replace ("" , np .nan , inplace = True )
102101 all_df = all_df .drop_duplicates ()
103102
104- all_df [["exon_number" , "Left_transcript_version" ]] = all_df [
105- ["exon_number" , "Left_transcript_version" ]
103+
104+ all_df [["exon_number" , "transcript_version" ]] = all_df [
105+ ["exon_number" , "transcript_version" ]
106106 ].replace (0 , np .nan )
107+
107108 # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
108109 all_df ["exon_number" ] = all_df .groupby ("PosA" )["exon_number" ].transform (
109110 lambda x : x .fillna (method = "ffill" ).fillna (method = "bfill" )
110111 )
111- all_df ["Left_transcript_version " ] = all_df .groupby ("PosA" )[
112- "Left_transcript_version "
112+ all_df ["transcript_version " ] = all_df .groupby ("PosA" )[
113+ "transcript_version "
113114 ].transform (lambda x : x .fillna (method = "ffill" ).fillna (method = "bfill" ))
115+ all_df = all_df .rename (columns = {"transcript_version" : "Left_transcript_version" })
114116
115117 all_df = all_df .rename (columns = {"exon_number" : "Left_exon_number" })
116118 all_df = all_df [
@@ -162,20 +164,20 @@ def vcf_collect(
162164 all_df [["PosA" , "PosB" ]] = all_df [["PosA" , "PosB" ]].replace (0 , np .nan )
163165 all_df = all_df .replace ("" , np .nan )
164166
165- all_df ["Right_transcript_version" ] = all_df ["CDS_RIGHT_ID" ].astype (str ).str .split ("." ).str [- 1 ]
166167
167168
168- all_df [["exon_number" , "Right_transcript_version " ]] = all_df [
169- ["exon_number" , "Right_transcript_version " ]
169+ all_df [["exon_number" , "transcript_version " ]] = all_df [
170+ ["exon_number" , "transcript_version " ]
170171 ].replace (0 , np .nan )
171172 # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
172173 all_df ["exon_number" ] = all_df .groupby ("PosB" )["exon_number" ].transform (
173174 lambda x : x .fillna (method = "ffill" ).fillna (method = "bfill" )
174175 )
175- all_df ["Right_transcript_version " ] = all_df .groupby ("PosB" )[
176- "Right_transcript_version "
176+ all_df ["transcript_version " ] = all_df .groupby ("PosB" )[
177+ "transcript_version "
177178 ].transform (lambda x : x .fillna (method = "ffill" ).fillna (method = "bfill" ))
178179
180+ all_df = all_df .rename (columns = {"transcript_version" : "Right_transcript_version" })
179181 all_df = all_df .rename (columns = {"exon_number" : "Right_exon_number" })
180182
181183 all_df = all_df [
@@ -589,7 +591,7 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
589591 "orig_coord_info"
590592 ].str .split ("," , expand = True )
591593 return df [
592- ["Transcript_id" , "exon_number" , "orig_start" , "orig_end" ]
594+ ["Transcript_id" , "transcript_version" , " exon_number" , "orig_start" , "orig_end" ]
593595 ]
594596
595597
0 commit comments