@@ -164,34 +164,30 @@ def filter_gff(gff_data, keep):
164164 return gff_keep , gff_discard
165165
def getGeneId(attr):
    """Return the value of the ``ID`` attribute from a GFF attribute string.

    The value is everything after ``ID=`` up to the next ``;`` or whitespace,
    so an ID carrying a path prefix (e.g. ``results/FILTER00000010.t1``) is
    returned whole, including any transcript suffix.

    Args:
        attr: Attribute column text, e.g. ``"ID=g1.t1;Parent=g1"``.

    Returns:
        The ID string, or ``None`` when ``attr`` is not a string or contains
        no ``ID`` attribute.
    """
    # Missing attribute cells (None / NaN) carry no ID; report "not found"
    # the same way as a string without an ID instead of raising TypeError.
    if not isinstance(attr, str):
        return None
    # Anchor the key at the start of the string or directly after a ';' so
    # keys that merely end in "ID" (e.g. "geneID=") are not mistaken for it.
    m = re.search(r"(?:^|;)\s*ID=([^;\s]+)", attr)
    return m.group(1) if m else None
176172
def getParent(attr):
    """Return the value of the ``Parent`` attribute from a GFF attribute string.

    The value is everything after ``Parent=`` up to the next ``;`` or
    whitespace, so parents carrying a path prefix are returned whole. A
    comma-separated list of parents is returned as one string, unsplit.

    Args:
        attr: Attribute column text, e.g. ``"ID=g1.t1;Parent=g1"``.

    Returns:
        The Parent string, or ``None`` when ``attr`` is not a string or
        contains no ``Parent`` attribute.
    """
    # Missing attribute cells (None / NaN) have no parent; report "not found"
    # rather than raising TypeError from the regex engine.
    if not isinstance(attr, str):
        return None
    # Anchor the key at the start of the string or directly after a ';' so a
    # longer key ending in "Parent" (e.g. "OrigParent=") is not matched.
    m = re.search(r"(?:^|;)\s*Parent=([^;\s]+)", attr)
    return m.group(1) if m else None
184179
def checkKeep(id, names):
    """Report whether at least one entry of ``names`` matches ``id``.

    ``id`` is handed to ``Series.str.match`` and is therefore treated as a
    regular expression anchored at the start of each name: a name that merely
    begins with ``id`` counts as a match, and regex metacharacters in ``id``
    (such as ``.``) keep their special meaning.

    Args:
        id: Pattern to look for at the start of each name.
        names: pandas Series of strings to test.

    Returns:
        ``True`` if any name matches, otherwise ``False``.
    """
    # na=False makes missing names count as "no match"; without it a single
    # NaN/None entry poisons the tally and hides genuine matches.
    return bool(names.str.match(id, na=False).any())
188183
def formatKeepIDs(id):
    """Return the base gene ID with a trailing transcript suffix removed.

    A suffix of the form ``.t<digits>`` at the very end of the ID (``.t1``,
    ``.t12``) is dropped; everything before it, including any path prefix,
    is kept. Surrounding whitespace is trimmed first.

    Args:
        id: A keep-list ID; may be a missing value (None / NaN).

    Returns:
        The ID as a string without the transcript suffix. Missing values, and
        values the pattern cannot match (e.g. an empty string), are returned
        unchanged.
    """
    if pd.isna(id):
        return id
    # Trim stray whitespace / carriage returns: left in place they sit after
    # the ".tN" suffix, so the end-anchored pattern would fail to strip it.
    id_str = str(id).strip()
    # Non-greedy prefix + optional ".tN" suffix anchored at the end of the ID.
    match = re.match(r"(.+?)(?:\.t\d+)?$", id_str)
    return match.group(1) if match else id
0 commit comments