Skip to content

Commit aa7f72e

Browse files
committed
update Filter
1 parent b11f13f commit aa7f72e

File tree

1 file changed

+11
-15
lines changed

1 file changed

+11
-15
lines changed

bin/Filter.py

Lines changed: 11 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -164,34 +164,30 @@ def filter_gff(gff_data, keep):
164164
return gff_keep, gff_discard
165165

166166
def getGeneId(attr):
167-
# Handle various ID formats, not just FILTER prefix
168-
m = re.search(r"ID=([A-Za-z_]+\d+)(\.t[0-9]+)?(\.[a-zA-Z\d\_\.]+)?;", attr)
167+
# Extract ID from attribute string - handles IDs with path prefixes like "results/FILTER00000010.t1"
168+
m = re.search(r"ID=([^;\s]+)", attr)
169169
if not m:
170170
return None
171-
match = m.group(1)
172-
try:
173-
return(match + m.group(2)) if m.group(2) else match
174-
except:
175-
return match
171+
return m.group(1)
176172

177173
def getParent(attr):
178-
m = re.search("Parent=[a-zA-Z\d\._-]*;", attr)
174+
# Extract Parent from attribute string - handles IDs with path prefixes
175+
m = re.search(r"Parent=([^;\s]+)", attr)
179176
if m:
180-
m = m.group(0).replace("Parent=", "").replace(";", "")
181-
else:
182-
m = None
183-
return(m)
177+
return m.group(1)
178+
return None
184179

185180
def checkKeep(id, names):
186181
val = sum(names.str.match(id)) >= 1
187182
return(val)
188183

189184
def formatKeepIDs(id):
190-
# Handle various ID formats - extract base gene ID before any suffix
185+
# Extract base gene ID (without transcript suffix like .t1)
191186
if pd.isna(id):
192187
return id
193-
# Try to match pattern like "PREFIX123" or "PREFIX123.1"
194-
match = re.search(r"^([A-Za-z_]+\d+)", str(id))
188+
id_str = str(id)
189+
# Remove transcript suffix if present (e.g., ".t1", ".t2")
190+
match = re.search(r"^(.+?)(?:\.t\d+)?$", id_str)
195191
if match:
196192
return match.group(1)
197193
return id

0 commit comments

Comments
 (0)