Resolve comments

YuWei-CH · YuWei-CH · commit eccc15cdab99 · 2025-04-21T20:25:36.000-04:00
diff --git a/reform.py b/reform.py
@@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 			else:
 				SeqIO.write([chrom_seqs[s]], f, "fasta")
 	## Read in new GFF features from in_gff, False means modify existing chrom
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
+	in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], existing_chrom=in_arg.chrom, new_chrom=None)
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -157,7 +157,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 		SeqIO.write([new_record], f, "fasta")
 	## Read in new GFF features from in_gff
 	## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq))
+	in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq))
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -190,6 +190,19 @@ def read_fasta(in_arg, index, prev_fasta_path):
 			raise FileNotFoundError(f"Error: File {filename_fa} does not exist.")
 		real_path_fa = os.path.realpath(filename_fa)
 		record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
+		# Check for mismatch between FASTA record ID and command line chromosome name
+		if hasattr(in_arg, 'new_chrom') and in_arg.new_chrom is not None:
+			if record.id != in_arg.new_chrom[index]:
+				print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
+                      f"and command line parameter ({in_arg.new_chrom[index]}).")
+				print(f"Using command line chromosome name: {in_arg.new_chrom[index]}")
+                # The actual override happens in add_new_chrom_seq where a new SeqRecord is created
+		elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
+			if record.id != in_arg.chrom:
+				print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
+					  f"and command line parameter ({in_arg.chrom}).")
+				print(f"Using command line chromosome name: {in_arg.chrom}")
+				# The actual override happens in modify_existing_chrom_seq where the existing sequence is modified
 	except IndexError:
 		raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
 	except Exception as e:
@@ -289,7 +302,7 @@ def valid_gff_line(line_elements):
 			return False
 	return True
 
-def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
+def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_length=None):
 	'''
 	Takes a gff file and returns a list of lists where 
 	each parent list item is a single line of the gff file
@@ -303,7 +316,7 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
 				continue
 			
 			# Handle differently based on whether we're adding a new chromosome or modifying existing
-			if line.startswith("##sequence-region"):
+			if line.startswith("##sequence-region") and new_chrom is not None:
 				## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome.
 				## Select user used delimiter based on content
 				if '\t' in line:
@@ -336,6 +349,11 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
 			else:
 				## Split, check and add feature lines
 				line_elements = line.split('\t')
+				chorme_id = existing_chrom if existing_chrom else new_chrom
+				if line_elements[0] != chorme_id:
+					print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.")
+					print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}")
+					line_elements[0] = chorme_id
 				if not valid_gff_line(line_elements):
 					exit()
 			in_gff_lines.append(line_elements)
@@ -667,6 +685,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
 	"""
 	Appends new annotations to an existing GFF file without modifying existing features.
 	"""
+	gff_splitor = ''
 	ref_gff_path = ref_gff
 	## Handle compressed .gz GFF files
 	if ref_gff.endswith('.gz'):
@@ -681,15 +700,23 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
         ## Copy all existing annotations to new GFF file
 		with open(ref_gff_path, "r", encoding="utf-8") as f:
 			for line in f:
+				if line.startswith("##sequence-region") and gff_splitor == '':
+					## Select user used delimiter based on content
+					if '\t' in line:
+						gff_splitor = ('\t')
+					else:
+						gff_splitor = (' ')
 				gff_out.write(line)
 		## Append new annotations if present
 		if in_gff_lines:
 			print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.")
 			for new_annotation in in_gff_lines:
 				if new_annotation[0] == "##sequence-region":
 					## Use predefined format for sequence-region line
+					if gff_splitor == '':
+						gff_splitor = new_annotation[-1]
 					## Remove format indicator, and add new line
-					gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n')
+					gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n')
 				elif new_annotation:
 					gff_out.write("\t".join(new_annotation))
 	
diff --git a/test_data/19/gold.fa b/test_data/19/gold.fa
@@ -0,0 +1,8 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
+>Y
+AAAATTTTGGGGCCCC
+>H
+GGGGAATTCCCCGGGG
+>M
+CCCCGGGGAAAATTTT
diff --git a/test_data/19/gold.gtf b/test_data/19/gold.gtf
@@ -0,0 +1,18 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+##sequence-region Y 1 16
+Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+##sequence-region H 1 16
+H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	stop_codon	14	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+M	ref	exon	1	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	CDS	4	14	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	start_codon	1	3	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	stop_codon	14	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
diff --git a/test_data/19/in1.fa b/test_data/19/in1.fa
@@ -0,0 +1,2 @@
+>Z
+AAAATTTTGGGGCCCC
diff --git a/test_data/19/in1.gtf b/test_data/19/in1.gtf
@@ -0,0 +1,5 @@
+##sequence-region Y.11 1 16
+Z	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Z	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Z	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Z	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/19/in2.fa b/test_data/19/in2.fa
@@ -0,0 +1,2 @@
+>Y
+GGGGAATTCCCCGGGG
diff --git a/test_data/19/in2.gtf b/test_data/19/in2.gtf
@@ -0,0 +1,5 @@
+##sequence-region F.11 3 27
+H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	stop_codon	14	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
diff --git a/test_data/19/in3.fa b/test_data/19/in3.fa
@@ -0,0 +1,2 @@
+>M
+CCCCGGGGAAAATTTT
diff --git a/test_data/19/in3.gtf b/test_data/19/in3.gtf
@@ -0,0 +1,5 @@
+##Test Data
+M	ref	exon	1	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+K	ref	CDS	4	14	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+Z	ref	start_codon	1	3	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	stop_codon	14	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
diff --git a/test_data/19/ref.fa b/test_data/19/ref.fa
@@ -0,0 +1,2 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
diff --git a/test_data/19/ref.gtf b/test_data/19/ref.gtf
@@ -0,0 +1,4 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
diff --git a/test_reform.py b/test_reform.py
@@ -762,5 +762,45 @@ def test_case_18(self):
 
 		os.chdir(wd)
 
+	def test_case_19(self):
+		"""
+		Case 19:
+		Testing reform with mismatched chromosome names and comment lines in the
+		input FASTA and annotation files. Also tests the more formal warning printing.
+		"""
+
+		wd = os.getcwd()
+		os.chdir('test_data/19/')
+
+		command = """
+		python3 ../../reform.py \
+		--new_chrom="Y,H,M" \
+		--in_fasta=in1.fa,in2.fa,in3.fa \
+		--in_gff=in1.gtf,in2.gtf,in3.gtf \
+		--ref_fasta=ref.fa \
+		--ref_gff=ref.gtf
+		"""
+
+		response = subprocess.getoutput(command)
+		print(response)
+
+		with open('gold.gtf', 'r') as f:
+			gold_gff = f.read()
+		with open('ref_reformed.gtf', 'r') as f:
+			new_gff = f.read()
+		print("Testing gtf")
+		self.assertListEqual(list(gold_gff), list(new_gff))
+		print("Done")
+
+		with open('gold.fa', 'r') as f:
+			gold_fa = f.read()
+		with open('ref_reformed.fa', 'r') as f:
+			new_fa = f.read()
+		print("Testing Fasta")
+		self.assertListEqual(list(gold_fa), list(new_fa))
+		print("Done")
+
+		os.chdir(wd)
+
 if __name__ == '__main__':
     unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+>X`
	`2`	`+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK`