ENH update 06_Compare_with_other_datasets

cocodyq · cocodyq · commit 073dab7cbefe · 2024-05-16T15:41:15.000+08:00
diff --git a/General_Scripts/06_Compare_with_other_datasets/Readme.md b/General_Scripts/06_Compare_with_other_datasets/Readme.md
@@ -2,5 +2,6 @@
 
 | **Code** | **Description** |
 | :---: | :---: |
-| 01_download.sh 02_filter_sp_dedup.py | Download archaeal and bacterial proteins from Refseq, filter sequences (<100aa) and remove redundancy | 
+| 01_download.sh | Download archaeal and bacterial proteins from Refseq | 
+| 02_filter_sp_dedup.py | Filter sequences (<100aa) and remove redundancy | 
 | 03_align.sh | Use Diamond to align sequences to GMSC | 
diff --git a/General_Scripts/06_Compare_with_other_datasets/fasta.py b/General_Scripts/06_Compare_with_other_datasets/fasta.py
@@ -0,0 +1,28 @@
+def fasta_iter(fname, full_header=False):  # generator: yields (header, sequence) tuples parsed from the file at fname
+    header = None  # stays None until the first '>' line is seen
+    chunks = []  # stripped sequence lines of the record being read
+    if fname.endswith('.gz'):  # choose the opener from the filename suffix
+        import gzip
+        op = gzip.open
+    elif fname.endswith('.xz'):
+        import lzma
+        op = lzma.open
+    else:
+        op = open  # any other suffix is opened with the builtin open
+    with op(fname, 'rt') as f:  # 'rt' = text mode, used for all three openers
+        for line in f:
+            if line[0] == '>':  # a line starting with '>' begins a new record
+                if header is not None:
+                    yield header,''.join(chunks)  # emit the record that just ended
+                line = line[1:].strip()  # drop the '>' and surrounding whitespace
+                if not line:
+                    header = ''  # bare '>' line: the header is the empty string
+                elif full_header:
+                    header = line.strip()  # full_header=True keeps the whole header line
+                else:
+                    header = line.split()[0]  # default: keep only the first whitespace-delimited token
+                chunks = []  # start collecting a fresh sequence
+            else:
+                chunks.append(line.strip())  # lines seen before any '>' are collected here but never yielded
+        if header is not None:
+            yield header, ''.join(chunks)  # emit the last record once the file is exhausted