Skip to content

Commit a774a53

Browse files
committed
Initial commit: peptide generator tool and GUI
0 parents  commit a774a53

12 files changed

+288
-0
lines changed

.DS_Store

6 KB
Binary file not shown.

.idea/.gitignore

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Context.md

Whitespace-only changes.

control_peptides.fasta

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
>peptide_1
2+
WIDQYDLIN
3+
>peptide_2
4+
LYMMFILNW
5+
>peptide_3
6+
IMADNPCTD
7+
>peptide_4
8+
WESQMAPLF
9+
>peptide_5
10+
ILLYWGLTQ
11+
>peptide_6
12+
NHFPTFHCL
13+
>peptide_7
14+
IGVTVYWLK
15+
>peptide_8
16+
MMDKIFEAW
17+
>peptide_9
18+
MPKKSLRTK
19+
>peptide_10
20+
QLDQIGLGS

data/.DS_Store

6 KB
Binary file not shown.

data/assembly_data_report.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"accession":"GCF_000001405.40","annotationInfo":{"busco":{"buscoLineage":"primates_odb10","buscoVer":"5.7.1","complete":0.9887518,"duplicated":0.009433962,"fragmented":0.0045718434,"missing":0.0066763423,"singleCopy":0.97931784,"totalCount":"13780"},"method":"Best-placed RefSeq; Gnomon; RefSeqFE; cmsearch; tRNAscan-SE","name":"GCF_000001405.40-RS_2024_08","pipeline":"NCBI eukaryotic genome annotation pipeline","provider":"NCBI RefSeq","releaseDate":"2024-08-23","reportUrl":"https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Homo_sapiens/GCF_000001405.40-RS_2024_08.html","softwareVersion":"10.3","stats":{"geneCounts":{"nonCoding":22163,"other":411,"proteinCoding":20078,"pseudogene":17063,"total":59715}},"status":"Updated annotation"},"assemblyInfo":{"assemblyLevel":"Chromosome","assemblyName":"GRCh38.p14","assemblyStatus":"current","assemblyType":"haploid-with-alt-loci","bioprojectAccession":"PRJNA31257","bioprojectLineage":[{"bioprojects":[{"accession":"PRJNA31257","title":"The Human Genome Project, currently maintained by the Genome Reference Consortium (GRC)"}]}],"blastUrl":"https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&PROG_DEF=blastn&BLAST_SPEC=GDH_GCF_000001405.40","description":"Genome Reference Consortium Human Build 38 patch release 14 (GRCh38.p14)","pairedAssembly":{"accession":"GCA_000001405.29","differences":"Removed 4 unlocalized and unplaced scaffolds.","onlyGenbank":"4 unlocalized and unplaced scaffolds.","refseqGenbankAreDifferent":true,"status":"current"},"refseqCategory":"reference genome","releaseDate":"2022-02-03","submitter":"Genome Reference 
Consortium","synonym":"hg38"},"assemblyStats":{"atgcCount":"3357952943","contigL50":18,"contigN50":57879411,"gapsBetweenScaffoldsCount":349,"gcCount":"1374283647","gcPercent":41.0,"numberOfComponentSequences":35611,"numberOfContigs":996,"numberOfOrganelles":1,"numberOfScaffolds":470,"scaffoldL50":16,"scaffoldN50":67794873,"totalNumberOfChromosomes":24,"totalSequenceLength":"3099441038","totalUngappedLength":"2948318359"},"currentAccession":"GCF_000001405.40","organelleInfo":[{"description":"Mitochondrion","submitter":"Genome Reference Consortium","totalSeqLength":"16569"}],"organism":{"commonName":"human","organismName":"Homo sapiens","taxId":9606},"pairedAccession":"GCA_000001405.29","sourceDatabase":"SOURCE_DATABASE_REFSEQ"}

data/data_summary.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Organism Scientific Name Organism Common Name Organism Qualifier Taxonomy id Assembly Name Assembly Accession Source Annotation Level Contig N50 Size Submission Date Gene Count BioProject BioSample
2+
Homo sapiens human 9606 GRCh38.p14 GCF_000001405.40 RefSeq GCF_000001405.40-RS_2024_08 Chromosome 57879411 3099441038 2022-02-03 59715 PRJNA31257

data/dataset_catalog.json

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{
2+
"apiVersion": "V2",
3+
"assemblies": [
4+
{
5+
"files": [
6+
{
7+
"filePath": "data_summary.tsv",
8+
"fileType": "DATA_TABLE",
9+
"uncompressedLengthBytes": "348"
10+
},
11+
{
12+
"filePath": "assembly_data_report.jsonl",
13+
"fileType": "DATA_REPORT",
14+
"uncompressedLengthBytes": "2344"
15+
}
16+
]
17+
},{
18+
"accession": "GCF_000001405.40",
19+
"files": [
20+
{
21+
"filePath": "GCF_000001405.40/protein.faa",
22+
"fileType": "PROTEIN_FASTA",
23+
"uncompressedLengthBytes": "105928370"
24+
}
25+
]
26+
}]}

generate_control_peptides.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import random
4+
import sys
5+
from pathlib import Path
6+
from typing import List
7+
8+
# One-letter codes of the 20 amino acids peptides are drawn from / filtered against.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"


def generate_random_peptides(length: int, count: int) -> List[str]:
    """Return *count* peptides of *length* residues drawn uniformly from AMINO_ACIDS.

    Residues are drawn independently with replacement using the module-level
    ``random`` generator, so seeding ``random`` beforehand makes the output
    reproducible.

    Args:
        length: Number of residues per peptide; must be at least 1.
        count: Number of peptides to produce; must not be negative.

    Returns:
        A list of ``count`` strings, each ``length`` characters long.

    Raises:
        ValueError: If ``length`` is smaller than 1 or ``count`` is negative.
    """
    # Without these checks a non-positive length silently yields empty
    # "peptides", which would be written out as FASTA records with no sequence.
    if length < 1:
        raise ValueError(f"Peptide length must be at least 1, got {length}")
    if count < 0:
        raise ValueError(f"Peptide count must not be negative, got {count}")
    return ["".join(random.choices(AMINO_ACIDS, k=length)) for _ in range(count)]
12+
13+
def parse_fasta_sequences(fasta_path: Path) -> List[str]:
    """Read a FASTA file and return its sequences, one string per record.

    Header lines (starting with ``>``) only act as record separators; their
    text is discarded. Sequence lines are stripped of surrounding whitespace
    and concatenated until the next header. Blank lines are ignored, and a
    header that is not followed by any sequence line contributes nothing.
    """
    records: List[str] = []
    pieces: List[str] = []

    def flush() -> None:
        # Close the record being collected, if it has any sequence text.
        if pieces:
            records.append(''.join(pieces))
            pieces.clear()

    with open(fasta_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                flush()
            elif text:
                pieces.append(text)

    # The last record has no trailing header to terminate it.
    flush()
    return records
30+
31+
def sample_peptides_from_fasta(fasta_path: Path, length: int, count: int) -> List[str]:
    """Sample *count* peptides of *length* residues from the sequences in a FASTA file.

    Every window of ``length`` consecutive residues in every sequence is a
    candidate. Candidates are sampled by position, so a peptide that occurs at
    several positions can be returned more than once. If the file offers fewer
    than ``count`` windows, all of them are used once (in random order) and
    the remainder is filled with windows drawn with replacement.

    Args:
        fasta_path: FASTA file to read (parsed with ``parse_fasta_sequences``).
        length: Peptide length; must be at least 1.
        count: Number of peptides to return.

    Returns:
        A list of ``count`` peptides, each ``length`` characters long.

    Raises:
        ValueError: If ``length`` is smaller than 1, if no sequence is at
            least ``length`` residues long, or (raised by ``random.sample``)
            if ``count`` is negative.
    """
    from bisect import bisect_right
    from itertools import accumulate

    # A non-positive length would otherwise slice empty or wrong-sized strings.
    if length < 1:
        raise ValueError(f"Peptide length must be at least 1, got {length}")

    sequences = [seq for seq in parse_fasta_sequences(fasta_path) if len(seq) >= length]

    # window_ends[i] is the number of windows in sequences[0..i]; global window
    # index g therefore belongs to the first sequence whose end exceeds g.
    # Indexing windows instead of materialising them keeps memory proportional
    # to the file size rather than to (file size x peptide length).
    window_ends = list(accumulate(len(seq) - length + 1 for seq in sequences))
    total = window_ends[-1] if window_ends else 0
    if total == 0:
        raise ValueError(f"No subsequences of length {length} found in {fasta_path}")

    def window_at(index: int) -> str:
        seq_no = bisect_right(window_ends, index)
        first_index = window_ends[seq_no - 1] if seq_no else 0
        start = index - first_index
        return sequences[seq_no][start:start + length]

    picks = random.sample(range(total), k=min(count, total))
    # Not enough distinct positions: top up with replacement.
    picks.extend(random.randrange(total) for _ in range(count - len(picks)))
    return [window_at(index) for index in picks]
44+
45+
def generate_protgpt2_peptides(
    length: int,
    count: int,
    temperature: "float | None" = None,
    top_k: int = 950,
    top_p: "float | None" = None,
    repetition_penalty: float = 1.2,
) -> List[str]:
    """Generate up to *count* peptides of exactly *length* residues with ProtGPT2.

    The ``nferruz/ProtGPT2`` text-generation pipeline is sampled in batches of
    at most 10 sequences. Characters outside AMINO_ACIDS (whitespace,
    newlines, special tokens) are dropped from each generated text, and only
    results whose remaining length equals ``length`` are kept. Generation
    stops after ``count * 10`` batches even if too few peptides were found.

    Args:
        length: Required peptide length.
        count: Number of peptides wanted.
        temperature: Sampling temperature; omitted from the pipeline call
            when None so the library default applies.
        top_k: Top-k sampling cutoff passed to the pipeline.
        top_p: Nucleus sampling cutoff; omitted from the call when None.
        repetition_penalty: Repetition penalty passed to the pipeline.

    Returns:
        A list of at most ``count`` peptides. A warning is printed to stderr
        when fewer than ``count`` could be generated.

    Exits the process with status 1 if the ``transformers`` package is not
    installed.
    """
    try:
        from transformers import pipeline
    except ImportError:
        print("Error: transformers package is required for ProtGPT2 generation. Please install with 'pip install transformers torch'", file=sys.stderr)
        sys.exit(1)

    if count <= 0:
        # Nothing to generate; avoid loading the model for no reason.
        return []

    # Only forward the optional knobs that were actually set.
    sampling = {"do_sample": True, "top_k": top_k, "repetition_penalty": repetition_penalty}
    if temperature is not None:
        sampling["temperature"] = temperature
    if top_p is not None:
        sampling["top_p"] = top_p

    # Each token is ~4 amino acids, so a peptide of length N needs about N/4
    # tokens (rounded up); never request fewer than 5 tokens.
    max_length = max(5, (length + 3) // 4)
    protgpt2 = pipeline('text-generation', model="nferruz/ProtGPT2", framework="pt")

    peptides = []
    tries = 0
    max_tries = count * 10
    while len(peptides) < count and tries < max_tries:
        tries += 1
        batch = protgpt2(
            "<|endoftext|>",
            max_length=max_length,
            num_return_sequences=min(count - len(peptides), 10),
            eos_token_id=0,
            **sampling,
        )
        if not batch or not hasattr(batch, '__iter__'):
            continue
        for item in batch:
            if not isinstance(item, dict):
                continue
            gen_text = item.get('generated_text', '')
            if not isinstance(gen_text, str):
                continue
            # Remove whitespace and newlines, keep only valid amino acids
            pep = ''.join(c for c in gen_text if c in AMINO_ACIDS)
            if len(pep) == length:
                peptides.append(pep)

    if len(peptides) < count:
        print(f"Warning: Only generated {len(peptides)} peptides of requested {count} with exact length {length}.", file=sys.stderr)
    return peptides[:count]


def write_fasta(peptides: List[str], output_path: Path, prefix: str = "peptide") -> None:
    """Write *peptides* to *output_path* as FASTA records named ``<prefix>_<n>``.

    Records are numbered from 1 in list order; an existing file is overwritten.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f">{prefix}_{i}\n{pep}\n" for i, pep in enumerate(peptides, 1))


def _int_at_least(minimum: int):
    """Build an argparse ``type=`` callable accepting integers >= *minimum*."""
    def parse(text: str) -> int:
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
        if value < minimum:
            raise argparse.ArgumentTypeError(f"must be at least {minimum}, got {value}")
        return value
    return parse


def main():
    """Command-line entry point: generate peptides and write them to a FASTA file."""
    parser = argparse.ArgumentParser(description="Generate control peptides for neoantigen analysis.")
    parser.add_argument('--length', type=_int_at_least(1), required=True, help='Peptide length (e.g., 8, 9, 10)')
    parser.add_argument('--count', type=_int_at_least(0), required=True, help='Number of peptides to generate')
    parser.add_argument('--source', choices=['random', 'fasta', 'protgpt2'], required=True, help='Source of peptides: random, fasta, or protgpt2')
    parser.add_argument('--fasta_file', type=Path, help='Path to reference FASTA file (required if source is fasta)')
    parser.add_argument('--output', type=Path, default=Path('control_peptides.fasta'), help='Output FASTA file')
    parser.add_argument('--seed', type=int, help='Random seed for reproducibility (not used for protgpt2)')
    # Sampling options sent by peptide_gui.py; they only affect --source protgpt2.
    parser.add_argument('--temperature', type=float, help='ProtGPT2 sampling temperature (protgpt2 only)')
    parser.add_argument('--top_k', type=int, default=950, help='ProtGPT2 top-k cutoff (protgpt2 only, default: 950)')
    parser.add_argument('--top_p', type=float, help='ProtGPT2 top-p (nucleus) cutoff (protgpt2 only)')
    parser.add_argument('--repetition_penalty', type=float, default=1.2, help='ProtGPT2 repetition penalty (protgpt2 only, default: 1.2)')
    args = parser.parse_args()

    if args.seed is not None and args.source != 'protgpt2':
        random.seed(args.seed)

    if args.source == 'fasta' and not args.fasta_file:
        print('Error: --fasta_file is required when source is fasta', file=sys.stderr)
        sys.exit(1)

    try:
        if args.source == 'random':
            peptides = generate_random_peptides(args.length, args.count)
        elif args.source == 'fasta':
            peptides = sample_peptides_from_fasta(args.fasta_file, args.length, args.count)
        else:
            # 'protgpt2' -- argparse `choices` rules out any other value.
            peptides = generate_protgpt2_peptides(
                args.length,
                args.count,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                repetition_penalty=args.repetition_penalty,
            )
        write_fasta(peptides, args.output)
    except (OSError, ValueError) as exc:
        # Unreadable/unwritable files or unusable inputs: report, don't traceback.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"Wrote {len(peptides)} peptides to {args.output}")


if __name__ == "__main__":
    main()

peptide_gui.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Requires: pip install PySimpleGUI
2+
import PySimpleGUI as sg
3+
import subprocess
4+
import sys
5+
import os
6+
7+
# Layout for the GUI
layout = [
    [
        sg.Text('Peptide Source:'),
        # readonly: only the three sources offered here can be chosen, no free text.
        sg.Combo(['random', 'fasta', 'protgpt2'], default_value='random', key='-SOURCE-',
                 enable_events=True, readonly=True),
    ],
    [
        sg.Text('Peptide Length:'), sg.Input(key='-LENGTH-', size=(5, 1)),
        sg.Text('Count:'), sg.Input(key='-COUNT-', size=(5, 1)),
    ],
    [
        # The FASTA widgets start disabled because the default source is
        # 'random'; the event loop only toggles them when the source changes.
        sg.Text('FASTA File:'),
        sg.Input(key='-FASTA-', size=(30, 1), disabled=True),
        sg.FileBrowse(file_types=(('FASTA Files', '*.fasta *.faa'),), key='-FASTA_BROWSE-', disabled=True),
    ],
    [
        sg.Text('Output File:'),
        sg.Input(key='-OUTPUT-', size=(30, 1)),
        sg.FileSaveAs(file_types=(('FASTA Files', '*.fasta'),), key='-OUTPUT_BROWSE-'),
    ],
    [sg.Frame('ProtGPT2 Options (optional)', [
        [
            sg.Text('Temperature:'), sg.Input(key='-TEMP-', size=(5, 1)),
            sg.Text('Top-k:'), sg.Input(key='-TOPK-', size=(5, 1)),
            sg.Text('Top-p:'), sg.Input(key='-TOPP-', size=(5, 1)),
            sg.Text('Repetition Penalty:'), sg.Input(key='-REPPEN-', size=(5, 1)),
        ],
    ])],
    [sg.Button('Generate'), sg.Exit()],
    # Read-only log area that shows the command line and the CLI's output.
    [sg.Multiline(size=(80, 10), key='-OUTPUTBOX-', autoscroll=True, disabled=True)],
]

window = sg.Window('Peptide Control Generator', layout)
24+
25+
# Locate the CLI next to this file so the GUI works from any working directory.
SCRIPT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'generate_control_peptides.py')


def _positive_int(text):
    """Return *text* parsed as an integer >= 1, or None if it is not one."""
    try:
        number = int(text.strip())
    except ValueError:
        return None
    return number if number >= 1 else None


# Event loop: react to source changes and run the CLI when 'Generate' is pressed.
while True:
    event, values = window.read()
    if event in (sg.WIN_CLOSED, 'Exit'):
        break

    # Enable/disable FASTA file input based on source
    if event == '-SOURCE-':
        fasta_disabled = values['-SOURCE-'] != 'fasta'
        window['-FASTA-'].update(disabled=fasta_disabled)
        window['-FASTA_BROWSE-'].update(disabled=fasta_disabled)

    if event == 'Generate':
        outputbox = window['-OUTPUTBOX-']
        source = values['-SOURCE-']
        length = _positive_int(values['-LENGTH-'])
        count = _positive_int(values['-COUNT-'])
        fasta_file = values['-FASTA-'].strip()
        output_file = values['-OUTPUT-'].strip()

        # Catch obviously unusable input here instead of letting the CLI fail.
        if length is None or count is None:
            outputbox.update('Error: Peptide Length and Count must be positive whole numbers.\n')
            continue
        if source == 'fasta' and not fasta_file:
            outputbox.update('Error: a FASTA file is required when the source is fasta.\n')
            continue

        # Build command
        cmd = [sys.executable, SCRIPT_PATH, '--length', str(length), '--count', str(count), '--source', source]
        if output_file:
            # Left blank, the CLI falls back to its own default output file.
            cmd += ['--output', output_file]
        if source == 'fasta':
            cmd += ['--fasta_file', fasta_file]
        if source == 'protgpt2':
            # NOTE(review): these flags must be defined by the CLI's argument
            # parser; verify against generate_control_peptides.py.
            optional_flags = (
                ('--temperature', values['-TEMP-']),
                ('--top_k', values['-TOPK-']),
                ('--top_p', values['-TOPP-']),
                ('--repetition_penalty', values['-REPPEN-']),
            )
            for flag, raw_value in optional_flags:
                flag_value = raw_value.strip()
                if flag_value:
                    cmd += [flag, flag_value]

        outputbox.update('Running command:\n' + ' '.join(cmd) + '\n')
        window.refresh()
        try:
            with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as proc:
                for line in proc.stdout:
                    outputbox.update(line, append=True)
                    # The loop blocks until the CLI finishes; refresh so its
                    # output appears as it arrives rather than all at the end.
                    window.refresh()
                proc.wait()
            outputbox.update(f'\nDone. Exit code: {proc.returncode}\n', append=True)
        except (OSError, subprocess.SubprocessError) as e:
            outputbox.update(f'Error: {e}\n', append=True)

window.close()

0 commit comments

Comments
 (0)