-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfind_antigens.py
More file actions
69 lines (55 loc) · 2.22 KB
/
find_antigens.py
File metadata and controls
69 lines (55 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import re
import os
class AntigenExtractor:
    """Collect text files from a directory and pull antigen-like tokens out of them.

    Attributes:
        file_contents: Maps each loaded file name to the full text of that file.
        antigen_pattern: Regular expression (as a string) used to find tokens.
            It matches ``CD`` followed by digits, ``HLA-`` followed by word
            characters, or any run of two or more uppercase letters optionally
            followed by digits. The last alternative also matches ordinary
            acronyms, so results may need filtering by the caller.
    """

    def __init__(self):
        self.file_contents = {}
        self.antigen_pattern = r'\b(?:CD\d+|HLA-\w+|[A-Z]{2,}\d*)\b'

    def read_files_from_directory(self, directory_path: str) -> None:
        """Read every ``.txt`` file directly inside a directory and store its text.

        Entries are added to ``self.file_contents`` keyed by file name; contents
        loaded by earlier calls are kept (same-named files are overwritten).

        Parameters:
            directory_path (str): Path to the directory containing the text files.

        Raises:
            OSError: If the directory cannot be listed or a file cannot be opened.
            UnicodeDecodeError: If a file is not valid UTF-8.
        """
        # Sorted so that the insertion order of file_contents does not depend
        # on the arbitrary order returned by the file system.
        for filename in sorted(os.listdir(directory_path)):
            if not filename.endswith(".txt"):
                continue
            file_path = os.path.join(directory_path, filename)
            # A sub-directory may also be named "*.txt"; opening it would
            # raise and abort the whole scan, so only regular files are read.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                self.file_contents[filename] = file.read()

    def extract_antigens(self) -> dict:
        """Extract the unique antigen names mentioned in each loaded file.

        Returns:
            dict: Keys are file names, values are sets of the distinct strings
            matched by ``self.antigen_pattern`` in that file's text.
        """
        return {
            file_name: set(re.findall(self.antigen_pattern, content))
            for file_name, content in self.file_contents.items()
        }

    def query_unique_antigens(self) -> set:
        """Get the unique antigen names across all loaded files.

        Returns:
            set: Union of the per-file sets from ``extract_antigens``; empty
            when no files have been loaded.
        """
        return set().union(*self.extract_antigens().values())
# Usage example

# Path to the directory where your text files are stored
directory_path = 'downloaded_articles/txt'


def main(path=directory_path):
    """Run the usage example: load the text files under *path* and print antigens.

    Prints the set of matches found in each file, followed by the union of
    matches across all files.

    Parameters:
        path (str): Directory containing the ``.txt`` files to scan. Defaults
            to the module-level ``directory_path``.
    """
    # Create an instance of the extractor
    extractor = AntigenExtractor()

    # Read the files from the directory
    extractor.read_files_from_directory(path)

    # Extract unique antigens from each file
    antigen_data = extractor.extract_antigens()
    print("Antigens found in each file:")
    for file, antigens in antigen_data.items():
        print(f"{file}: {antigens}")

    # Query unique antigens across all files
    unique_antigens = extractor.query_unique_antigens()
    print("\nUnique antigens across all files:")
    print(unique_antigens)


# Guarded so that importing this module (e.g. to reuse AntigenExtractor) does
# not touch the file system or print anything; running it as a script still
# executes the example.
if __name__ == "__main__":
    main()