-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
74 lines (63 loc) · 3.36 KB
/
main.py
File metadata and controls
74 lines (63 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Importing the scripts as part of a package
from preprocessing import BayPass, PCA, PoPoolation
import pandas as pd
import numpy as np
# Library for Logging
# import loguru
# Library for better markdown console messages
# from rich.console import Console
# from rich.markdown import Markdown
# Library for cli
# import argparse
# Library alternative to pandas for quicker dataframe operations (supports multithreading as well)
# import polars
# data validation
# import pydantic
# Input CSV files to process, in order: the 24 per-sequence files numbered
# LR537121 through LR537144, followed by the combined file.
maf_files = [f'maf_LR{sequence_number}.csv' for sequence_number in range(537121, 537145)]
maf_files.append('maf_all.csv')
# To process only the combined file, use this instead:
# maf_files = ['maf_all.csv']
def detect_zero_sum_rows(input_dataframe):
    """Return the index labels of rows in which any sample has no allele counts.

    The first four columns are skipped (the original note names them chr,
    pos, index and ref_n). Every remaining column is read as an allele count,
    laid out in consecutive groups of four (A, T, C, G) per sample. A row is
    reported when the four counts of at least one sample sum to zero.

    Two summary lines are printed: the number of affected rows and the total
    number of (row, sample) pairs whose counts sum to zero.

    Args:
        input_dataframe: pandas DataFrame with four leading metadata columns
            followed by the per-sample allele-count columns.

    Returns:
        A list of index labels of the affected rows, suitable for
        ``DataFrame.drop(index=...)``.

    Raises:
        ValueError: If the number of allele-count columns is not a multiple
            of 4, i.e. the columns cannot be split into A/T/C/G groups.
    """
    # Everything after the four metadata columns is an allele count.
    allele_columns = input_dataframe.iloc[:, 4:]
    num_rows, num_allele_columns = allele_columns.shape

    # Fail with an explicit message instead of an opaque NumPy reshape error
    # when the column layout does not match four counts per sample.
    if num_allele_columns % 4 != 0:
        raise ValueError(
            "Allele-count columns must come in groups of 4 (A, T, C, G) per "
            f"sample, but found {num_allele_columns} columns after the first "
            "4; the count must be a multiple of 4."
        )
    num_samples = num_allele_columns // 4

    # 3D view: rows (genomic positions) x samples x allele counts (A, T, C, G).
    reshaped_data = allele_columns.to_numpy().reshape(num_rows, num_samples, 4)
    # Total count per (row, sample).
    allele_sums = reshaped_data.sum(axis=2)

    is_zero_sum = allele_sums == 0
    rows_with_zero = is_zero_sum.any(axis=1)
    # Filter the index directly; there is no need to copy the matching rows
    # into a new DataFrame just to read their labels.
    zero_sum_indices = input_dataframe.index[rows_with_zero].tolist()
    zero_count = is_zero_sum.sum()

    print(f"Number of rows with zero allele sum: {len(zero_sum_indices)}")
    print(f"Total ATCG zero sum instances count: {zero_count}")
    return zero_sum_indices
# Downstream preparation steps, run in this order for every input file.
# Each entry pairs the tool label used in console messages with a deferred
# call; the lambdas keep the module attribute lookup inside the try block
# below, so a missing function is reported like any other step failure.
pipeline_steps = (
    ("BayPass", lambda frame, name: BayPass.run_baypass_prep(frame, name)),
    ("PCA", lambda frame, name: PCA.run_pca(frame, name)),
    ("PoPoolation", lambda frame, name: PoPoolation.run_popoolation_prep(frame, name)),
)

for maf_filename in maf_files:
    # header=1 takes the second line as the header, skipping the duplicate
    # header row at the top of each CSV.
    csv_path = f'maf_files/{maf_filename}'
    biallelic_fish_df = pd.read_csv(csv_path, sep=",", header=1)

    # Remove positions where at least one sample has no allele counts at all.
    rows_to_drop = detect_zero_sum_rows(biallelic_fish_df)
    biallelic_fish_df.drop(index=rows_to_drop, inplace=True)

    # Best effort: a failure in one tool is reported and the remaining tools
    # (and remaining files) are still processed.
    for tool_name, run_step in pipeline_steps:
        try:
            print(f"Processing {maf_filename} with {tool_name}...")
            run_step(biallelic_fish_df, maf_filename)
            print(f"Successfully processed {maf_filename} with {tool_name}.\n")
        except Exception as e:
            print(f"Error processing {maf_filename} with {tool_name}: {e}\n")