-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathorganize_adni_data.py
More file actions
164 lines (132 loc) · 4.63 KB
/
organize_adni_data.py
File metadata and controls
164 lines (132 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# ==============================
# DATASET FILTERING AND CLEANUP
# ==============================
"""
This script performs preprocessing and filtering of raw NIfTI MRI files
for the CN and EMCI groups. Key steps include:
1. Validating CSV and raw NIfTI directories.
2. Ensuring NIfTI files are correctly gzipped (.nii.gz) or uncompressed (.nii).
3. Matching subject IDs from CSVs to available NIfTI scans.
4. Copying and renaming files to a unified output directory.
5. Reporting missing scans.
After running this script, the `gan_data` folder will contain a clean
and normalized subset of MRI scans ready for downstream GAN training
or slice extraction.
"""
# ------------------------------
# Imports
# ------------------------------
import os
import shutil
import pandas as pd
import gzip
from pathlib import Path
# ------------------------------
# CONFIGURATION
# ------------------------------
# Automatically sets base directory to location of this script
BASE_DIR = Path(__file__).resolve().parent
# Per-group subject-list CSVs; each must contain a column whose header
# mentions "subject" (see detect_subject_column below).
CN_CSV = BASE_DIR / "Dataset" / "CN.csv"
EMCI_CSV = BASE_DIR / "Dataset" / "EMCI.csv"
# Root holding the raw NIfTI files; group subdirectories ("CN",
# "EMCI DATASET") are resolved relative to this at the bottom of the script.
RAW_ROOT = BASE_DIR / "Dataset"
# Destination for the filtered, renamed copies ("gan_data/CN", "gan_data/EMCI").
OUTPUT_ROOT = BASE_DIR / "gan_data"
# NOTE(review): USE_SYMLINKS is never read anywhere in this script —
# shutil.copy2 is always used. Either wire it up or remove it.
USE_SYMLINKS = False # Use symbolic links instead of copies (safer=False for Windows)
# ------------------------------
# UTILITY FUNCTIONS
# ------------------------------
def safe_mkdir(path: Path):
    """Ensure that *path* exists as a directory.

    Missing parent directories are created as well; an already-existing
    directory is silently accepted.
    """
    path.mkdir(parents=True, exist_ok=True)
def validate_path(path: Path, description: str):
    """Assert that a required *path* exists on disk.

    Raises:
        FileNotFoundError: when the path is absent; *description* is used
            to make the error message human-readable.
    """
    if path.exists():
        return
    raise FileNotFoundError(f"❌ Missing {description}: {path}")
def is_real_gzip(path: Path):
"""Check if a file is a valid gzip archive."""
try:
with gzip.open(path, "rb") as f:
f.read(1)
return True
except OSError:
return False
def normalize_nii_file(path: Path) -> Path:
    """Return a path to a readable NIfTI file, repairing a bogus ``.gz`` name.

    A file ending in ``.gz`` that is not actually gzip-compressed is renamed
    on disk to drop that extension (``scan.nii.gz`` -> ``scan.nii``);
    otherwise *path* is returned unchanged.
    """
    if path.suffix != ".gz":
        return path
    if is_real_gzip(path):
        return path
    fixed = path.with_suffix("")  # strip only the misleading ".gz"
    path.rename(fixed)
    print(f"🔧 Renamed: {path.name} → {fixed.name}")
    return fixed
def find_nii_files(root: Path):
    """Return every ``.nii`` and ``.nii.gz`` file found anywhere under *root*.

    Raises FileNotFoundError (via validate_path) when *root* is missing.
    """
    validate_path(root, "raw dataset directory")
    plain = root.rglob("*.nii")
    gzipped = root.rglob("*.nii.gz")
    return [*plain, *gzipped]
def detect_subject_column(df):
    """Return the name of the first column whose header mentions "subject".

    The match is case-insensitive and substring-based, so headers such as
    "Subject" or "subject_id" all qualify.

    Raises:
        ValueError: when no column header contains "subject".
    """
    found = next((col for col in df.columns if "subject" in col.lower()), None)
    if found is not None:
        return found
    raise ValueError(
        f"❌ No subject column found. Columns were: {list(df.columns)}"
    )
# ------------------------------
# CORE PROCESSING FUNCTION
# ------------------------------
def process_group(csv_path: Path, diagnosis: str, raw_subdir: Path, out_root: Path):
    """
    Filter and copy NIfTI files for a specific diagnostic group.

    For every subject ID in the CSV, the first not-yet-used NIfTI file whose
    name contains the ID is normalized (bogus .gz stripped) and copied to
    ``out_root/<diagnosis>/<subject><ext>``. Subjects with no matching scan
    are reported and skipped.

    Args:
        csv_path: Path to CSV containing subject IDs.
        diagnosis: "CN" or "EMCI".
        raw_subdir: Directory with raw NIfTI files for the group.
        out_root: Output directory for filtered and normalized files.

    Raises:
        FileNotFoundError: if the CSV or the raw directory is missing.
        ValueError: if no subject-ID column can be detected in the CSV.
    """
    validate_path(csv_path, f"{diagnosis} CSV")
    # Load CSV and extract unique subject IDs (unique() preserves CSV order)
    df = pd.read_csv(csv_path)
    subject_col = detect_subject_column(df)
    subjects = df[subject_col].dropna().astype(str).unique()
    print(f"[{diagnosis}] Subjects in CSV: {len(subjects)}")
    # Prepare output directory
    out_dir = out_root / diagnosis
    safe_mkdir(out_dir)
    # Index all raw NIfTI files once up front
    nii_files = find_nii_files(raw_subdir)
    print(f"[{diagnosis}] Indexed {len(nii_files)} raw NIfTI files")
    used = set()
    for subj in subjects:
        # Match NIfTI file corresponding to the current subject.
        # NOTE(review): substring matching can collide when one subject ID
        # is a prefix of another (e.g. "002_S_029" vs "002_S_0295") —
        # confirm ADNI IDs are fixed-width before relying on this.
        matches = [
            f for f in nii_files
            if subj in f.name and f not in used
        ]
        if not matches:
            print(f"⚠️ Missing scan for {subj}")
            continue
        # Mark the pre-normalization path as consumed; nii_files still
        # holds the original path objects even if the file gets renamed.
        used.add(matches[0])
        # Normalize gzip files if needed (may rename the file on disk)
        src = normalize_nii_file(matches[0])
        # BUG FIX: Path.suffix on "scan.nii.gz" is just ".gz", so gzipped
        # scans used to be copied as "<subj>.gz", losing the ".nii" part.
        # Preserve the full NIfTI extension instead; find_nii_files only
        # yields *.nii and *.nii.gz, so these two cases are exhaustive.
        ext = ".nii.gz" if src.name.endswith(".nii.gz") else src.suffix
        dst = out_dir / f"{subj}{ext}"
        shutil.copy2(src, dst)
    print(f"[{diagnosis}] Done.\n")
# ------------------------------
# RUN FOR CN AND EMCI
# ------------------------------
# Each call filters one diagnostic group into OUTPUT_ROOT/<group>.
# Note the raw subdirectory names differ: "CN" vs "EMCI DATASET".
process_group(CN_CSV, "CN", RAW_ROOT / "CN", OUTPUT_ROOT)
process_group(EMCI_CSV, "EMCI", RAW_ROOT / "EMCI DATASET", OUTPUT_ROOT)
print("✅ Dataset restructuring + NIfTI normalization complete.")