"""
Analyze eFP database structures to find the most compact schema representation.

Since we only need 3 columns (data_probeset_id, data_signal, data_bot_id),
this script groups databases by their column signatures to identify
shared patterns and enable a table-driven schema definition.

Usage:
    python api/Archive/analyze_efp_schemas.py
"""

import csv
from collections import defaultdict

# Input CSV dumps (paths are relative to the repository root).
STRUCTURE_CSV = "api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv"
SAMPLE_DATA_CSV = "api/Archive/sample_data_export_feb_4.csv"

# Only these 3 columns matter for the API
NEEDED_COLUMNS = {"data_probeset_id", "data_signal", "data_bot_id"}

# Extra columns that some databases have (we want to know which ones).
# NOTE(review): this constant is not referenced elsewhere in this script —
# extras are recomputed by set difference in main(); kept as documentation
# of the known extra-column vocabulary.
EXTRA_COLUMNS = {
    "channel", "data_call", "data_num", "data_p_val", "data_p_value",
    "genome", "genome_id", "log", "orthogroup", "p_val", "project_id",
    "qvalue", "sample_file_name", "sample_tissue", "version",
}


def parse_structure_csv():
    """
    Parse the structure CSV into per-database column definitions.

    Returns:
        dict mapping database_name -> {column_name: {"type": str,
        "nullable": bool, "default": str | None}}, built from the
        INFORMATION_SCHEMA-style columns of STRUCTURE_CSV.
    """
    db_columns = defaultdict(dict)  # {db_name: {col_name: {type, nullable, default}}}

    # newline="" is required by the csv module; encoding is pinned so the
    # parse does not silently depend on the platform's locale default.
    with open(STRUCTURE_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            db = row["database_name"]
            col = row["COLUMN_NAME"]
            db_columns[db][col] = {
                "type": row["COLUMN_TYPE"],
                # IS_NULLABLE is the MySQL-style "YES"/"NO" string.
                "nullable": row["IS_NULLABLE"] == "YES",
                "default": row["COLUMN_DEFAULT"],
            }
    return db_columns


def extract_signature(db_cols):
    """
    Build a hashable signature tuple for the 3 needed columns of one database.

    Returns:
        (probeset_type, probeset_nullable, signal_nullable, signal_default,
         bot_type, bot_nullable)

    Missing columns/fields fall back to type "?", nullable False,
    default "NULL" so every database yields a comparable 6-tuple.
    """
    fallback = {"type": "?", "nullable": False, "default": "NULL"}
    spec = (
        ("data_probeset_id", ("type", "nullable")),
        ("data_signal", ("nullable", "default")),
        ("data_bot_id", ("type", "nullable")),
    )
    signature = []
    for column_name, fields in spec:
        column = db_cols.get(column_name, {})
        signature.extend(column.get(f, fallback[f]) for f in fields)
    return tuple(signature)


def parse_varchar_length(col_type):
    """
    Return N for a "varchar(N)" column type, else None.

    Non-varchar types (tinytext, text, ...) have no fixed length, which
    callers encode as None.
    """
    prefix = "varchar("
    if not col_type.startswith(prefix):
        return None  # tinytext, text, etc.
    # Strip "varchar(" and the closing ")" to get the numeric length.
    return int(col_type[len(prefix):-1])


def main():
    """Analyze the dumped eFP schemas and print a compaction report to stdout."""
    db_columns = parse_structure_csv()

    print("=" * 80)
    print("EFP SCHEMA ANALYSIS - Only 3 columns needed")
    print("=" * 80)
    print(f"\nTotal databases: {len(db_columns)}")
    print(f"Needed columns: {', '.join(sorted(NEEDED_COLUMNS))}")

    # ---- 1. Check which databases have extra columns beyond the 3 ----
    print("\n" + "=" * 80)
    print("DATABASES WITH EXTRA COLUMNS (beyond the 3 needed + proj_id + sample_id)")
    print("=" * 80)
    dbs_with_extras = {}
    for db, cols in sorted(db_columns.items()):
        # proj_id / sample_id are treated as standard bookkeeping columns,
        # so they never count as "extra".
        extras = set(cols.keys()) - NEEDED_COLUMNS - {"proj_id", "sample_id"}
        if extras:
            dbs_with_extras[db] = extras
            print(f" {db}: {', '.join(sorted(extras))}")

    dbs_simple = set(db_columns.keys()) - set(dbs_with_extras.keys())
    print(f"\n -> {len(dbs_simple)} databases have ONLY the 5 standard columns")
    print(f" -> {len(dbs_with_extras)} databases have extra columns")

    # ---- 2. Group databases by their 3-column signature ----
    print("\n" + "=" * 80)
    print("GROUPING BY SIGNATURE (probeset_type, probeset_nullable, signal_nullable, signal_default, bot_type, bot_nullable)")
    print("=" * 80)

    sig_groups = defaultdict(list)
    for db, cols in sorted(db_columns.items()):
        sig = extract_signature(cols)
        sig_groups[sig].append(db)

    # Largest groups first, so the dominant pattern is visible at a glance.
    for sig, dbs in sorted(sig_groups.items(), key=lambda x: -len(x[1])):
        print(f"\n Signature: probeset={sig[0]}(nullable={sig[1]}) signal(nullable={sig[2]}, default={sig[3]}) bot={sig[4]}(nullable={sig[5]})")
        print(f" Count: {len(dbs)}")
        print(f" DBs: {', '.join(dbs[:10])}{'...' if len(dbs) > 10 else ''}")

    # ---- 3. Group by (probeset_len, bot_len) - the key variable dimensions ----
    print("\n" + "=" * 80)
    print("DATA-DRIVEN COMPACT FORMAT: Group by (probeset_type, bot_type)")
    print("Only considering the 3 needed columns")
    print("=" * 80)

    # For the compact representation, what varies per database is:
    # - data_probeset_id: type (varchar(N) or tinytext) and length
    # - data_bot_id: type (varchar(N) or tinytext) and length
    # - data_signal: nullable and default (always float)
    # We can represent this as a tuple per database

    compact_entries = []
    for db, cols in sorted(db_columns.items()):
        p = cols.get("data_probeset_id", {})
        s = cols.get("data_signal", {})
        b = cols.get("data_bot_id", {})

        # Fallback types used when a column is absent from the structure dump;
        # NOTE(review): the varchar(24)/varchar(16) defaults look like the
        # most common lengths — confirm against the dump before relying on them.
        probeset_type = p.get("type", "varchar(24)")
        bot_type = b.get("type", "varchar(16)")
        signal_nullable = s.get("nullable", False)

        probeset_len = parse_varchar_length(probeset_type)
        bot_len = parse_varchar_length(bot_type)

        # Determine extra columns this DB needs
        extras = set(cols.keys()) - NEEDED_COLUMNS - {"proj_id", "sample_id"}

        compact_entries.append({
            "db": db,
            "probeset_len": probeset_len,  # None = tinytext
            "probeset_type": probeset_type,
            "bot_len": bot_len,  # None = tinytext
            "bot_type": bot_type,
            "signal_nullable": signal_nullable,
            "extras": extras,
        })

    # ---- 4. Show the most compact table-driven representation ----
    print("\n" + "=" * 80)
    print("PROPOSED COMPACT TUPLE FORMAT")
    print("Each DB needs: (name, probeset_len_or_None, bot_len_or_None, signal_nullable)")
    print("None = tinytext (TEXT in our schema)")
    print("=" * 80)

    # Group by shared properties to find patterns; extras are sorted into a
    # tuple so the whole key is hashable and order-insensitive.
    pattern_groups = defaultdict(list)
    for e in compact_entries:
        key = (e["probeset_len"], e["bot_len"], e["signal_nullable"], tuple(sorted(e["extras"])))
        pattern_groups[key].append(e["db"])

    print(f"\nUnique (probeset_len, bot_len, signal_nullable, extras) combinations: {len(pattern_groups)}")
    print("\nTop patterns (most databases sharing the same column spec):")
    for (pl, bl, sn, ex), dbs in sorted(pattern_groups.items(), key=lambda x: -len(x[1]))[:20]:
        extras_str = f", extras={list(ex)}" if ex else ""
        print(f" probeset={pl}, bot={bl}, signal_nullable={sn}{extras_str}")
        print(f" Count: {len(dbs)}, DBs: {', '.join(dbs[:5])}{'...' if len(dbs) > 5 else ''}")

    # ---- 5. Generate the most compact code ----
    print("\n" + "=" * 80)
    print("GENERATED COMPACT TABLE (for efp_schemas.py)")
    print("Format: (db_name, probeset_len, bot_len)")
    print(" - probeset_len: int for varchar(N), 0 for tinytext")
    print(" - bot_len: int for varchar(N), 0 for tinytext")
    print(" - signal is always float, nullable is always True (safe default)")
    print("=" * 80)

    # Simple databases (only 3 needed columns, no extras of concern)
    simple_dbs = []
    complex_dbs = []
    for e in compact_entries:
        # Filter out databases that ONLY have unneeded extras
        # (sample_file_name, data_call, data_p_val etc. are not needed)
        has_important_extras = e["extras"] - {
            "sample_file_name", "data_call", "data_p_val", "data_p_value", "data_num"
        }
        if has_important_extras:
            complex_dbs.append(e)
        else:
            simple_dbs.append(e)

    print(f"\nSimple databases (only need 3 columns): {len(simple_dbs)}")
    print(f"Complex databases (have unique extra columns): {len(complex_dbs)}")

    print("\n# ---- SIMPLE DATABASES (table-driven) ----")
    print("# (db_name, probeset_len, bot_len)")
    print("# probeset_len/bot_len: positive int = varchar(N), 0 = tinytext")
    print("_SIMPLE_EFP_SPECS = [")
    for e in sorted(simple_dbs, key=lambda x: x["db"]):
        # 0 stands in for tinytext so the emitted tuples stay all-int.
        pl = e["probeset_len"] if e["probeset_len"] is not None else 0
        bl = e["bot_len"] if e["bot_len"] is not None else 0
        print(f' ("{e["db"]}", {pl}, {bl}),')
    print("]")

    print(f"\n# ---- COMPLEX DATABASES (need manual definition) ----")
    for e in sorted(complex_dbs, key=lambda x: x["db"]):
        pl = e["probeset_len"] if e["probeset_len"] is not None else "tinytext"
        bl = e["bot_len"] if e["bot_len"] is not None else "tinytext"
        print(f'# {e["db"]}: probeset={pl}, bot={bl}, extras={sorted(e["extras"])}')

    # ---- 6. Analyze sample data for testing ----
    print("\n" + "=" * 80)
    print("SAMPLE DATA SUMMARY (for test verification)")
    print("=" * 80)

    try:
        db_samples = defaultdict(list)
        with open(SAMPLE_DATA_CSV, newline="") as f:
            reader = csv.DictReader(f)
            for row in reader:
                db_samples[row["source_database"]].append({
                    "data_bot_id": row["data_bot_id"],
                    "data_probeset_id": row["data_probeset_id"],
                    "data_signal": row["data_signal"],
                })

        print(f"Total databases with sample data: {len(db_samples)}")
        print(f"Total sample rows: {sum(len(v) for v in db_samples.values())}")

        # Verify sample data matches structure: warn on mismatches in either
        # direction rather than failing, since this is a reporting script.
        for db in sorted(db_samples.keys()):
            if db not in db_columns:
                print(f" WARNING: {db} has sample data but no structure definition!")
        for db in sorted(db_columns.keys()):
            if db not in db_samples:
                print(f" WARNING: {db} has structure but no sample data!")

    # Sample data is optional; the structure analysis above is still valid.
    except FileNotFoundError:
        print(" Sample data file not found, skipping.")

    # ---- 7. Final recommendation ----
    print("\n" + "=" * 80)
    print("RECOMMENDATION")
    print("=" * 80)
    print(f"""
Since you only need 3 columns (data_probeset_id, data_signal, data_bot_id),
the entire schema can be reduced to a simple lookup table.

Current efp_schemas.py: ~1984 lines
Proposed compact version: ~{len(simple_dbs) + 50} lines (table + builder)

Each database only differs in:
 1. data_probeset_id length (varchar(N) or tinytext)
 2. data_bot_id length (varchar(N) or tinytext)

data_signal is always float.

The compact format uses a list of tuples:
 (db_name, probeset_len, bot_len)

A single builder function converts these tuples into full schema dicts.

Complex databases ({len(complex_dbs)}) that have unique extra columns
(channel, genome, genome_id, orthogroup, version, log, p_val, qvalue,
sample_tissue) need individual definitions.
""")


| 268 | +if __name__ == "__main__": |
| 269 | + main() |