Skip to content

Commit bedbddf

Browse files
authored
Merge pull request #305 from rmobmina/fix/refactor-efp-data-and-schemas
Add gene ID utils and refactor eFP expression endpoint
2 parents 1a55d65 + 4c1823c commit bedbddf

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+53647
-460
lines changed

.flake8

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
[flake8]
2-
max-line-length = 120
3-
extend-ignore = E203, W503, E501
2+
ignore = E501, E203, E121, E123, E126, W503, W504
3+
per-file-ignores =
4+
# DATABASE_SPECIES uses aligned dict values for readability (intentional)
5+
api/utils/gene_id_utils.py: E241
46
exclude =
57
.git,
6-
__pycache__,
7-
docs/source/conf.py,
8-
old,
9-
build,
10-
dist,
11-
venv,
12-
env,
138
.venv,
14-
./venv-docs,
9+
__pycache__,
10+
api/Archive,
11+
data,
1512
docs,
16-
.env
13+
instance,
14+
output,
15+
venv

.github/workflows/bar-api.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ jobs:
1212

1313
runs-on: ubuntu-24.04
1414
strategy:
15+
fail-fast: false
1516
matrix:
1617
python-version: [3.10.18, 3.11, 3.12, 3.13]
1718

.github/workflows/codeql.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,16 @@ jobs:
2929

3030
# Initializes the CodeQL tools for scanning.
3131
- name: Initialize CodeQL
32-
uses: github/codeql-action/init@v2
32+
uses: github/codeql-action/init@v3
3333
with:
3434
languages: ${{ matrix.language }}
3535

3636
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
3737
# If this step fails, then you should remove it and run the build manually (see below)
3838
- name: Autobuild
39-
uses: github/codeql-action/autobuild@v2
39+
uses: github/codeql-action/autobuild@v3
4040

4141
- name: Perform CodeQL Analysis
42-
uses: github/codeql-action/analyze@v2
42+
uses: github/codeql-action/analyze@v3
4343
with:
4444
category: "/language:${{matrix.language}}"

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,6 @@ dmypy.json
141141
output/*
142142
!output
143143
!output/placeholder.txt
144+
145+
# Local sqlite mirrors generated from MySQL dumps
146+
config/databases/*.db

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44

55
[![Website Status](https://img.shields.io/website?url=http%3A%2F%2Fbar.utoronto.ca%2Fapi%2F)](http://bar.utoronto.ca/api/) ![GitHub repo size](https://img.shields.io/github/repo-size/BioAnalyticResource/BAR_API) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Documentation Status](https://readthedocs.org/projects/bar-api/badge/?version=latest)](https://bar-api.readthedocs.io/en/latest/?badge=latest)
66

7-
This is the official repository for the Bio-Analytic Resource API. The API documentation can be found [here](https://bar-api.readthedocs.io/en/latest/).
7+
This is the official repository for the Bio-Analytic Resource API. The API documentation can be found [here](https://bar-api.readthedocs.io/en/latest/).

api/Archive/analyze_efp_schemas.py

Lines changed: 269 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
"""
2+
Analyze eFP database structures to find the most compact schema representation.
3+
4+
Since we only need 3 columns (data_probeset_id, data_signal, data_bot_id),
5+
this script groups databases by their column signatures to identify
6+
shared patterns and enable a table-driven schema definition.
7+
8+
Usage:
9+
python api/Archive/analyze_efp_schemas.py
10+
"""
11+
12+
import csv
13+
from collections import defaultdict
14+
15+
STRUCTURE_CSV = "api/Archive/efp_tables_structure_sample_data_dump_01_28_25.csv"
16+
SAMPLE_DATA_CSV = "api/Archive/sample_data_export_feb_4.csv"
17+
18+
# Only these 3 columns matter for the API
19+
NEEDED_COLUMNS = {"data_probeset_id", "data_signal", "data_bot_id"}
20+
21+
# Extra columns that some databases have (we want to know which ones)
22+
EXTRA_COLUMNS = {
23+
"channel", "data_call", "data_num", "data_p_val", "data_p_value",
24+
"genome", "genome_id", "log", "orthogroup", "p_val", "project_id",
25+
"qvalue", "sample_file_name", "sample_tissue", "version",
26+
}
27+
28+
29+
def parse_structure_csv():
    """Read STRUCTURE_CSV and build per-database column definitions.

    Returns:
        dict: ``{db_name: {col_name: {"type", "nullable", "default"}}}`` where
        ``nullable`` is a bool derived from the ``IS_NULLABLE`` flag.
    """
    columns_by_db = defaultdict(dict)
    with open(STRUCTURE_CSV, newline="") as handle:
        for record in csv.DictReader(handle):
            spec = {
                "type": record["COLUMN_TYPE"],
                "nullable": record["IS_NULLABLE"] == "YES",
                "default": record["COLUMN_DEFAULT"],
            }
            columns_by_db[record["database_name"]][record["COLUMN_NAME"]] = spec
    return columns_by_db
44+
45+
46+
def extract_signature(db_cols):
    """Build a hashable signature for the three columns the API needs.

    Args:
        db_cols: mapping of column name -> spec dict (as produced by
            ``parse_structure_csv`` for one database).

    Returns:
        tuple: (probeset_type, probeset_nullable, signal_nullable,
        signal_default, bot_type, bot_nullable). Missing columns fall back
        to "?" / False / "NULL" placeholders so every database yields a
        comparable tuple.
    """
    probeset = db_cols.get("data_probeset_id", {})
    signal = db_cols.get("data_signal", {})
    bot = db_cols.get("data_bot_id", {})
    signature = (
        probeset.get("type", "?"),
        probeset.get("nullable", False),
        signal.get("nullable", False),
        signal.get("default", "NULL"),
        bot.get("type", "?"),
        bot.get("nullable", False),
    )
    return signature
62+
63+
64+
def parse_varchar_length(col_type):
    """Return N for a ``varchar(N)`` column type, else None.

    Text-like types (tinytext, text, ...) have no fixed length and map to
    None so callers can treat them as unbounded.
    """
    prefix = "varchar("
    if not col_type.startswith(prefix):
        return None  # tinytext, text, etc.
    return int(col_type[len(prefix):-1])
69+
70+
71+
def main():
    """Run the full eFP schema analysis and print a multi-section report.

    Sections: (1) databases with columns beyond the 3 needed, (2) grouping
    by 3-column signature, (3) per-database compact entries, (4) pattern
    grouping, (5) generated compact table code for efp_schemas.py,
    (6) sample-data cross-check, (7) final recommendation.

    NOTE(review): reads STRUCTURE_CSV (required; FileNotFoundError if
    absent) and SAMPLE_DATA_CSV (optional; section 6 is skipped if missing).
    """
    db_columns = parse_structure_csv()

    print("=" * 80)
    print("EFP SCHEMA ANALYSIS - Only 3 columns needed")
    print("=" * 80)
    print(f"\nTotal databases: {len(db_columns)}")
    print(f"Needed columns: {', '.join(sorted(NEEDED_COLUMNS))}")

    # ---- 1. Check which databases have extra columns beyond the 3 ----
    print("\n" + "=" * 80)
    print("DATABASES WITH EXTRA COLUMNS (beyond the 3 needed + proj_id + sample_id)")
    print("=" * 80)
    dbs_with_extras = {}
    for db, cols in sorted(db_columns.items()):
        # proj_id / sample_id are ubiquitous bookkeeping columns, not "extras"
        extras = set(cols.keys()) - NEEDED_COLUMNS - {"proj_id", "sample_id"}
        if extras:
            dbs_with_extras[db] = extras
            print(f" {db}: {', '.join(sorted(extras))}")

    dbs_simple = set(db_columns.keys()) - set(dbs_with_extras.keys())
    print(f"\n -> {len(dbs_simple)} databases have ONLY the 5 standard columns")
    print(f" -> {len(dbs_with_extras)} databases have extra columns")

    # ---- 2. Group databases by their 3-column signature ----
    print("\n" + "=" * 80)
    print("GROUPING BY SIGNATURE (probeset_type, probeset_nullable, signal_nullable, signal_default, bot_type, bot_nullable)")
    print("=" * 80)

    sig_groups = defaultdict(list)
    for db, cols in sorted(db_columns.items()):
        sig = extract_signature(cols)
        sig_groups[sig].append(db)

    # Largest groups first (sort key negates the member count)
    for sig, dbs in sorted(sig_groups.items(), key=lambda x: -len(x[1])):
        print(f"\n Signature: probeset={sig[0]}(nullable={sig[1]}) signal(nullable={sig[2]}, default={sig[3]}) bot={sig[4]}(nullable={sig[5]})")
        print(f" Count: {len(dbs)}")
        print(f" DBs: {', '.join(dbs[:10])}{'...' if len(dbs) > 10 else ''}")

    # ---- 3. Group by (probeset_len, bot_len) - the key variable dimensions ----
    print("\n" + "=" * 80)
    print("DATA-DRIVEN COMPACT FORMAT: Group by (probeset_type, bot_type)")
    print("Only considering the 3 needed columns")
    print("=" * 80)

    # For the compact representation, what varies per database is:
    # - data_probeset_id: type (varchar(N) or tinytext) and length
    # - data_bot_id: type (varchar(N) or tinytext) and length
    # - data_signal: nullable and default (always float)
    # We can represent this as a tuple per database

    compact_entries = []
    for db, cols in sorted(db_columns.items()):
        p = cols.get("data_probeset_id", {})
        s = cols.get("data_signal", {})
        b = cols.get("data_bot_id", {})

        probeset_type = p.get("type", "varchar(24)")
        bot_type = b.get("type", "varchar(16)")
        signal_nullable = s.get("nullable", False)

        probeset_len = parse_varchar_length(probeset_type)
        bot_len = parse_varchar_length(bot_type)

        # Determine extra columns this DB needs
        extras = set(cols.keys()) - NEEDED_COLUMNS - {"proj_id", "sample_id"}

        compact_entries.append({
            "db": db,
            "probeset_len": probeset_len,  # None = tinytext
            "probeset_type": probeset_type,
            "bot_len": bot_len,  # None = tinytext
            "bot_type": bot_type,
            "signal_nullable": signal_nullable,
            "extras": extras,
        })

    # ---- 4. Show the most compact table-driven representation ----
    print("\n" + "=" * 80)
    print("PROPOSED COMPACT TUPLE FORMAT")
    print("Each DB needs: (name, probeset_len_or_None, bot_len_or_None, signal_nullable)")
    print("None = tinytext (TEXT in our schema)")
    print("=" * 80)

    # Group by shared properties to find patterns
    pattern_groups = defaultdict(list)
    for e in compact_entries:
        key = (e["probeset_len"], e["bot_len"], e["signal_nullable"], tuple(sorted(e["extras"])))
        pattern_groups[key].append(e["db"])

    print(f"\nUnique (probeset_len, bot_len, signal_nullable, extras) combinations: {len(pattern_groups)}")
    print("\nTop patterns (most databases sharing the same column spec):")
    for (pl, bl, sn, ex), dbs in sorted(pattern_groups.items(), key=lambda x: -len(x[1]))[:20]:
        extras_str = f", extras={list(ex)}" if ex else ""
        print(f" probeset={pl}, bot={bl}, signal_nullable={sn}{extras_str}")
        print(f" Count: {len(dbs)}, DBs: {', '.join(dbs[:5])}{'...' if len(dbs) > 5 else ''}")

    # ---- 5. Generate the most compact code ----
    print("\n" + "=" * 80)
    print("GENERATED COMPACT TABLE (for efp_schemas.py)")
    print("Format: (db_name, probeset_len, bot_len)")
    print(" - probeset_len: int for varchar(N), 0 for tinytext")
    print(" - bot_len: int for varchar(N), 0 for tinytext")
    print(" - signal is always float, nullable is always True (safe default)")
    print("=" * 80)

    # Simple databases (only 3 needed columns, no extras of concern)
    simple_dbs = []
    complex_dbs = []
    for e in compact_entries:
        # Filter out databases that ONLY have unneeded extras
        # (sample_file_name, data_call, data_p_val etc. are not needed)
        has_important_extras = e["extras"] - {
            "sample_file_name", "data_call", "data_p_val", "data_p_value", "data_num"
        }
        if has_important_extras:
            complex_dbs.append(e)
        else:
            simple_dbs.append(e)

    print(f"\nSimple databases (only need 3 columns): {len(simple_dbs)}")
    print(f"Complex databases (have unique extra columns): {len(complex_dbs)}")

    print("\n# ---- SIMPLE DATABASES (table-driven) ----")
    print("# (db_name, probeset_len, bot_len)")
    print("# probeset_len/bot_len: positive int = varchar(N), 0 = tinytext")
    print("_SIMPLE_EFP_SPECS = [")
    for e in sorted(simple_dbs, key=lambda x: x["db"]):
        # 0 encodes tinytext in the generated table
        pl = e["probeset_len"] if e["probeset_len"] is not None else 0
        bl = e["bot_len"] if e["bot_len"] is not None else 0
        print(f' ("{e["db"]}", {pl}, {bl}),')
    print("]")

    print(f"\n# ---- COMPLEX DATABASES (need manual definition) ----")
    for e in sorted(complex_dbs, key=lambda x: x["db"]):
        pl = e["probeset_len"] if e["probeset_len"] is not None else "tinytext"
        bl = e["bot_len"] if e["bot_len"] is not None else "tinytext"
        print(f'# {e["db"]}: probeset={pl}, bot={bl}, extras={sorted(e["extras"])}')

    # ---- 6. Analyze sample data for testing ----
    print("\n" + "=" * 80)
    print("SAMPLE DATA SUMMARY (for test verification)")
    print("=" * 80)

    try:
        db_samples = defaultdict(list)
        with open(SAMPLE_DATA_CSV, newline="") as f:
            reader = csv.DictReader(f)
            for row in reader:
                db_samples[row["source_database"]].append({
                    "data_bot_id": row["data_bot_id"],
                    "data_probeset_id": row["data_probeset_id"],
                    "data_signal": row["data_signal"],
                })

        print(f"Total databases with sample data: {len(db_samples)}")
        print(f"Total sample rows: {sum(len(v) for v in db_samples.values())}")

        # Verify sample data matches structure
        for db in sorted(db_samples.keys()):
            if db not in db_columns:
                print(f" WARNING: {db} has sample data but no structure definition!")
        for db in sorted(db_columns.keys()):
            if db not in db_samples:
                print(f" WARNING: {db} has structure but no sample data!")

    except FileNotFoundError:
        # Sample data is optional; the structural analysis above still stands.
        print(" Sample data file not found, skipping.")

    # ---- 7. Final recommendation ----
    print("\n" + "=" * 80)
    print("RECOMMENDATION")
    print("=" * 80)
    print(f"""
Since you only need 3 columns (data_probeset_id, data_signal, data_bot_id),
the entire schema can be reduced to a simple lookup table.

Current efp_schemas.py: ~1984 lines
Proposed compact version: ~{len(simple_dbs) + 50} lines (table + builder)

Each database only differs in:
 1. data_probeset_id length (varchar(N) or tinytext)
 2. data_bot_id length (varchar(N) or tinytext)

data_signal is always float.

The compact format uses a list of tuples:
 (db_name, probeset_len, bot_len)

A single builder function converts these tuples into full schema dicts.

Complex databases ({len(complex_dbs)}) that have unique extra columns
(channel, genome, genome_id, orthogroup, version, log, p_val, qvalue,
sample_tissue) need individual definitions.
""")
266+
267+
268+
# Run the analysis only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)