Skip to content

Commit e549a97

Browse files
committed
WIP: Reduce the alleles from rel_dna_ser to the lgx level before loading them into the database nmdp-bioinformatics#252
Reduce the alleles from rel_dna_ser to the "lgx" level before loading them, and reduce the input to "lgx" as well, which will simplify the query and reduce the size of the table. Currently failing on the G group reduction of serology. See # tests/features/serology.feature:15 ``` Given the serology typing is A10 When reducing on the G level (ambiguous) ``` The result doesn't match the one produced when serology_mapping did not contain lgx-reduced alleles.
1 parent 47f347f commit e549a97

File tree

3 files changed

+27
-16
lines changed

3 files changed

+27
-16
lines changed

pyard/ard.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ def __init__(
114114
broad_splits.broad_splits_ser_mapping = (
115115
dr.generate_serology_broad_split_mapping(self.db_connection, imgt_version)
116116
)
117-
dr.generate_serology_mapping(self.db_connection, imgt_version)
117+
dr.generate_serology_mapping(
118+
self.db_connection, imgt_version, self._redux_allele
119+
)
118120
# Load V2 to V3 mappings
119121
dr.generate_v2_to_v3_mapping(self.db_connection, imgt_version)
120122
# Save IMGT database version
@@ -124,9 +126,6 @@ def __init__(
124126
# Load CIWD mapping
125127
dr.generate_cwd_mapping(self.db_connection)
126128

127-
# Close the current read-write db connection
128-
self.db_connection.close()
129-
130129
# Adjust the cache for redux
131130
if max_cache_size != DEFAULT_CACHE_SIZE:
132131
self._redux_allele = functools.lru_cache(maxsize=max_cache_size)(
@@ -147,6 +146,9 @@ def __init__(
147146

148147
gc.freeze()
149148

149+
# Close the current read-write db connection
150+
self.db_connection.close()
151+
150152
# Re-open the connection in read-only mode as we're not updating it anymore
151153
self.db_connection, _ = db.create_db_connection(data_dir, imgt_version, ro=True)
152154

@@ -268,17 +270,19 @@ def _redux_allele(
268270
# If ambiguous, reduce to G group level
269271
return self._redux_allele(allele, "lgx")
270272
elif redux_type == "S":
273+
# reduce allele to ARD level
274+
lgx_allele = self._redux_allele(allele, "lgx", re_ping)
271275
# find serology equivalent in serology_mapping
272-
serology_mapping = db.find_serology_for_allele(self.db_connection, allele)
276+
# look for lgx_allele as a wildcard in the allele list
277+
serology_mapping = db.find_serology_for_allele(
278+
self.db_connection, lgx_allele
279+
)
280+
# Verify that the actual allele is in allele list, if so keep track of
281+
# mapped serology
273282
serology_set = set()
274283
for serology, allele_list in serology_mapping.items():
275-
if allele in allele_list.split("/"):
284+
if lgx_allele in allele_list.split("/"):
276285
serology_set.add(serology)
277-
if not serology_set and is_2_field_allele(allele):
278-
for serology, allele_list in serology_mapping.items():
279-
allele_list_lgx = self.redux(allele_list, "lgx")
280-
if allele in allele_list_lgx.split("/"):
281-
serology_set.add(serology)
282286
return "/".join(
283287
sorted(
284288
serology_set, key=functools.cmp_to_key(self.smart_sort_comparator)

pyard/data_repository.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,9 @@ def to_serological_name(locus_name: str):
354354
return sero_name
355355

356356

357-
def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
357+
def generate_serology_mapping(
358+
db_connection: sqlite3.Connection, imgt_version, redux_function
359+
):
358360
if not db.table_exists(db_connection, "serology_mapping"):
359361
df_sero = load_serology_mappings(imgt_version)
360362

@@ -388,9 +390,14 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
388390
to_serological_name
389391
)
390392

393+
sero_mapping_combined["lgx"] = sero_mapping_combined["Allele"].apply(
394+
lambda allele: redux_function(allele, "lgx")
395+
)
396+
397+
# Create a serology to lgx reduced allele list mapping
391398
sero_mapping = (
392399
sero_mapping_combined.groupby("Sero")
393-
.apply(lambda x: "/".join(sorted(x["Allele"])))
400+
.apply(lambda x: "/".join(set(x["lgx"])))
394401
.to_dict()
395402
)
396403

0 commit comments

Comments
 (0)