Skip to content

Commit c4abd35

Browse files
committed
Remove creation of multiple temporary data frames.
- Speeds up processing by reusing the same DataFrame.
1 parent e887950 commit c4abd35

File tree

2 files changed

+27
-20
lines changed

2 files changed

+27
-20
lines changed

pyard/data_repository.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,12 @@
2828
from . import db
2929
from .broad_splits import broad_splits_dna_mapping
3030
from .broad_splits import broad_splits_ser_mapping
31-
from .misc import get_2field_allele, get_3field_allele, number_of_fields
31+
from .misc import (
32+
get_2field_allele,
33+
get_3field_allele,
34+
number_of_fields,
35+
get_1field_allele,
36+
)
3237
from .misc import expression_chars, get_G_name, get_P_name
3338

3439
# GitHub URL where IMGT HLA files are downloaded.
@@ -241,9 +246,12 @@ def load_g_group(imgt_version):
241246
# A* + 02:01 = A*02:01
242247
df["A"] = df["Locus"] + df["A"]
243248
df["G"] = df["Locus"] + df["G"]
249+
# Create 2-field and 3-field versions of the alleles
244250
df["2d"] = df["A"].apply(get_2field_allele)
245251
df["3d"] = df["A"].apply(get_3field_allele)
252+
# lgx is the 2-field version of the G group allele
246253
df["lgx"] = df["G"].apply(get_2field_allele)
254+
247255
return df
248256

249257

@@ -384,28 +392,23 @@ def generate_alleles_and_xx_codes_and_who(
384392
db.save_dict(db_connection, "xx_codes", flat_xx_codes, ("allele_1d", "allele_list"))
385393

386394
# W H O
387-
who_alleles = set(allele_df["Allele"])
395+
who_alleles = allele_df["Allele"].to_list()
388396
# Save this version of the WHO alleles
389397
db.save_set(db_connection, "who_alleles", who_alleles, "allele")
398+
390399
# Create WHO mapping from the unique alleles in the 1-field column
391-
unique_alleles = allele_df["Allele"].unique()
392-
who_df1 = pd.DataFrame(unique_alleles, columns=["Allele"])
393-
who_df1["nd"] = allele_df["Allele"].apply(lambda x: x.split(":")[0])
394-
# Create WHO mapping from the unique alleles in the 2-field column
395-
who_df2 = pd.DataFrame(unique_alleles, columns=["Allele"])
396-
who_df2["nd"] = allele_df["Allele"].apply(get_2field_allele)
397-
# Create WHO mapping from the unique alleles in the 3-field column
398-
who_df3 = pd.DataFrame(unique_alleles, columns=["Allele"])
399-
who_df3["nd"] = allele_df["Allele"].apply(get_3field_allele)
400-
# Combine n-field dataframes in 1
401-
402-
# Create g_codes expansion mapping from the same tables used to reduce to G
403-
g_df = pd.DataFrame(list(ars_mappings.g_group.items()), columns=["Allele", "nd"])
404-
405-
# Create p_codes expansion mapping from the p_group table
406-
p_df = pd.DataFrame(list(p_group.items()), columns=["Allele", "nd"])
407-
408-
who_codes = pd.concat([who_df1, who_df2, who_df3, g_df, p_df])
400+
allele_df["1d"] = allele_df["Allele"].apply(get_1field_allele)
401+
402+
who_codes = pd.concat(
403+
[
404+
allele_df[["Allele", "1d"]].rename(columns={"1d": "nd"}),
405+
allele_df[["Allele", "2d"]].rename(columns={"2d": "nd"}),
406+
allele_df[["Allele", "3d"]].rename(columns={"3d": "nd"}),
407+
pd.DataFrame(ars_mappings.g_group.items(), columns=["Allele", "nd"]),
408+
pd.DataFrame(p_group.items(), columns=["Allele", "nd"]),
409+
],
410+
ignore_index=True,
411+
)
409412

410413
# remove valid alleles from who_codes to avoid recursion
411414
for k in who_alleles:

pyard/misc.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ def get_2field_allele(a: str) -> str:
3737
return get_n_field_allele(a, 2)
3838

3939

40+
def get_1field_allele(a: str) -> str:
41+
return get_n_field_allele(a, 1)
42+
43+
4044
def number_of_fields(allele: str) -> int:
4145
return len(allele.split(":"))
4246

0 commit comments

Comments
 (0)