|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +import janitor.polars |
| 4 | +import polars as pl |
| 5 | + |
# Locate the raw-data folders relative to this script: the resource
# directory is the parent of the directory that holds this file.
resource_dir = Path(__file__).resolve().parents[1]
folder_path = resource_dir.joinpath("data-raw")
unzip_path = folder_path.joinpath("downloaded")
| 10 | + |
# Create the infant metadata files
# FIO All infants in meta_infant_ur are included in meta_infant_bl.
# There is one extra infant in meta_infant_bl.
df_meta_infant_bl = pl.read_csv(unzip_path / "metadata_infant_blood.csv")
df_meta_infant_ur = pl.read_csv(unzip_path / "metadata_infant_urine.csv")

# Distinct infant records, taken from the blood metadata only.
# The frame is built first and written in a separate statement so that
# df_infant holds the DataFrame rather than the None that write_csv returns
# when it is given a target path. maintain_order=True keeps the rows in
# input order so the written file is reproducible between runs.
# NOTE(review): this file names the column "gestation_day_delivery" while
# the adult metadata uses "gestation_day_at_delivery" - confirm which one
# downstream consumers expect before unifying.
df_infant = (
    df_meta_infant_bl.select(
        ["Infant_ID", "Mother_ID", "GD_Delivery", "Fostered", "Foster_ID"]
    )
    .rename({"GD_Delivery": "gestation_day_delivery"})
    .unique(maintain_order=True)
    .clean_names()
)
df_infant.write_csv(folder_path / "data_infant_meta.csv", separator=";")
# infant gender is in data_adult at present, all subjects are male,
# females are not included in the study
| 28 | + |
# Infant weights in long format. Both sources are renamed to the same
# two measurement columns so they can be stacked:
#   blood metadata: Weight_PD7 / ActualDay_PD7
#   urine metadata: Infant_weight / PD
df_infant_w1 = (
    df_meta_infant_bl.select(["Infant_ID", "Weight_PD7", "ActualDay_PD7"])
    .rename({"Weight_PD7": "weight_at_x_days_old", "ActualDay_PD7": "x_days_old"})
    .clean_names()
)
df_infant_w2 = (
    df_meta_infant_ur.select(["Infant_ID", "Infant_weight", "PD"])
    .rename({"Infant_weight": "weight_at_x_days_old", "PD": "x_days_old"})
    .clean_names()
)

# Concatenate (long) and drop repeated rows. Both inputs already have
# cleaned column names, so no further clean_names() call is needed.
# maintain_order=True keeps the output row order reproducible.
df_infant_weight = pl.concat([df_infant_w1, df_infant_w2]).unique(
    maintain_order=True
)
# Written in a separate statement so df_infant_weight stays a DataFrame
# (write_csv returns None when given a path).
df_infant_weight.write_csv(folder_path / "data_infant_weight.csv", separator=";")
| 45 | + |
# Create the infant linking tables

# Labels written to the "type_of_matter" column; also used by the adult
# linking tables further down.
matter1 = "Blood"
matter2 = "Urine"

# Same columns are taken from both infant metadata files.
_infant_link_columns = ["Infant_ID", "Exp", "PD", "Batch"]

df_link_infant1 = df_meta_infant_bl.select(_infant_link_columns).with_columns(
    pl.lit(matter1).alias("type_of_matter")
)
df_link_infant2 = df_meta_infant_ur.select(_infant_link_columns).with_columns(
    pl.lit(matter2).alias("type_of_matter")
)

df_infant_link = (
    pl.concat([df_link_infant1, df_link_infant2])  # Concatenate long
    .rename({"PD": "day_sample_taken", "Exp": "infant_sample_id"})
    .clean_names()
)
# Written in a separate statement so df_infant_link stays a DataFrame
# (write_csv returns None when given a path).
df_infant_link.write_csv(folder_path / "data_infant_sample_meta.csv", separator=";")
| 64 | + |
# Create the adult metadata files
# FIO There are no additional Mother_ID in file 11
df_metadata_maternal_bl = pl.read_csv(unzip_path / "metadata_maternal_blood.csv")
df_metadata_maternal_ur = pl.read_csv(unzip_path / "metadata_maternal_urine.csv")
df_metadata_maternal_pl = pl.read_csv(unzip_path / "metadata_maternal_placenta.csv")

# Adult metadata file
# The same per-mother columns are taken from the blood and urine metadata.
_adult_meta_columns = [
    "Mother_ID",
    "Mother_age",
    "GD_Delivery",
    "Group",
    "Mode_birth",
    "Reject",
    "Infant_sex",
]
df_adult_meta1 = df_metadata_maternal_bl.select(_adult_meta_columns)
df_adult_meta2 = df_metadata_maternal_ur.select(_adult_meta_columns)

# Stack both sources, give the columns descriptive snake_case names and
# drop repeated rows. maintain_order=True keeps the output row order
# reproducible between runs.
df_adult = (
    pl.concat([df_adult_meta1, df_adult_meta2])  # Concatenate long
    .rename(
        {
            "Mother_age": "Age_at_conception",
            "GD_Delivery": "gestation_day_at_delivery",
            "Group": "obesity_classification",
            "Mode_birth": "mode_of_birth",
        }
    )
    .clean_names()
    .unique(maintain_order=True)
)
# Written in a separate statement so df_adult stays a DataFrame
# (write_csv returns None when given a path).
df_adult.write_csv(folder_path / "data_adult_meta.csv", separator=";")
# Adult weight file
# The blood and urine metadata name the gestation-day columns differently
# (GD_day / GD_targeted vs. GD / Target_GD); both are renamed to the same
# output names so the two frames can be stacked.
df_adult_weight1 = df_metadata_maternal_bl.select(
    ["Mother_ID", "GD_day", "GD_targeted", "Mother_Weight", "BCS"]
).rename(
    {
        "GD_day": "sample_gestation_day",
        "GD_targeted": "target_gestation_day",
        "Mother_Weight": "weight_at_gestation_day",
        "BCS": "body_condition_score",
    }
)
df_adult_weight2 = df_metadata_maternal_ur.select(
    ["Mother_ID", "GD", "Target_GD", "Mother_Weight", "BCS"]
).rename(
    {
        "GD": "sample_gestation_day",
        "Target_GD": "target_gestation_day",
        "Mother_Weight": "weight_at_gestation_day",
        "BCS": "body_condition_score",
    }
)

# Concatenate long and drop repeated rows; maintain_order=True keeps the
# output row order reproducible between runs.
# NOTE(review): clean_names() is not applied here, so the key column is
# written as "Mother_ID" whereas data_adult_meta.csv has "mother_id" -
# confirm whether the header should be normalised before changing it.
df_adult_weight = pl.concat([df_adult_weight1, df_adult_weight2]).unique(
    maintain_order=True
)
# Written in a separate statement so df_adult_weight stays a DataFrame
# (write_csv returns None when given a path).
df_adult_weight.write_csv(folder_path / "data_adult_weight.csv", separator=";")
| 135 | + |
# Create the adult linking tables

# matter1 = "Blood" defined above in infant linking tables
# matter2 = "Urine" defined above in infant linking tables
matter3 = "Placenta"

# The urine and placenta metadata share these column names; the blood
# metadata uses GD_day / GD_targeted and is renamed to match before stacking.
_adult_link_columns = [
    "Mother_ID",
    "Exp",
    "GD",
    "Target_GD",
    "Batch",
    "Dilution_factor",
]

df_link_adult1 = (
    df_metadata_maternal_bl.select(
        ["Mother_ID", "Exp", "GD_day", "GD_targeted", "Batch", "Dilution_factor"]
    )
    .rename({"GD_day": "GD", "GD_targeted": "Target_GD"})
    .with_columns(pl.lit(matter1).alias("type_of_matter"))
)
df_link_adult2 = df_metadata_maternal_ur.select(_adult_link_columns).with_columns(
    pl.lit(matter2).alias("type_of_matter")
)
df_link_adult3 = df_metadata_maternal_pl.select(_adult_link_columns).with_columns(
    pl.lit(matter3).alias("type_of_matter")
)

# NOTE(review): unlike the infant linking table, clean_names() is not
# applied here, so "Mother_ID", "Batch" and "Dilution_factor" keep their
# original capitalisation in the output - confirm whether that is intended.
df_adult_link = pl.concat(
    [df_link_adult1, df_link_adult2, df_link_adult3]  # Concatenate long
).rename(
    {
        "GD": "day_sample_taken",
        "Target_GD": "target_sampling_day",
        "Exp": "adult_sample_id",
    }
)
# Written in a separate statement so df_adult_link stays a DataFrame
# (write_csv returns None when given a path).
df_adult_link.write_csv(folder_path / "data_adult_sample_meta.csv", separator=";")
| 167 | + |
| 168 | +# Create the placenta file |
| 169 | +# Both from metadata_maternal_bl (Placenta_Width,Placenta_Height, |
| 170 | +# Placenta_Thickness,EPV) and from 11 (see below) |
| 171 | +# More than one measure in metadata_maternal_bl? |
| 172 | +""" |
| 173 | +11 - Metadata_Maternal_placenta.csv |
| 174 | +* Variables |
| 175 | +o Exp: Sample IDs. The same “Exp” represents the same sample in the Concentration_Maternal.placenta.csv file. |
| 176 | +o Mother_ID: IDs of mothers. |
| 177 | +o Batch: The experiment was conducted over two batches (Batch1 or Batch2) |
| 178 | +o Group: Lean or Obese. |
| 179 | +o GD: Exact gestational day (GD) when samples were collected. |
| 180 | +o Target_GD: Target GD for sample collection. |
| 181 | +o Dilution_factor: Dilution factor used to prepare NMR samples. |
| 182 | +o BCS: Body Condition Score (BCS) |
| 183 | +o Tissue_weight: Weight of placental tissue sample. |
| 184 | +o V1: Volume of solvent used to extract (uL). Used to correct the metabolite concentration. |
| 185 | +o V2: Volume of polar layer (methanol + water) collected (uL). Used to correct the metabolite concentration. |
| 186 | +o V3: Buffer added to reconstitute the sample after freeze drying (uL). Used to correct the metabolite concentration. |
| 187 | +* Missing data codes: Indicated by NAs. |
| 188 | +""" |
0 commit comments