Skip to content

Commit 9b2301a

Browse files
authored
feat: add scripts to fetch and process data (#2)
# Description

This PR copies the scripts over from the examples repo to the new repo.

Closes #3

<!-- Select quick/in-depth as necessary -->
This PR needs a quick review.

## Checklist

- [x] Read through for typos, added new words to the dictionary
- [x] Checked that the README is up to date
- [x] Resolved any Ruff errors / formatted in Markdown
1 parent 8888063 commit 9b2301a

File tree

5 files changed

+692
-47
lines changed

5 files changed

+692
-47
lines changed

poetry.lock

Lines changed: 77 additions & 47 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ python = "^3.12"
2222
polars = "^1.24.0"
2323
pyjanitor = "^0.30.0"
2424
seedcase-sprout = {git = "https://github.com/seedcase-project/seedcase-sprout.git"}
25+
requests = "^2.32.3"
26+
fastexcel = "^0.13.0"
2527

2628
[tool.poetry.group.dev.dependencies]
2729
ruff = "^0.6.2"

scripts/convert-meta.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
from pathlib import Path
2+
3+
import janitor.polars
4+
import polars as pl
5+
6+
# Set the folder path for raw-data
# resource_dir is the parent of the folder that contains this script.
resource_dir = Path(__file__).resolve().parent.parent
# Folder the processed CSV files are written to.
folder_path = resource_dir / "data-raw"
# Folder the source metadata CSV files are read from.
unzip_path = folder_path / "downloaded"
10+
11+
# Create the infant metadata files
12+
# FIO All infants in meta_infant_ur are included in meta_infant_bl.
13+
# There is one extra infant in meta_infant_bl.
14+
df_meta_infant_bl = pl.read_csv(unzip_path / "metadata_infant_blood.csv")
15+
df_meta_infant_ur = pl.read_csv(unzip_path / "metadata_infant_urine.csv")
16+
17+
# Distinct rows of the infant-level columns from the infant blood metadata,
# with cleaned column names.
df_infant = (
    df_meta_infant_bl.select(
        ["Infant_ID", "Mother_ID", "GD_Delivery", "Fostered", "Foster_ID"]
    )
    .rename({"GD_Delivery": "gestation_day_delivery"})
    .unique()
    .clean_names()
)
# Written in a separate statement so that df_infant keeps the frame:
# write_csv returns None when it is given a target file.
df_infant.write_csv(folder_path / "data_infant_meta.csv", separator=";")
# infant gender is in data_adult at present, all subjects are male,
# females are not included in the study
28+
29+
# Infant weights from the blood metadata, renamed to the shared
# (infant_id, weight_at_x_days_old, x_days_old) layout.
df_infant_w1 = (
    df_meta_infant_bl.select(["Infant_ID", "Weight_PD7", "ActualDay_PD7"])
    .rename({"Weight_PD7": "weight_at_x_days_old", "ActualDay_PD7": "x_days_old"})
    .clean_names()
)
# Infant weights from the urine metadata, renamed to the same layout.
df_infant_w2 = (
    df_meta_infant_ur.select(["Infant_ID", "Infant_weight", "PD"])
    .rename({"Infant_weight": "weight_at_x_days_old", "PD": "x_days_old"})
    .clean_names()
)
# Concatenate (long) and drop duplicated rows.
df_infant_weight = (
    pl.concat([df_infant_w1, df_infant_w2])
    .unique()
    .clean_names()
)
# Written in a separate statement so that df_infant_weight keeps the frame:
# write_csv returns None when it is given a target file.
df_infant_weight.write_csv(folder_path / "data_infant_weight.csv", separator=";")
45+
46+
# Create the infant linking tables

# Labels stored in the "type_of_matter" column; they are reused by the adult
# linking tables further down.
matter1 = "Blood"
matter2 = "Urine"

# Sample-level columns from each infant file, tagged with the sample matter.
df_link_infant1 = df_meta_infant_bl.select(
    ["Infant_ID", "Exp", "PD", "Batch"]
).with_columns(pl.lit(matter1).alias("type_of_matter"))
df_link_infant2 = df_meta_infant_ur.select(
    ["Infant_ID", "Exp", "PD", "Batch"]
).with_columns(pl.lit(matter2).alias("type_of_matter"))

df_infant_link = (
    pl.concat([df_link_infant1, df_link_infant2])  # Concatenate long
    .rename({"PD": "day_sample_taken", "Exp": "infant_sample_id"})
    .clean_names()
)
# Written in a separate statement so that df_infant_link keeps the frame:
# write_csv returns None when it is given a target file.
df_infant_link.write_csv(
    folder_path / "data_infant_sample_meta.csv", separator=";"
)
)
64+
65+
# Create the adult metadata files
# FIO There are no additional Mother_ID in file 11
# The three maternal frames (blood, urine, placenta) are read from the
# "downloaded" folder and reused by the adult tables built further down.
df_metadata_maternal_bl = pl.read_csv(unzip_path / "metadata_maternal_blood.csv")
df_metadata_maternal_ur = pl.read_csv(unzip_path / "metadata_maternal_urine.csv")
df_metadata_maternal_pl = pl.read_csv(unzip_path / "metadata_maternal_placenta.csv")
70+
71+
# Adult metadata file
# The same mother-level columns are taken from the blood and the urine
# metadata, so the column list is defined once and shared.
adult_meta_columns = [
    "Mother_ID",
    "Mother_age",
    "GD_Delivery",
    "Group",
    "Mode_birth",
    "Reject",
    "Infant_sex",
]
df_adult_meta1 = df_metadata_maternal_bl.select(adult_meta_columns)
df_adult_meta2 = df_metadata_maternal_ur.select(adult_meta_columns)

df_adult = (
    pl.concat([df_adult_meta1, df_adult_meta2])  # Concatenate long
    .rename(
        {
            "Mother_age": "Age_at_conception",
            "GD_Delivery": "gestation_day_at_delivery",
            "Group": "obesity_classification",
            "Mode_birth": "mode_of_birth",
        }
    )
    .clean_names()
    .unique()
)
# Written in a separate statement so that df_adult keeps the frame:
# write_csv returns None when it is given a target file.
df_adult.write_csv(folder_path / "data_adult_meta.csv", separator=";")
109+
# Adult weight file
# The blood and urine files name their gestation-day columns differently;
# both are renamed to the same target names before being stacked.
df_adult_weight1 = df_metadata_maternal_bl.select(
    ["Mother_ID", "GD_day", "GD_targeted", "Mother_Weight", "BCS"]
).rename(
    {
        "GD_day": "sample_gestation_day",
        "GD_targeted": "target_gestation_day",
        "Mother_Weight": "weight_at_gestation_day",
        "BCS": "body_condition_score",
    }
)
df_adult_weight2 = df_metadata_maternal_ur.select(
    ["Mother_ID", "GD", "Target_GD", "Mother_Weight", "BCS"]
).rename(
    {
        "GD": "sample_gestation_day",
        "Target_GD": "target_gestation_day",
        "Mother_Weight": "weight_at_gestation_day",
        "BCS": "body_condition_score",
    }
)
# Concatenate long and drop duplicated rows.
# NOTE(review): unlike data_adult_meta.csv, no clean_names() is applied here,
# so "Mother_ID" keeps its original capitalisation in the output - confirm
# this is intended.
df_adult_weight = pl.concat([df_adult_weight1, df_adult_weight2]).unique()
# Written in a separate statement so that df_adult_weight keeps the frame:
# write_csv returns None when it is given a target file.
df_adult_weight.write_csv(folder_path / "data_adult_weight.csv", separator=";")
135+
136+
# Create the adult linking tables

# matter1 = "Blood" defined above in infant linking tables
# matter2 = "Urine" defined above in infant linking tables
matter3 = "Placenta"

# The blood file uses GD_day / GD_targeted; rename them to the GD / Target_GD
# names used by the urine and placenta files so the frames can be stacked.
df_link_adult1 = (
    df_metadata_maternal_bl.select(
        ["Mother_ID", "Exp", "GD_day", "GD_targeted", "Batch", "Dilution_factor"]
    )
    .rename({"GD_day": "GD", "GD_targeted": "Target_GD"})
    .with_columns(pl.lit(matter1).alias("type_of_matter"))
)
df_link_adult2 = df_metadata_maternal_ur.select(
    ["Mother_ID", "Exp", "GD", "Target_GD", "Batch", "Dilution_factor"]
).with_columns(pl.lit(matter2).alias("type_of_matter"))
df_link_adult3 = df_metadata_maternal_pl.select(
    ["Mother_ID", "Exp", "GD", "Target_GD", "Batch", "Dilution_factor"]
).with_columns(pl.lit(matter3).alias("type_of_matter"))

# NOTE(review): unlike data_infant_sample_meta.csv, no clean_names() is
# applied here, so columns such as "Mother_ID" keep their original
# capitalisation in the output - confirm this is intended.
df_adult_link = pl.concat(
    [df_link_adult1, df_link_adult2, df_link_adult3]  # Concatenate long
).rename(
    {
        "GD": "day_sample_taken",
        "Target_GD": "target_sampling_day",
        "Exp": "adult_sample_id",
    }
)
# Written in a separate statement so that df_adult_link keeps the frame:
# write_csv returns None when it is given a target file.
df_adult_link.write_csv(folder_path / "data_adult_sample_meta.csv", separator=";")
167+
168+
# Create the placenta file
# Both from metadata_maternal_bl (Placenta_Width,Placenta_Height,
# Placenta_Thickness,EPV) and from 11 (see below)
# More than one measure in metadata_maternal_bl?
# The string below is a bare expression that is never assigned or used; it
# only records the variable list of the placenta metadata file.
"""
11 - Metadata_Maternal_placenta.csv
* Variables
o Exp: Samples IDs. The same “Exp” represent the same sample in Concentration_Maternal.placenta.csv file.
o Mother_ID: IDs of mothers.
o Batch: The experiment was conducted over two batches (Batch1 or Batch2)
o Group: Lean or Obese.
o GD: Exact gestational day (GD) when samples were collected.
o Target_GD: Target GD for sample collection.
o Dilution_factor: Dilution factor used to prepare NMR samples.
o BCS: Body Condition Score (BCS)
o Tissue_weight: Weight of placental tissue sample.
o V1: Volume of solvent used to extract (uL). Used to correct the metabolite concentration.
o V2: Volume of polar layer (methanol + water) collected (uL). Used to correct the metabolite concentration.
o V3: Buffer added to reconstitute the sample after freeze drying (uL). Used to correct the metabolite concentration.
* Missing data codes: Indicated by NAs.
"""

0 commit comments

Comments
 (0)