Skip to content

Commit 43dd1f4

Browse files
authored
first draft of the script that transfers mpog data to tabular format … (#473)
* script to transfer MPOG data to tabular format using new ingestion model created
1 parent 1d25989 commit 43dd1f4

File tree

1 file changed

+212
-0
lines changed

1 file changed

+212
-0
lines changed
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
import json
2+
import os
3+
import re
4+
import uuid
5+
6+
import pandas as pd
7+
8+
# Directory containing the input message JSON files.
# Set the INPUT_DIR environment variable to point the script at another
# directory; when it is unset, the default below is used and also written
# back to os.environ (that is what setdefault does).
INPUT_DIR = os.environ.setdefault(
    "INPUT_DIR",
    "S:/PCRC 166 Landis-Lewis/Final Data/ControlArmInputMessagesWithSimulatedHistory/2024-09_h",
)
11+
12+
13+
def extract_number(filename):
    """Return the numeric sort key embedded in *filename*.

    The key is the integer formed by the first run of digits that directly
    follows an underscore. Names without such a run get ``float("inf")`` so
    that they sort after every numbered file.
    """
    found = re.search(r"_(\d+)", filename)
    if found is None:
        return float("inf")
    return int(found.group(1))
20+
21+
22+
# Workbook whose "Provider" sheet supplies Institution and Professional_Role
# for each Provider_Number.
_PROVIDERS_WORKBOOK = (
    r"S:\PCRC 166 Landis-Lewis\Final Data\Precison Feedback Data 2025-03-07.xlsx"
)

# Comparator column in the performance data -> IRI written to ``group.code``.
# The key order is also the order in which the columns are melted.
_COMPARATOR_CODES = {
    "peer_average_comparator": "http://purl.obolibrary.org/obo/PSDO_0000126",
    "peer_75th_percentile_benchmark": "http://purl.obolibrary.org/obo/PSDO_0000128",
    "peer_90th_percentile_benchmark": "http://purl.obolibrary.org/obo/PSDO_0000129",
    "MPOG_goal": "http://purl.obolibrary.org/obo/PSDO_0000094",
}


def _read_input_messages(input_dir):
    """Load every ``.json`` file in *input_dir*, ordered by ``extract_number``.

    Returns ``(columns, performance_rows, preferences_rows, history_rows)``.
    ``columns`` is the header row of the first file; the header rows of later
    files are skipped, not compared against it.

    Raises:
        FileNotFoundError: *input_dir* contains no ``.json`` files.
        ValueError: a file's ``Performance_data`` lacks a header row or has
            no data rows.
    """
    input_files = sorted(
        (name for name in os.listdir(input_dir) if name.endswith(".json")),
        key=extract_number,
    )
    if not input_files:
        raise FileNotFoundError(f"No .json input files found in {input_dir!r}")

    columns = None
    performance_rows = []
    preferences_rows = []
    history_rows = []

    for filename in input_files:
        path = os.path.join(input_dir, filename)
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)

        performance_data = data["Performance_data"]
        if len(performance_data) < 2:
            raise ValueError(
                f"{path}: 'Performance_data' must contain a header row and "
                "at least one data row"
            )
        if columns is None:
            columns = performance_data[0]
        performance_rows.extend(performance_data[1:])

        # The first cell of the first data row identifies the file's subject
        # (presumably the staff_number column; verify against the input
        # schema).
        subject = performance_data[1][0]
        preferences_rows.append([subject, data["Preferences"]])
        history_rows.extend(
            [subject, period, message] for period, message in data["History"].items()
        )

    return columns, performance_rows, preferences_rows, history_rows


def _add_period_end(frame):
    """Add a ``period.end`` column derived from ``period.start``.

    ``period.start`` is parsed as a date, rolled forward to the last day of
    its month (dates already on a month end stay put) and stored as a
    ``YYYY-MM-DD`` string. *frame* is modified in place.
    """
    month_end = pd.to_datetime(frame["period.start"]) + pd.offsets.MonthEnd(0)
    frame["period.end"] = month_end.dt.strftime("%Y-%m-%d")


def _build_comparator_table(performance_data_df):
    """Return the long-format comparator table built from the merged performance data.

    One output row is produced per distinct (measure, period, institution,
    role) combination and comparator column.
    """
    id_columns = [
        "measure", "period.start", "period.end", "Institution", "Professional_Role"
    ]
    comparator_columns = list(_COMPARATOR_CODES)

    comparator_df = performance_data_df[id_columns + comparator_columns].drop_duplicates()
    comparator_df = comparator_df.melt(
        id_vars=id_columns,
        value_vars=comparator_columns,
        var_name="group.code",  # new column for the original column names
        value_name="measureScore.rate",  # new column for the values
    )
    comparator_df = comparator_df.rename(
        columns={
            "Institution": "group.subject",
            "Professional_Role": "PractitionerRole.code",
        }
    )
    comparator_df["identifier"] = [
        str(uuid.uuid4()) for _ in range(len(comparator_df))
    ]
    comparator_df = comparator_df[
        [
            "identifier",
            "measure",
            "period.start",
            "measureScore.rate",
            "period.end",
            "group.subject",
            "group.code",
            "PractitionerRole.code",
        ]
    ]
    comparator_df["group.code"] = comparator_df["group.code"].replace(_COMPARATOR_CODES)
    return comparator_df


def main(input_dir=None, providers_path=_PROVIDERS_WORKBOOK, output_dir="."):
    """Convert the JSON input messages into five CSV tables.

    Args:
        input_dir: Directory holding the ``.json`` input messages. Defaults
            to the module-level ``INPUT_DIR``.
        providers_path: Excel workbook whose ``Provider`` sheet has the
            ``Provider_Number``, ``Institution`` and ``Professional_Role``
            columns.
        output_dir: Directory the CSV files are written to. Defaults to the
            current working directory.

    Writes ``PerformanceMeasureReport.csv``, ``ComparatorMeasureReport.csv``,
    ``PractitionerRole.csv``, ``Preference.csv`` and ``MessageHistory.csv``.

    Raises:
        FileNotFoundError: *input_dir* contains no ``.json`` files.
        ValueError: an input file has no performance data rows.
    """
    if input_dir is None:
        input_dir = INPUT_DIR

    columns, performance_rows, preferences_rows, history_rows = _read_input_messages(
        input_dir
    )

    # --- Performance measure report -------------------------------------
    performance_data_df = pd.DataFrame(performance_rows, columns=columns)
    performance_data_df["identifier"] = [
        str(uuid.uuid4()) for _ in range(len(performance_data_df))
    ]
    performance_data_df = performance_data_df.rename(
        columns={
            "staff_number": "subject",
            "month": "period.start",
            "denominator": "measureScore.denominator",
        }
    )
    performance_data_df["measureScore.rate"] = (
        performance_data_df["passed_count"]
        / performance_data_df["measureScore.denominator"]
    )
    _add_period_end(performance_data_df)
    performance_data_df["measureScore.range"] = None

    # Attach each subject's institution and role; a left merge keeps
    # performance rows whose subject is missing from the provider sheet.
    df_providers = pd.read_excel(providers_path, sheet_name="Provider")
    performance_data_df = performance_data_df.merge(
        df_providers[["Provider_Number", "Institution", "Professional_Role"]],
        left_on="subject",
        right_on="Provider_Number",
        how="left",
    )

    # --- Comparators and practitioner roles (need the merged columns) ----
    comparator_df = _build_comparator_table(performance_data_df)

    # .copy() gives an independent frame, so adding a column below cannot
    # trigger SettingWithCopyWarning.
    subject_data_df = (
        performance_data_df[["subject", "Institution", "Professional_Role"]]
        .drop_duplicates()
        .copy()
    )
    subject_data_df["type"] = "Practitioner"
    subject_data_df = subject_data_df.rename(
        columns={
            "subject": "PractitionerRole.practitioner",
            "Institution": "PractitionerRole.organization",
            "Professional_Role": "PractitionerRole.code",
        }
    )

    performance_data_df = performance_data_df[
        [
            "identifier",
            "measure",
            "subject",
            "period.start",
            "period.end",
            "measureScore.rate",
            "measureScore.denominator",
            "measureScore.range",
        ]
    ]

    # --- Preferences -----------------------------------------------------
    # NOTE(review): to_csv writes these values with str(), so dict values
    # come out as Python repr rather than JSON text -- confirm that the
    # consumer of the "*.json" columns expects that.
    preferences_data_df = pd.DataFrame(
        preferences_rows, columns=["subject", "preferences.json"]
    )
    # Subjects with empty preferences are left out of the table.
    preferences_data_df = preferences_data_df[
        preferences_data_df["preferences.json"] != {}
    ]

    # --- Message history -------------------------------------------------
    history_data_df = pd.DataFrame(
        history_rows, columns=["subject", "period.start", "history.json"]
    )
    _add_period_end(history_data_df)
    history_data_df = history_data_df[
        ["subject", "period.start", "period.end", "history.json"]
    ]

    # --- Output ----------------------------------------------------------
    outputs = [
        ("PerformanceMeasureReport.csv", performance_data_df),
        ("ComparatorMeasureReport.csv", comparator_df),
        ("PractitionerRole.csv", subject_data_df),
        ("Preference.csv", preferences_data_df),
        ("MessageHistory.csv", history_data_df),
    ]
    for csv_name, frame in outputs:
        frame.to_csv(os.path.join(output_dir, csv_name), index=False)
209+
210+
211+
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not when imported.
    main()

0 commit comments

Comments
 (0)