Skip to content

Commit 9fa181d

Browse files
committed
adds cache for xml ingress
1 parent 3c50b7b commit 9fa181d

File tree

1 file changed

+334
-0
lines changed

1 file changed

+334
-0
lines changed

apps/015_SEN2_app/send2_app.py

Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
import streamlit as st
5+
6+
import xml.etree.ElementTree as ET
7+
import time
8+
9+
10+
11+
def get_values(xml_elements, table_dict: dict, xml_block):
    """Copy the text of named child elements of ``xml_block`` into ``table_dict``.

    For every tag name in ``xml_elements`` the first direct child of
    ``xml_block`` with that tag is looked up and its ``.text`` is stored under
    the tag name. A tag that is not present is recorded as ``pd.NA`` so the
    key always exists in the result.

    Args:
        xml_elements: Iterable of tag names to look up (a list or a pandas
            column ``Index`` both work).
        table_dict: Dictionary that receives one entry per tag name. It is
            updated in place.
        xml_block: Parent XML element to search, or ``None``. With ``None``
            every requested tag is recorded as ``pd.NA``.

    Returns:
        The same ``table_dict`` object that was passed in.
    """
    for element in xml_elements:
        # Explicit None checks instead of a bare ``except``: the only expected
        # failures are "no parent block" and "tag not present"; anything else
        # (including KeyboardInterrupt) should propagate.
        node = xml_block.find(element) if xml_block is not None else None
        # An element that exists but is empty keeps its ``None`` text, which
        # distinguishes it from a tag that is missing altogether (pd.NA).
        table_dict[element] = pd.NA if node is None else node.text
    return table_dict
18+
19+
class XMLtoDF:
    """Flatten a parsed SEN2 XML tree into one pandas DataFrame per record type.

    Constructing an instance walks ``Header`` and every ``Persons/Person``
    element under ``root`` and fills these attributes:

    * ``header``        - one row built from ``CollectionDetails`` and ``Source``
    * ``persons``       - one row per ``Person``
    * ``requests``      - one row per ``Requests`` element of a person
    * ``assessments``   - one row per ``Assessment`` of a request
    * ``named_plan``    - one row per ``NamedPlan/PlanDetail`` of an assessment
    * ``active_plans``  - one row per ``ActivePlans/PlacementDetail`` of a request

    Rows are linked through the generated ``child_id``, ``requests_id`` and
    ``assessment_id`` columns. ``child_id`` counts persons across the file,
    ``requests_id`` restarts at 1 for every person and ``assessment_id``
    restarts at 1 for every request.
    """

    # The class-level frames below are empty column templates. Instances never
    # mutate them: the first append rebinds the name on the instance, so every
    # instance ends up with its own tables.
    # NOTE(review): the header template says "Reference Date" while
    # create_header reads "ReferenceDate"; the template is always replaced in
    # __init__, so this only matters to code reading the class attribute.
    header = pd.DataFrame(columns=["Collection", "Year", "Reference Date"])

    persons = pd.DataFrame(
        columns=[
            "Surname",
            "Forename",
            "PersonBirthDate",
            "Sex",
            "Ethnicity",
            "PostCode",
            "UPN",
            "UniqueLearnerNumber",
            "UPNunknown",
        ]
    )

    requests = pd.DataFrame(
        columns=[
            "ReceivedDate",
            "RYA",
            "RequestOutcomeDate",
            "RequestOutcome",
            "RequestMediation",
            "RequestTribunal",
            "Exported",
        ]
    )

    assessments = pd.DataFrame(
        columns=[
            "AssessmentOutcome",
            "AssessmentOutcomeDate",
            "AssessmentMediation",
            "AssessmentTribunal",
            "OtherMediation",
            "OtherTribunal",
            "Week20",
        ]
    )

    named_plan = pd.DataFrame(
        columns=[
            "StartDate",
            "URN",
            "UKPRN",
            "SENSetting",
            "PlacementRank",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
            "PlanRes",
            "PlanWPB",
            "PB",
            "OA",
            "DP",
            "CeaseDate",
            "CeaseReason",
        ]
    )

    active_plans = pd.DataFrame(
        columns=[
            "TransferLA",
            "URN",
            "UKPRN",
            "SENSetting",
            "SENSettingOther",
            "PlacementRank",
            "EntryDate",
            "LeavingDate",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
            "RES",
            "WPB",
            "SENtype",
            "SENtypeRank",
            "ReviewMeeting",
            "ReviewOutcome",
            "LastReview",
        ]
    )

    # Tables that accumulate rows while the tree is walked.
    _TABLES = ("persons", "requests", "assessments", "named_plan", "active_plans")

    def __init__(self, root):
        """Read every record under ``root`` into the instance's DataFrames.

        Args:
            root: Root element of the parsed XML. It must contain a ``Header``
                and a ``Persons`` child; a missing one raises from the lookups
                below (``AttributeError`` / ``TypeError``).
        """
        self.child_id = 0
        self.name = None
        self.header = self.create_header(root.find("Header"))

        children = root.find("Persons")
        self.total_children = len(children)

        # Buffer rows while walking the tree and concatenate once per table at
        # the end. Concatenating after every person re-copies the whole table
        # each time, which is quadratic in the number of records.
        self._pending = {table: [] for table in self._TABLES}
        for child in children.findall("Person"):
            self.create_child(child)
            if self.child_id % 1000 == 0:
                st.write(f'Read data for {self.child_id} children of {self.total_children} children.')
        self._flush_pending()

        # Keep only plan rows that actually carry a start date.
        self.named_plan = self.named_plan[self.named_plan["StartDate"].notna()].copy()

    def _add_rows(self, table, rows):
        """Append ``rows`` (a list of dicts) to the table named ``table``.

        While ``__init__`` is walking the tree the rows are only buffered;
        outside of that they are concatenated onto the DataFrame immediately,
        so the ``create_*`` methods still work on a finished instance.
        """
        if not rows:
            # Nothing to add: skip the concat with an empty frame entirely.
            return
        pending = getattr(self, "_pending", None)
        if pending is not None:
            pending[table].extend(rows)
            return
        combined = pd.concat(
            [getattr(self, table), pd.DataFrame(rows)], ignore_index=True
        )
        setattr(self, table, combined)

    def _flush_pending(self):
        """Turn the rows buffered during ``__init__`` into DataFrames."""
        pending, self._pending = self._pending, None
        for table, rows in pending.items():
            self._add_rows(table, rows)

    def create_header(self, header):
        """Build the one-row header frame from the ``Header`` element.

        Reads ``CollectionDetails`` (Collection, Year, ReferenceDate) and
        ``Source`` (SourceLevel, LEA, SoftwareCode, Release, SerialNo,
        DateTime). Missing fields are stored as ``pd.NA``.

        Returns:
            A DataFrame with a single row and one column per field.
        """
        header_dict = {}
        collection_details = header.find("CollectionDetails")
        collection_elements = ["Collection", "Year", "ReferenceDate"]
        header_dict = get_values(collection_elements, header_dict, collection_details)

        source = header.find("Source")
        source_elements = [
            "SourceLevel",
            "LEA",
            "SoftwareCode",
            "Release",
            "SerialNo",
            "DateTime",
        ]
        header_dict = get_values(source_elements, header_dict, source)

        return pd.DataFrame([header_dict])

    def create_child(self, person):
        """Record one ``Person`` element and everything nested under it."""
        self.create_person(person)
        self.create_requests(person)

    def create_person(self, child):
        """Add one row to ``persons`` and make this child the current one.

        Increments ``child_id`` and stores "<Forename> <Surname>" in
        ``self.name`` so nested records can be labelled with it.
        """
        # NOTE: unlike the tolerant field reads below, these two lookups raise
        # AttributeError when the element is absent.
        forename = child.find("Forename").text
        surname = child.find("Surname").text
        self.name = f"{forename} {surname}"
        self.child_id += 1

        person_dict = get_values(self.persons.columns, {}, child)
        person_dict["child_id"] = self.child_id

        self._add_rows("persons", [person_dict])

    def create_requests(self, child):
        """Add one ``requests`` row per ``Requests`` element of ``child``.

        Also records the assessments and active plans nested in each request.
        ``requests_id`` restarts at 1 for every child.
        """
        self.requests_id = 0
        elements = self.requests.columns
        requests_list = []

        for request in child.findall("Requests"):
            self.requests_id += 1

            requests_dict = get_values(elements, {}, request)
            requests_dict["child_id"] = self.child_id
            requests_dict["requests_id"] = self.requests_id
            requests_list.append(requests_dict)

            # Nested records read self.requests_id, so they must be created
            # before the counter moves on to the next request.
            self.create_assessments(request)
            self.create_active_plans(request)

        self._add_rows("requests", requests_list)

    def create_assessments(self, request):
        """Add one ``assessments`` row per ``Assessment`` of ``request``.

        Also records the named plan nested in each assessment.
        ``assessment_id`` restarts at 1 for every request.
        """
        assessment_list = []
        elements = self.assessments.columns
        self.assessment_id = 0

        for assessment in request.findall("Assessment"):
            self.assessment_id += 1

            assessment_dict = get_values(elements, {}, assessment)
            assessment_dict["name"] = self.name
            assessment_dict["child_id"] = self.child_id
            assessment_dict["requests_id"] = self.requests_id
            assessment_dict["assessment_id"] = self.assessment_id
            assessment_list.append(assessment_dict)

            # The named plan reads self.assessment_id, so create it before the
            # counter moves on to the next assessment.
            self.create_named_plan(assessment)

        self._add_rows("assessments", assessment_list)

    def create_named_plan(self, assessment):
        """Add one ``named_plan`` row per ``PlanDetail`` of the ``NamedPlan``.

        Plan-level fields are read from the ``NamedPlan`` element and repeated
        on every row; placement fields come from each ``PlanDetail``. Nothing
        is added when the assessment has no ``NamedPlan`` or the plan has no
        ``PlanDetail`` children.
        """
        named_plan_elements = [
            "StartDate",
            "PlanRes",
            "PlanWPB",
            "PB",
            "OA",
            "DP",
            "CeaseDate",
            "CeaseReason",
        ]
        plan_detail_elements = [
            "URN",
            "UKPRN",
            "SENSetting",
            "SENSettingOther",
            "PlacementRank",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
        ]

        named_plan_locs = assessment.find("NamedPlan")
        if named_plan_locs is None:
            return

        # Plan-level values are identical for every detail row: read them once.
        plan_level = get_values(named_plan_elements, {}, named_plan_locs)

        plan_detail_list = []
        for plan_detail in named_plan_locs.findall("PlanDetail"):
            # A fresh dict per PlanDetail. Reusing one dict for all details
            # made every appended row alias the same object, so each row ended
            # up with the values of the last PlanDetail.
            named_plan_dict = get_values(
                plan_detail_elements, dict(plan_level), plan_detail
            )
            named_plan_dict["name"] = self.name
            named_plan_dict["child_id"] = self.child_id
            named_plan_dict["requests_id"] = self.requests_id
            named_plan_dict["assessment_id"] = self.assessment_id
            plan_detail_list.append(named_plan_dict)

        self._add_rows("named_plan", plan_detail_list)

    def create_active_plans(self, request):
        """Add one ``active_plans`` row per ``PlacementDetail`` of ``request``.

        Plan-level fields come from the ``ActivePlans`` element, placement
        fields from each ``PlacementDetail`` and the need fields from the first
        ``SENneed`` element; all are ``pd.NA`` when absent. Nothing is added
        when the request has no ``ActivePlans`` element.
        """
        active_plan_elements = [
            "TransferLA",
            "RES",
            "WPB",
            "ReviewMeeting",
            "ReviewOutcome",
            "LastReview",
        ]
        placement_detail_elements = [
            "URN",
            # The active_plans table declares a UKPRN column and the named
            # plan reader takes UKPRN from its detail element, but it was
            # never read here, leaving the column permanently empty.
            "UKPRN",
            "SENSetting",
            "SENSettingOther",
            "PlacementRank",
            "EntryDate",
            "LeavingDate",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
        ]
        sen_need_elements = ["SENtype", "SENtypeRank"]

        active_plan_locs = request.find("ActivePlans")
        if active_plan_locs is None:
            return

        sen_need_locs = active_plan_locs.find("SENneed")
        # Plan-level values are identical for every placement: read them once.
        plan_level = get_values(active_plan_elements, {}, active_plan_locs)

        active_plans_list = []
        for placement_detail in active_plan_locs.findall("PlacementDetail"):
            active_plans_dict = dict(plan_level)
            active_plans_dict = get_values(
                placement_detail_elements, active_plans_dict, placement_detail
            )
            active_plans_dict = get_values(
                sen_need_elements, active_plans_dict, sen_need_locs
            )
            active_plans_dict["name"] = self.name
            active_plans_dict["child_id"] = self.child_id
            active_plans_dict["requests_id"] = self.requests_id
            active_plans_list.append(active_plans_dict)

        self._add_rows("active_plans", active_plans_list)
307+
308+
@st.cache_data
def convert_data(_root: ET.Element, cache_key=None):
    """Convert a parsed SEN2 XML tree into an ``XMLtoDF`` instance, cached.

    Args:
        _root: Root element of the parsed XML. The leading underscore tells
            Streamlit not to hash this argument.
        cache_key: Any hashable value identifying the source of ``_root``
            (for example the raw bytes of the uploaded file). It is not used
            in the body; it exists only so the cache key changes when the
            input changes. Without it every call shares one cache entry and
            a second, different upload would get the first file's result.

    Returns:
        The ``XMLtoDF`` instance built from ``_root``.
    """
    return XMLtoDF(_root)


###########################
# Main App
###########################

input_file = st.file_uploader('Upload SEN2 XML here')

if input_file is not None:
    # Get time to test ingress speed and caching
    start_time = time.time()
    st.write("Starting data read, for large datasets this could take 5 minutes.")

    tree = ET.parse(input_file)
    root = tree.getroot()
    # Key the cache on the file content: the element tree itself is excluded
    # from hashing, so the bytes are what tells two uploads apart.
    # getvalue() returns the whole buffer regardless of the read position
    # left behind by ET.parse.
    data_files = convert_data(root, cache_key=input_file.getvalue())

    after_ingress_time = time.time()
    total_ingress_time = after_ingress_time - start_time
    st.write(f'Total ingress time: {int(total_ingress_time/60)} minutes.')
333+
334+
# data_files.header

0 commit comments

Comments
 (0)