1+ import pandas as pd
2+ import numpy as np
3+
4+ import streamlit as st
5+
6+ import xml .etree .ElementTree as ET
7+ import time
8+
9+
10+
def get_values(xml_elements, table_dict: dict, xml_block):
    """Copy the text of named child elements of ``xml_block`` into ``table_dict``.

    Args:
        xml_elements: Iterable of child tag names to look up.
        table_dict: Dictionary that is updated in place, one key per tag name.
        xml_block: Parent ``xml.etree.ElementTree.Element`` to search, or
            ``None`` when the parent itself is absent from the document.

    Returns:
        The same ``table_dict`` object. Each requested tag maps to the child's
        ``.text`` (which is ``None`` for an empty element), or to ``pd.NA``
        when the child, or the whole block, is missing.
    """
    for element in xml_elements:
        # Compare with ``is None``: an Element without children is falsy, so a
        # plain truthiness test would treat present leaf elements as missing.
        node = None if xml_block is None else xml_block.find(element)
        table_dict[element] = pd.NA if node is None else node.text
    return table_dict
18+
class XMLtoDF:
    """Flatten an uploaded XML return into one pandas DataFrame per record type.

    Constructing an instance walks ``Header`` and every ``Persons/Person``
    element of ``root`` and fills these instance attributes:

    * ``header``       - one row of collection and source details
    * ``persons``      - one row per ``Person``
    * ``requests``     - one row per ``Requests`` element of a person
    * ``assessments``  - one row per ``Assessment`` of a request
    * ``named_plan``   - one row per ``NamedPlan/PlanDetail`` of an assessment
    * ``active_plans`` - one row per ``ActivePlans/PlacementDetail`` of a request

    Rows are linked through the running counters ``child_id``,
    ``requests_id`` and ``assessment_id`` that are written into each row.
    """

    # The class-level frames below are empty column templates. Instances never
    # mutate them; they concatenate onto them and store the result on ``self``.
    header = pd.DataFrame(columns=["Collection", "Year", "Reference Date"])

    persons = pd.DataFrame(
        columns=[
            "Surname",
            "Forename",
            "PersonBirthDate",
            "Sex",
            "Ethnicity",
            "PostCode",
            "UPN",
            "UniqueLearnerNumber",
            "UPNunknown",
        ]
    )

    requests = pd.DataFrame(
        columns=[
            "ReceivedDate",
            "RYA",
            "RequestOutcomeDate",
            "RequestOutcome",
            "RequestMediation",
            "RequestTribunal",
            "Exported",
        ]
    )

    assessments = pd.DataFrame(
        columns=[
            "AssessmentOutcome",
            "AssessmentOutcomeDate",
            "AssessmentMediation",
            "AssessmentTribunal",
            "OtherMediation",
            "OtherTribunal",
            "Week20",
        ]
    )

    named_plan = pd.DataFrame(
        columns=[
            "StartDate",
            "URN",
            "UKPRN",
            "SENSetting",
            "PlacementRank",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
            "PlanRes",
            "PlanWPB",
            "PB",
            "OA",
            "DP",
            "CeaseDate",
            "CeaseReason",
        ]
    )

    active_plans = pd.DataFrame(
        columns=[
            "TransferLA",
            "URN",
            "UKPRN",
            "SENSetting",
            "SENSettingOther",
            "PlacementRank",
            "EntryDate",
            "LeavingDate",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
            "RES",
            "WPB",
            "SENtype",
            "SENtypeRank",
            "ReviewMeeting",
            "ReviewOutcome",
            "LastReview",
        ]
    )

    # Attribute names of the row tables that are filled while reading persons.
    _TABLES = ("persons", "requests", "assessments", "named_plan", "active_plans")

    def __init__(self, root):
        """Read the whole document below ``root`` into the table attributes.

        Args:
            root: Root ``Element`` of the parsed XML; it is expected to have
                ``Header`` and ``Persons`` children.
        """
        self.child_id = 0
        self.name = None
        # While this is a dict, the create_* methods only buffer their rows.
        # Concatenating onto a growing DataFrame once per child re-copies the
        # whole table every time, which is quadratic in the number of children.
        self._pending = {table: [] for table in self._TABLES}

        self.header = self.create_header(root.find("Header"))

        children = root.find("Persons")
        # Counts every direct child of <Persons>; used only for the progress text.
        self.total_children = len(children)

        for child in children.findall("Person"):
            self.create_child(child)
            if self.child_id % 1000 == 0:
                st.write(f'Read data for {self.child_id} children of {self.total_children} children.')

        self._flush_pending()

        # Keep only plan rows that actually carry a start date.
        self.named_plan = self.named_plan[self.named_plan["StartDate"].notna()].copy()

    def _store_rows(self, table, rows):
        """Add ``rows`` (a list of dicts) to the table attribute named ``table``.

        During ``__init__`` the rows are buffered and materialised once by
        ``_flush_pending``. When no buffer is active (a create_* method called
        directly on an instance) the rows are concatenated immediately.
        """
        pending = getattr(self, "_pending", None)
        if pending is not None:
            pending[table].extend(rows)
            return
        current = getattr(self, table)
        setattr(
            self, table, pd.concat([current, pd.DataFrame(rows)], ignore_index=True)
        )

    def _flush_pending(self):
        """Turn all buffered rows into DataFrames and switch buffering off."""
        pending, self._pending = self._pending, None
        for table, rows in pending.items():
            # Concatenating onto the template keeps its column order first and
            # guarantees the template columns exist even when ``rows`` is empty.
            template = getattr(self, table)
            setattr(
                self,
                table,
                pd.concat([template, pd.DataFrame(rows)], ignore_index=True),
            )

    def create_header(self, header):
        """Return a one-row DataFrame built from the ``Header`` element.

        The row combines the ``CollectionDetails`` fields with the ``Source``
        fields; fields that are absent are stored as ``pd.NA``.
        """
        header_dict = {}
        collection_elements = ["Collection", "Year", "ReferenceDate"]
        header_dict = get_values(
            collection_elements, header_dict, header.find("CollectionDetails")
        )

        source_elements = [
            "SourceLevel",
            "LEA",
            "SoftwareCode",
            "Release",
            "SerialNo",
            "DateTime",
        ]
        header_dict = get_values(source_elements, header_dict, header.find("Source"))

        return pd.DataFrame([header_dict])

    def create_child(self, person):
        """Record one ``Person`` element and everything nested below it."""
        self.create_person(person)
        self.create_requests(person)

    def create_person(self, child):
        """Add the person-level row for ``child`` and advance ``child_id``.

        Also stores the display name in ``self.name`` for the rows created
        from this person's nested records.
        """
        forename = child.find("Forename").text
        surname = child.find("Surname").text
        self.name = f"{forename} {surname} "
        self.child_id += 1

        person_dict = get_values(type(self).persons.columns, {}, child)
        person_dict["child_id"] = self.child_id

        self._store_rows("persons", [person_dict])

    def create_requests(self, child):
        """Add one row per ``Requests`` element of ``child``.

        ``requests_id`` restarts at 1 for every child. Each request's
        assessments and active plans are recorded as a side effect.
        """
        self.requests_id = 0
        fields = type(self).requests.columns
        request_rows = []

        for request in child.findall("Requests"):
            self.requests_id += 1

            request_row = get_values(fields, {}, request)
            request_row["child_id"] = self.child_id
            request_row["requests_id"] = self.requests_id
            request_rows.append(request_row)

            self.create_assessments(request)
            self.create_active_plans(request)

        self._store_rows("requests", request_rows)

    def create_assessments(self, request):
        """Add one row per ``Assessment`` element of ``request``.

        ``assessment_id`` restarts at 1 for every request. Each assessment's
        named plan is recorded as a side effect.
        """
        self.assessment_id = 0
        fields = type(self).assessments.columns
        assessment_rows = []

        for assessment in request.findall("Assessment"):
            self.assessment_id += 1

            assessment_row = get_values(fields, {}, assessment)
            assessment_row["name"] = self.name
            assessment_row["child_id"] = self.child_id
            assessment_row["requests_id"] = self.requests_id
            assessment_row["assessment_id"] = self.assessment_id
            assessment_rows.append(assessment_row)

            self.create_named_plan(assessment)

        self._store_rows("assessments", assessment_rows)

    def create_named_plan(self, assessment):
        """Add one row per ``PlanDetail`` of the assessment's ``NamedPlan``.

        Plan-level fields are repeated on every detail row. An assessment
        without a ``NamedPlan``, or a plan without any ``PlanDetail``,
        contributes no rows.
        """
        named_plan_elements = [
            "StartDate",
            "PlanRes",
            "PlanWPB",
            "PB",
            "OA",
            "DP",
            "CeaseDate",
            "CeaseReason",
        ]
        plan_detail_elements = [
            "URN",
            "UKPRN",
            "SENSetting",
            "SENSettingOther",
            "PlacementRank",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
        ]

        plan_rows = []
        named_plan_locs = assessment.find("NamedPlan")

        if named_plan_locs is not None:
            # Plan-level values are identical for every detail; read them once.
            plan_values = get_values(named_plan_elements, {}, named_plan_locs)

            for plan_detail in named_plan_locs.findall("PlanDetail"):
                # A fresh dict per detail: reusing one dict object would make
                # every appended row alias the values of the last detail.
                plan_row = get_values(
                    plan_detail_elements, dict(plan_values), plan_detail
                )
                plan_row["name"] = self.name
                plan_row["child_id"] = self.child_id
                plan_row["requests_id"] = self.requests_id
                plan_row["assessment_id"] = self.assessment_id
                plan_rows.append(plan_row)

        self._store_rows("named_plan", plan_rows)

    def create_active_plans(self, request):
        """Add one row per ``PlacementDetail`` of the request's ``ActivePlans``.

        Plan-level fields and the fields of the first ``SENneed`` element are
        repeated on every placement row. A request without ``ActivePlans``
        contributes no rows.
        """
        active_plan_elements = [
            "TransferLA",
            "RES",
            "WPB",
            "ReviewMeeting",
            "ReviewOutcome",
            "LastReview",
        ]
        # "UKPRN" is declared in the active_plans columns, so it is read here
        # too. NOTE(review): assumes PlacementDetail carries UKPRN in the
        # schema, as PlanDetail does - confirm.
        placement_detail_elements = [
            "URN",
            "UKPRN",
            "SENSetting",
            "SENSettingOther",
            "PlacementRank",
            "EntryDate",
            "LeavingDate",
            "SENunitIndicator",
            "ResourcedProvisionIndicator",
        ]
        sen_need_elements = ["SENtype", "SENtypeRank"]

        active_plan_rows = []
        active_plan_locs = request.find("ActivePlans")

        if active_plan_locs is not None:
            # Both lookups are the same for every placement; read them once.
            plan_values = get_values(active_plan_elements, {}, active_plan_locs)
            # Only the first SENneed element is used; a missing one gives pd.NA.
            sen_values = get_values(
                sen_need_elements, {}, active_plan_locs.find("SENneed")
            )

            for placement_detail in active_plan_locs.findall("PlacementDetail"):
                placement_row = get_values(
                    placement_detail_elements, dict(plan_values), placement_detail
                )
                placement_row.update(sen_values)
                placement_row["name"] = self.name
                placement_row["child_id"] = self.child_id
                placement_row["requests_id"] = self.requests_id
                active_plan_rows.append(placement_row)

        self._store_rows("active_plans", active_plan_rows)
307+
@st.cache_data
def convert_data(_root: ET.Element, cache_key=None):
    """Build the XMLtoDF tables for a parsed upload, cached by Streamlit.

    Args:
        _root: Root element of the parsed XML. The leading underscore tells
            ``st.cache_data`` not to hash it.
        cache_key: Any value Streamlit can hash that identifies the upload.
            Because ``_root`` is excluded from hashing, this is the only thing
            that distinguishes one upload from another in the cache. Without
            it, every call returns the result cached for the first file.

    Returns:
        The populated ``XMLtoDF`` instance.
    """
    return XMLtoDF(_root)


###########################
# Main App
###########################

input_file = st.file_uploader('Upload SEN2 XML here')

if input_file is not None:
    # Get time to test ingress speed and caching
    start_time = time.time()
    st.write("Starting data read, for large datasets this could take 5 minutes.")

    root = ET.parse(input_file).getroot()
    # Key the cache on the uploaded bytes so a different file is re-read
    # instead of being served the previous file's cached tables.
    data_files = convert_data(root, cache_key=input_file.getvalue())

    total_ingress_time = time.time() - start_time
    st.write(f'Total ingress time: {int(total_ingress_time / 60)} minutes.')

    # data_files.header
0 commit comments