@@ -90,13 +90,14 @@ def _add_to_result(levels, data, result):
9090 current_level .append (data )
9191
9292
93- def _speeches_to_json (speech_data ):
94- df_sitting = pd .DataFrame (json .loads (speech_data ))
93+ def _speeches_to_json (speech_data : list [ dict ] ):
94+ # df_sitting = pd.DataFrame(json.loads(speech_data))
9595 result = []
96- for index , row in df_sitting . iterrows () :
96+ for row in speech_data :
9797 speech_dict = {
9898 "speech" : row ["proc_speech" ],
99- "author" : row ["author" ],
99+ "author" : row ["author" ] if pd .notna (row ["author" ]) else None ,
100+ "author_id" : int (row ["speaker" ]) if pd .notna (row ["speaker" ]) else None ,
100101 "timestamp" : row ["timestamp" ],
101102 "is_annotation" : row ["is_annotation" ],
102103 "index" : row ["index" ],
@@ -168,11 +169,9 @@ def post(self, request):
168169 speech_data = request .data .get ("speech_data" )
169170 speeches_list = json .loads (speech_data )
170171
171- # speech dataframe
172- df_speech = pd .DataFrame (speeches_list )
173- df_speech = df_speech [df_speech .is_annotation == False ]
174-
175- serializer .validated_data ["speech_data" ] = speech_data
172+ # Convert and serialize speeches to nested JSON format for frontend catalogue
173+ speech_data_json = _speeches_to_json (speeches_list )
174+ serializer .validated_data ["speech_data" ] = speech_data_json
176175
177176 # Get parliamentary cycle ID given house_types, and date that falls between start_date and end_date
178177 cycle = ParliamentaryCycle .objects .get (
@@ -193,22 +192,29 @@ def post(self, request):
193192
194193 # Then create speech records if speech_data is provided
195194 try :
196- # Prepare speech records
195+ # Prepare speech records for speech searching
196+ df_speech = pd .DataFrame (speeches_list )
197+ df_speech = df_speech [df_speech .is_annotation == False ]
198+
197199 speech_records = []
198200 for idx , speech in df_speech .iterrows ():
199201 speech_records .append (
200202 {
201203 "sitting_id" : sitting .sitting_id ,
202- "index" : speech ["index" ],
203- "speaker_id" : speech ["author_id" ],
204+ "index" : int (speech ["index" ]),
205+ "speaker_id" : (
206+ int (speech ["speaker" ])
207+ if pd .notna (speech ["speaker" ])
208+ else None
209+ ),
204210 "timestamp" : speech ["timestamp" ],
205211 "speech" : speech ["speech" ],
206212 "speech_tokens" : speech ["speech_tokens" ],
207- "length" : speech ["length" ],
213+ "length" : int ( speech ["length" ]) ,
208214 "level_1" : speech ["level_1" ],
209215 "level_2" : speech ["level_2" ],
210216 "level_3" : speech ["level_3" ],
211- "is_annotation" : speech ["is_annotation" ],
217+ "is_annotation" : bool ( speech ["is_annotation" ]) ,
212218 }
213219 )
214220
0 commit comments