refactor: pass the parsed speech list directly to _speeches_to_json and derive a null-safe author_id/speaker_id from the speaker field

shenghann · shenghann · commit 604d74792423 · 2025-03-14T19:04:56.000+08:00
diff --git a/src/api/views.py b/src/api/views.py
@@ -90,13 +90,14 @@ def _add_to_result(levels, data, result):
     current_level.append(data)
 
 
-def _speeches_to_json(speech_data):
-    df_sitting = pd.DataFrame(json.loads(speech_data))
+def _speeches_to_json(speech_data: list[dict]):
+    # df_sitting = pd.DataFrame(json.loads(speech_data))
     result = []
-    for index, row in df_sitting.iterrows():
+    for row in speech_data:
         speech_dict = {
             "speech": row["proc_speech"],
-            "author": row["author"],
+            "author": row["author"] if pd.notna(row["author"]) else None,
+            "author_id": int(row["speaker"]) if pd.notna(row["speaker"]) else None,
             "timestamp": row["timestamp"],
             "is_annotation": row["is_annotation"],
             "index": row["index"],
@@ -168,11 +169,9 @@ def post(self, request):
                 speech_data = request.data.get("speech_data")
                 speeches_list = json.loads(speech_data)
 
-                # speech dataframe
-                df_speech = pd.DataFrame(speeches_list)
-                df_speech = df_speech[df_speech.is_annotation == False]
-
-                serializer.validated_data["speech_data"] = speech_data
+                # Convert and serialize speeches to nested JSON format for frontend catalogue
+                speech_data_json = _speeches_to_json(speeches_list)
+                serializer.validated_data["speech_data"] = speech_data_json
 
                 # Get parliamentary cycle ID given house_types, and date that falls between start_date and end_date
                 cycle = ParliamentaryCycle.objects.get(
@@ -193,22 +192,29 @@ def post(self, request):
 
                 # Then create speech records if speech_data is provided
                 try:
-                    # Prepare speech records
+                    # Prepare speech records for speech searching
+                    df_speech = pd.DataFrame(speeches_list)
+                    df_speech = df_speech[df_speech.is_annotation == False]
+
                     speech_records = []
                     for idx, speech in df_speech.iterrows():
                         speech_records.append(
                             {
                                 "sitting_id": sitting.sitting_id,
-                                "index": speech["index"],
-                                "speaker_id": speech["author_id"],
+                                "index": int(speech["index"]),
+                                "speaker_id": (
+                                    int(speech["speaker"])
+                                    if pd.notna(speech["speaker"])
+                                    else None
+                                ),
                                 "timestamp": speech["timestamp"],
                                 "speech": speech["speech"],
                                 "speech_tokens": speech["speech_tokens"],
-                                "length": speech["length"],
+                                "length": int(speech["length"]),
                                 "level_1": speech["level_1"],
                                 "level_2": speech["level_2"],
                                 "level_3": speech["level_3"],
-                                "is_annotation": speech["is_annotation"],
+                                "is_annotation": bool(speech["is_annotation"]),
                             }
                         )