Skip to content

Commit 8c668ca

Browse files
committed
Add post-processing to check for excerpt IDs in the summary
Regenerate the summary if any excerpt ID appears in the summary content
1 parent 33c0ca0 commit 8c668ca

File tree

2 files changed

+65
-51
lines changed

2 files changed

+65
-51
lines changed

api/serializers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1561,6 +1561,7 @@ class Meta:
15611561
"id",
15621562
"code",
15631563
"event",
1564+
"start_date",
15641565
)
15651566

15661567

per/ops_learning_summary.py

Lines changed: 64 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -57,25 +57,25 @@ class OpsLearningSummaryTask:
5757
MIN_DIF_EXCERPTS = 3
5858

5959
primary_prompt = (
60-
"Please aggregate and summarize the provided data into UP TO THREE structured paragraphs. "
61-
"The output MUST strictly adhere to the format below: "
62-
"- *Title*: Each finding should begin with the main finding TITLE in bold. "
60+
"Please aggregate and summarize the provided data into UP TO THREE structured paragraphs.\n"
61+
"The output MUST strictly adhere to the format below:\n"
62+
"- *Title*: Each finding should begin with the main finding TITLE in bold.\n"
6363
"Should be a high level summary of the finding below. "
64-
"The length of the title MUST be between 20 and 30 characters."
65-
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary. "
64+
"The length of the title MUST be between 20 and 30 characters.\n"
65+
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary.\n"
6666
"- Content: Aggregate findings so that they are supported by evidence from more than one report. "
6767
"Always integrate evidence from multiple reports or items into the paragraph, and "
68-
"include the year and country of the evidence."
68+
"include the year and country of the evidence.\n"
6969
"- *Confidence Level*: Based on the number of excerpts connected to the finding, "
7070
"assign a score from 1 to 5 where 1 is the lowest and 5 is the highest, e.g. 4/5"
71-
"At the end of the summary, please highlight any contradictory country reports. "
72-
"Important:"
73-
"-- DO NOT mention the excerpts id in the content of the summary."
74-
"-- DO NOT mention the confidence level in the content of the summary."
75-
"-- DO NOT use data from any source other than the one provided."
76-
"Output Format:"
77-
"Provide your answer in valid JSON form. Reply with only the answer in valid JSON form and include no other commentary. "
78-
"Example: "
71+
"At the end of the summary, please highlight any contradictory country reports.\n"
72+
"Important:\n\n"
73+
"-- DO NOT mention the excerpts id in the content of the summary.\n"
74+
"-- DO NOT mention the confidence level in the content of the summary.\n"
75+
"-- DO NOT use data from any source other than the one provided.\n\n"
76+
"Output Format:\n"
77+
"Provide your answer in valid JSON form. Reply with only the answer in valid JSON form and include no other commentary.\n"
78+
"Example:\n"
7979
'{"0": {"title": "Flexible and Adaptive Response Planning", "excerpts id":"123, 45" '
8080
'"content": "Responses in Honduras, Peru, Ecuador, and Panama highlight the importance of adaptable strategies. '
8181
"The shift from youth-focused MHPSS to inclusive care in Peru in 2021, the pivot from sanitation infrastructure "
@@ -88,22 +88,22 @@ class OpsLearningSummaryTask:
8888
)
8989

9090
secondary_prompt = (
91-
"Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). "
92-
"The output SHOULD ALWAYS follow the format below: "
93-
"- *Type*: Whether the paragraph is related to a 'sector' or a 'component' "
94-
"- *Subtype*: Provides the name of the sector or of the component to which the paragraph refers."
95-
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary."
91+
"Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
92+
"The output SHOULD ALWAYS follow the format below:\n"
93+
"- *Type*: Whether the paragraph is related to a 'sector' or a 'component'\n"
94+
"- *Subtype*: Provides the name of the sector or of the component to which the paragraph refers.\n"
95+
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary.\n"
9696
"*Content*: A short summary aggregating findings related to the Subtype, "
9797
"so that they are supported by evidence coming from more than one report, "
9898
"and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports "
9999
"it from the data available from multiples reports or items, include year and country of the evidence. "
100-
"The length of each paragraph MUST be between 20 and 30 words."
101-
" Important:"
102-
"- ONLY create one summary per subtype"
103-
"- DO NOT mention the ids of the excerpts in the content of the summary."
104-
"- DO NOT use data from any source other than the one provided. "
105-
"Output Format:"
106-
"Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY."
100+
"The length of each paragraph MUST be between 20 and 30 words.\n"
101+
" Important:\n\n"
102+
"- ONLY create one summary per subtype\n"
103+
"- DO NOT mention the ids of the excerpts in the content of the summary.\n"
104+
"- DO NOT use data from any source other than the one provided.\n\n"
105+
"Output Format:\n"
106+
"Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY.\n"
107107
'{"0": {"type": "sector", "subtype": "shelter", "excerpts id":"43, 1375, 14543", "content": "lorem ipsum"}, '
108108
'"1": {"type": "component", "subtype": "Information Management", "excerpts id":"23, 235", "content": "lorem ipsum"}, '
109109
'"2": {"type": "sector", "subtype": "WASH", "excerpts id":"30, 40", "content": "lorem ipsum"}}'
@@ -120,21 +120,21 @@ class OpsLearningSummaryTask:
120120
)
121121

122122
primary_instruction_prompt = (
123-
"You should:"
124-
"1. Describe, Summarize and Compare: Identify and detail the who, what, where and when"
125-
"2. Explain and Connect: Analyze why events happened and how they are related"
126-
"3. Identify gaps: Assess what data is available, what is missing and potential biases"
127-
"4. Identify key messages: Determine important stories and signals hidden in the data"
128-
"5. Select top three: Select up to three findings to report"
123+
"You should:\n"
124+
"1. Describe, Summarize and Compare: Identify and detail the who, what, where and when "
125+
"2. Explain and Connect: Analyze why events happened and how they are related "
126+
"3. Identify gaps: Assess what data is available, what is missing and potential biases "
127+
"4. Identify key messages: Determine important stories and signals hidden in the data "
128+
"5. Select top three: Select up to three findings to report "
129129
)
130130

131131
secondary_instruction_prompt = (
132-
"You should for each section in the data (TYPE & SUBTYPE combination):"
133-
"1. Describe, Summarize and Compare: Identify and detail the who, what, where and when"
134-
"2. Explain and Connect: Analyze why events happened and how they are related"
135-
"3. Identify gaps: Assess what data is available, what is missing and potential biases"
136-
"4. Identify key messages: Determine if there are important stories and signals hidden in the data"
137-
"5. Conclude and make your case"
132+
"You should for each section in the data (TYPE & SUBTYPE combination):\n"
133+
"1. Describe, Summarize and Compare: Identify and detail the who, what, where and when "
134+
"2. Explain and Connect: Analyze why events happened and how they are related "
135+
"3. Identify gaps: Assess what data is available, what is missing and potential biases "
136+
"4. Identify key messages: Determine if there are important stories and signals hidden in the data "
137+
"5. Conclude and make your case "
138138
)
139139

140140
@staticmethod
@@ -552,7 +552,7 @@ def _build_intro_section(cls):
552552
return (
553553
"I will provide you with a set of instructions, data, and formatting requests in three sections."
554554
+ " I will pass you the INSTRUCTIONS section, are you ready?"
555-
+ "\n\n\n\n"
555+
+ "\n\n"
556556
)
557557

558558
@classmethod
@@ -585,9 +585,9 @@ def _build_instruction_section(cls, request_filter: dict, df: pd.DataFrame, inst
585585
component_str = '", "'.join(components)
586586
instructions.append(f'and "{component_str}" aspects')
587587

588-
instructions.append("in Emergency Response.")
588+
instructions.append("in Emergency Response. ")
589589
instructions.append("\n\n" + instruction)
590-
instructions.append("\n\nI will pass you the DATA section, are you ready?\n\n\n")
590+
instructions.append("\n\nI will pass you the DATA section, are you ready?\n\n")
591591
return "\n".join(instructions)
592592

593593
@classmethod
@@ -814,13 +814,30 @@ def _modify_summary(summary: dict) -> dict:
814814
Checks if the "Confidence level" is present in the primary response and skipping for the secondary summary
815815
"""
816816
for key, value in summary.items():
817-
confidence_level = "confidence level"
818-
if key == "contradictory reports" or confidence_level in value:
817+
if key == "contradictory reports":
819818
continue
820-
if confidence_level in value["content"].lower():
821-
parts = re.split(rf"(?i)\b{confidence_level}\b", value["content"])
819+
820+
content = value.get("content", "")
821+
excerpt_ids = value.get("excerpts id", "")
822+
excerpt_id_list = (
823+
list(set(excerpt_ids))
824+
if isinstance(excerpt_ids, list)
825+
else list(set(int(id.strip()) for id in excerpt_ids.split(",") if excerpt_ids and excerpt_ids != ""))
826+
)
827+
828+
# Check if any excerpt id is present in the content and regenerate the summary if found
829+
if any(re.search(rf"\b{id}\b", content) for id in excerpt_id_list):
830+
return cls.generate_summary(prompt, type)
831+
832+
value["content"] = content
833+
value["excerpts id"] = excerpt_id_list
834+
835+
# Extract and remove if `confidence level` exists in the content
836+
confidence_level = "confidence level"
837+
if confidence_level not in value and confidence_level in content.lower():
838+
parts = re.split(rf"(?i)\b{confidence_level}\b", content, maxsplit=1)
822839
value["content"] = parts[0].strip() + "."
823-
value["confidence level"] = parts[1][1:].strip()
840+
value["confidence level"] = parts[1].strip()
824841

825842
return summary
826843

@@ -906,11 +923,7 @@ def secondary_response_save_to_db(
906923
type = value["type"].strip()
907924
subtype = value["subtype"].strip()
908925
content = value["content"].strip()
909-
excerpt_ids = value["excerpts id"]
910-
if isinstance(excerpt_ids, list):
911-
excerpt_id_list = list(set(excerpt_ids if excerpt_ids else []))
912-
else:
913-
excerpt_id_list = list(set(int(id.strip()) for id in excerpt_ids.split(",") if excerpt_ids and excerpt_ids != ""))
926+
excerpt_id_list = value["excerpts id"]
914927

915928
if type == "component" and len(excerpt_id_list) > 0:
916929
cls.add_used_ops_learnings_component(

0 commit comments

Comments
 (0)