#77 Top_k feature Added (#115)

sidhu66 · web-flow · commit 9fa0d61227e1 · 2025-07-27T13:57:55.000-04:00
Co-authored-by: Karanveer Singh Sidhu <>
diff --git a/src/config.yaml b/src/config.yaml
@@ -39,5 +39,4 @@ llmargs:
   temperature: 0.5
   max_tokens: 1024
   output: ../output/AspectAdded/semeval-agg/aspectAdded.pkl
-
-
+  top_k_aspects: 1
diff --git a/src/llm/aspect_extraction_pipeline.py b/src/llm/aspect_extraction_pipeline.py
@@ -17,7 +17,7 @@ class LLMReviewProcessor:
     def __init__(self, cfg: DictConfig):
         self.cfg = cfg
         self.llm_handler = self._init_llm_handler()
-        self.prompt_builder = PromptBuilder()
+        self.prompt_builder = PromptBuilder(top_k=cfg.llmargs.top_k_aspects)
 
     def _init_llm_handler(self):
         config = LLMconfig(
@@ -39,7 +39,6 @@ def find_aspect_indices(aspect: str, sentence_tokens) :
         for i in range(len(tokens) - len(aspect_tokens) + 1):
             if tokens[i:i + len(aspect_tokens)] == aspect_tokens: return list(range(i, i + len(aspect_tokens)))
 
-
         return -1  
     
     def process_reviews(self, reviews: list):
@@ -49,13 +48,32 @@ def process_reviews(self, reviews: list):
             if sample_review.get('implicit', [False])[0] is not True: continue
 
             prompt = self.prompt_builder.build_prompt(sample_review)
-            response = self.llm_handler.get_response(prompt)
-            matches = re.findall(r'\{.*?\}', response, re.DOTALL)
+            
+            max_retries = 5
+            valid_json_found = False
+            matches = []
+            
+            for attempt in range(max_retries):
+                response = self.llm_handler.get_response(prompt)
+                matches = re.findall(r'\{.*?\}', response, re.DOTALL)
+
+                for json_str in matches:
+                    try:
+                        aspect_data = json.loads(json_str)
+                        if "aspect" in aspect_data and aspect_data["aspect"]:
+                            valid_json_found = True
+                            break
+                    except json.JSONDecodeError:
+                        continue
+
+                if valid_json_found:
+                    break
+                else:
+                    print(f"Invalid or no valid JSON with 'aspect' found. Attempt {attempt + 1} of {max_retries}")
 
             if not matches: 
                 print("No JSON object found in response") 
                 continue
-
             all_aspects = []
             seen_aspects = set()
             tokens = [word.strip().lower() for sentences in sample_review["sentences"] for word in sentences]
diff --git a/src/llm/prompt_builder.py b/src/llm/prompt_builder.py
@@ -1,16 +1,18 @@
 
 class PromptBuilder:
-    def __init__(self, task_description=None):
+    def __init__(self, task_description=None, top_k=1):
         self.task_description = task_description or (
             "Identify the latent aspect targeted by the sentiment in the review. "
             "If the aspect is explicitly mentioned, return its index; if it's implicit, return the inferred aspect and use index -1."
         )
+        self.top_k = top_k
 
     def build_prompt(self, review_entry: dict) -> str:
         review_text = ' '.join(review_entry['sentences'][0])
         prompt = (
             f"Review: \"{review_text}\"\n"
             f"Task: {self.task_description}\n"
+            f"Return exactly the top {self.top_k} aspect(s) that best represent the sentiment in this review.\n"
             f"Output Format: {{\"aspect\": \"<aspect_name>\", \"index\": <index_list or -1>}}"
         )