Skip to content

Commit 093992e

Browse files
feat[extractor]: completed extractor testing
1 parent 2936b2f commit 093992e

File tree

2 files changed

+217
-196
lines changed

2 files changed

+217
-196
lines changed

sandbox/extractor.py

Lines changed: 156 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -1,220 +1,180 @@
1-
import requests
1+
# file: extract_actions_memories.py
2+
23
import os
3-
from dotenv import load_dotenv
44
import json
5-
from typing import Dict, List, Optional, Type, Any
5+
from typing import List, Dict
6+
from dotenv import load_dotenv
7+
import requests
68

7-
# Load environment variables at the start
89
load_dotenv()
910

10-
# --- Base Runnable Class ---
11-
class BaseRunnable:
12-
def __init__(
13-
self,
14-
model_url: Optional[str], # URL might be different for different providers
15-
model_name: str,
16-
system_prompt_template: str,
17-
user_prompt_template: str,
18-
input_variables: List[str],
19-
required_format: Optional[Dict],
20-
response_type: str = "chat",
21-
stream: bool = False,
22-
stateful: bool = False,
23-
max_tokens: Optional[int] = None, # Added max_tokens
24-
temperature: float = 0.7 # Added temperature
25-
):
26-
self.model_url = model_url # Kept for Ollama, not strictly needed for OpenRouter (hardcoded URL)
27-
self.model_name = model_name
28-
self.system_prompt_template = system_prompt_template # Use template for formatting later
29-
self.user_prompt_template = user_prompt_template # Use template for formatting later
30-
self.input_variables = input_variables
31-
self.required_format = required_format
32-
self.response_type = response_type
33-
self.stream = stream
34-
self.stateful = stateful
35-
self.history: List[Dict[str, str]] = [] # Type hint for history
36-
self.max_tokens = max_tokens
37-
self.temperature = temperature
38-
39-
def add_to_history(self, chat_history: List[Dict[str, str]]) -> None:
40-
self.history.extend(chat_history)
41-
42-
def invoke(self, inputs: Dict[str, str]) -> Dict[str, Any]: # Return type hint
43-
raise NotImplementedError("Subclasses must implement invoke method")
44-
45-
# --- Ollama Runnable (Kept for comparison/alternative) ---
46-
class OllamaRunnable(BaseRunnable):
47-
def invoke(self, inputs: Dict[str, str]) -> Dict[str, Any]:
48-
# Format prompts with input variables
49-
system_prompt = self.system_prompt_template.format(**inputs)
50-
user_prompt = self.user_prompt_template.format(**inputs)
51-
52-
# Ollama typically uses a single prompt string or messages list depending on endpoint/version
53-
# This example uses a single prompt string for simplicity with generate endpoint
54-
full_prompt = f"{system_prompt}\n\n{user_prompt}"
55-
56-
# Prepare payload for Ollama API
57-
payload: Dict[str, Any] = {
58-
"model": self.model_name,
59-
"prompt": full_prompt,
60-
"stream": self.stream,
61-
"options": { # Ollama options object for temperature, max_tokens etc.
62-
"temperature": self.temperature,
63-
"num_predict": self.max_tokens, # Ollama uses num_predict for max_tokens
64-
}
65-
}
11+
# -------- Step 1: Load Raw Data from the user_data Module -------- #
12+
def load_raw_data() -> List[str]:
    """Return the raw input records exposed by the ``user_data`` module.

    The module is imported inside the function, so it is only loaded when
    this function is called, not when this file is imported.

    Returns:
        The ``raw_data_list`` attribute of ``user_data``.
    """
    import user_data as data_module

    records = data_module.raw_data_list
    return records
6615

67-
# Add format if requested (Ollama requires "json" string, not required_format dict)
68-
if self.response_type == "json":
69-
payload["format"] = "json"
16+
# -------- Step 2: Define Prompt Templates -------- #
17+
system_prompt_template = """
18+
You are an expert assistant specialized in extracting structured information from various types of user communication data such as emails, messages, queries, and notifications.
7019
71-
# Make API call to Ollama
72-
try:
73-
# Ensure model_url is set for Ollama
74-
ollama_url = self.model_url or "http://localhost:11434"
75-
response = requests.post(f"{ollama_url}/api/generate", json=payload)
76-
response.raise_for_status()
77-
78-
result = response.json()
79-
80-
if self.response_type == "json":
81-
# Ollama's /api/generate with format="json" still returns JSON wrapper
82-
response_content = result.get("response", "{}")
83-
try:
84-
# Need to parse the string inside the "response" key
85-
return json.loads(response_content)
86-
except json.JSONDecodeError:
87-
print(f"Warning: Ollama returned non-JSON response despite format='json'. Content: {response_content}")
88-
return {"error": "Model did not return valid JSON."}
89-
else:
90-
# For chat response, just return the 'response' key content
91-
return {"response": result.get("response", "")}
92-
93-
except requests.RequestException as e:
94-
return {"error": f"Failed to invoke Ollama model {self.model_name}: {str(e)}"}
95-
except Exception as e:
96-
return {"error": f"An unexpected error occurred with Ollama: {str(e)}"}
20+
Your goal is to analyze the provided raw user data carefully and extract two specific categories of information:
21+
22+
1. Action Items:
23+
- Clear, actionable tasks or requests the user needs to complete.
24+
- Often involve interactions with tools like calendars, emails, reminders, documents, or appointments.
25+
- Provide concise, unambiguous task descriptions.
9726
27+
2. Memory Items:
28+
- Personal notes, facts, or context about the user's life: health, family, career, events, preferences.
29+
- Informational and non-actionable, useful for future reference.
9830
99-
# --- OpenRouter Runnable ---
100-
class OpenRouterRunnable(BaseRunnable):
101-
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
31+
Important Instructions:
32+
- Only extract information explicitly or strongly implied by the data.
33+
- Do NOT add assumptions or unrelated details.
34+
- Format the output strictly as a JSON object matching the required schema.
35+
- If no relevant items exist in a category, return an empty list for that category.
36+
- The input data may be formal or informal, questions, notifications, or any mix, so interpret accordingly.
10237
103-
def __init__(self, api_key: str, **kwargs):
104-
super().__init__(**kwargs)
105-
self.api_key = api_key
38+
Examples:
10639
107-
# Ensure response_type is 'chat' for OpenRouter chat completions endpoint
108-
if self.response_type not in ["chat", "json"]:
109-
print(f"Warning: OpenRouter chat completion supports 'chat' or 'json' response_type, not '{self.response_type}'. Using 'chat'.")
110-
self.response_type = "chat" # Default to chat if something else was passed
40+
Example 1:
41+
Input:
42+
\"\"\"
43+
Subject: Invoice #INV-2023-01 Received
44+
To whom it may concern,
11145
112-
# OpenRouter handles JSON formatting if the prompt guides the model to produce it
113-
# We don't set a specific 'format' parameter in the OpenRouter payload like Ollama
114-
# The 'required_format' is used as instruction in the system/user prompt
46+
We have received your invoice #INV-2023-01 dated October 26, 2023. Payment processing is underway and is expected within 10 business days.
11547
116-
def invoke(self, inputs: Dict[str, str]) -> Dict[str, Any]:
117-
if not self.api_key:
118-
return {"error": "OpenRouter API key is not configured."}
48+
Regards,
49+
Accounts Payable Department
50+
\"\"\"
51+
52+
Output:
53+
{
54+
"action_items": [
55+
"Track payment processing for invoice #INV-2023-01"
56+
],
57+
"memory_items": [
58+
"Invoice #INV-2023-01 was received on October 26, 2023",
59+
"Payment expected within 10 business days"
60+
]
61+
}
62+
63+
Example 2:
64+
Input:
65+
\"\"\"
66+
Hi Sarah,
67+
68+
Are you doing anything fun this weekend? Thought we might grab pizza if you're free.
69+
70+
Best,
71+
Mike
72+
\"\"\"
73+
74+
Output:
75+
{
76+
"action_items": [
77+
"Ask Sarah if she is free to grab pizza this weekend"
78+
],
79+
"memory_items": [
80+
"User wants to make weekend plans with Sarah"
81+
]
82+
}
11983
120-
# Format prompts with input variables
121-
system_prompt = self.system_prompt_template.format(**inputs)
122-
user_prompt = self.user_prompt_template.format(**inputs)
84+
Your structured output must contain exactly two fields:
85+
- "action_items": an array of actionable task strings.
86+
- "memory_items": an array of personal or contextual information strings.
12387
124-
# OpenRouter uses the standard OpenAI chat messages format
88+
Focus on precision, clarity, and relevance in your extraction.
89+
"""
90+
91+
user_prompt_template = """
92+
Given the following raw user data:
93+
{raw_data}
94+
95+
Analyze it thoroughly and extract all relevant action items and memory items as per the instructions.
96+
97+
Return the output ONLY as a JSON object following this structure:
98+
{{
99+
"action_items": [ ... ],
100+
"memory_items": [ ... ]
101+
}}
102+
"""
103+
104+
105+
# JSON Schema (kept as text) that is appended to the user prompt to describe
# the expected output object. It must itself be valid JSON: the previous
# version was missing the closing braces of "properties" and of the root
# object, so the model was shown a truncated schema.
required_format = """
{
  "type": "object",
  "properties": {
    "action_items": {
      "type": "array",
      "items": {
        "type": "string",
        "description": "A task or action the user needs to complete"
      },
      "description": "List of tasks or actions the user needs to take"
    },
    "memory_items": {
      "type": "array",
      "items": {
        "type": "string",
        "description": "A personal note or memory"
      },
      "description": "List of personal notes or memories"
    }
  }
}
"""
126+
127+
# -------- Step 3: Create a Runnable Class for OpenRouter -------- #
128+
class OpenRouterRunnable:
    """Small client that sends the extraction prompts to a chat-completions endpoint.

    The API key is read from the ``OPENROUTER_API_KEY`` environment variable
    once, at construction time.
    NOTE: if the variable is unset the header becomes ``"Bearer None"``; the
    request is still sent and any resulting HTTP error surfaces from
    ``invoke``.
    """

    def __init__(self, model_url: str, model_name: str, timeout: float = 60.0):
        """Store endpoint, model and request settings.

        Args:
            model_url: Full URL of the chat-completions endpoint.
            model_name: Model identifier placed in the request payload.
            timeout: Seconds to wait for the HTTP request before giving up.
                Without a timeout ``requests`` would wait indefinitely.
        """
        self.api_url = model_url
        self.model = model_name
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}",
            "Content-Type": "application/json",
        }

    @staticmethod
    def _empty_result() -> Dict[str, List[str]]:
        """Return a fresh result with both categories empty."""
        return {"action_items": [], "memory_items": []}

    @staticmethod
    def _parse_content(content: str) -> Dict[str, List[str]]:
        """Turn the model's message text into the result dictionary.

        The text is first parsed as-is. If that fails, the span from the
        first ``{`` to the last ``}`` is parsed instead, which tolerates
        Markdown code fences or prose around the JSON object.

        Args:
            content: Message content returned by the model.

        Returns:
            The parsed object; a missing ``action_items`` or ``memory_items``
            key is filled in with an empty list.

        Raises:
            ValueError: If no JSON object can be parsed from ``content``
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
            TypeError: If ``content`` is not text.
        """
        try:
            parsed = json.loads(content)
        except ValueError:
            start = content.find("{")
            end = content.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(content[start:end + 1])

        if not isinstance(parsed, dict):
            raise ValueError("model output is not a JSON object")

        parsed.setdefault("action_items", [])
        parsed.setdefault("memory_items", [])
        return parsed

    def invoke(self, inputs: Dict[str, str]) -> Dict[str, List[str]]:
        """Send one extraction request and return the parsed items.

        Args:
            inputs: Must contain ``"raw_data"``. May contain
                ``"required_format"`` (schema text appended to the user
                prompt); the module-level ``required_format`` is used when
                the key is absent.

        Returns:
            A dict with ``"action_items"`` and ``"memory_items"``. Both are
            empty lists when the response body cannot be parsed.

        Raises:
            KeyError: If ``inputs`` has no ``"raw_data"`` entry.
            requests.RequestException: On network failure, timeout, or a
                non-2xx HTTP status. These are deliberately not swallowed.
        """
        schema_text = inputs.get("required_format", required_format)
        user_message = (
            user_prompt_template.format(raw_data=inputs["raw_data"])
            + "\n"
            + schema_text
        )
        messages = [
            {"role": "system", "content": system_prompt_template},
            {"role": "user", "content": user_message},
        ]

        payload = {
            "model": self.model,
            "messages": messages,
            "stream": False,
        }

        response = requests.post(
            self.api_url,
            headers=self.headers,
            json=payload,
            timeout=self.timeout,
        )
        response.raise_for_status()

        try:
            content = response.json()["choices"][0]["message"]["content"]
            return self._parse_content(content)
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # Only malformed or unexpected response bodies are handled here;
            # anything else is a programming error and should propagate.
            print("Failed to parse model response:", e)
            print("Raw output:", response.text)
            return self._empty_result()
159+
160+
# -------- Step 4: Main Processing Logic -------- #
161+
def extract_action_and_memory_items(index: int = 10) -> Dict[str, List[str]]:
    """Extract action and memory items from one raw data record.

    Loads the records via ``load_raw_data``, picks the one at ``index`` and
    sends it to the model through ``OpenRouterRunnable``.

    Args:
        index: Position of the record to process. Defaults to 10, the
            sample this script has been processing so far.

    Returns:
        A dict with ``"action_items"`` and ``"memory_items"``. Both lists
        are empty when there is no data or ``index`` is out of range.
    """
    raw_data_list = load_raw_data()
    if not raw_data_list:
        return {"action_items": [], "memory_items": []}

    # Process a single record. A non-empty list can still be shorter than
    # index + 1, so guard the lookup instead of letting IndexError escape.
    try:
        raw_data_str = raw_data_list[index]
    except IndexError:
        print(
            f"No raw data item at index {index} "
            f"(only {len(raw_data_list)} item(s) available)"
        )
        return {"action_items": [], "memory_items": []}

    runnable = OpenRouterRunnable(
        model_url="https://openrouter.ai/api/v1/chat/completions",
        model_name="qwen/qwen3-8b",
    )

    return runnable.invoke({"raw_data": raw_data_str, "required_format": required_format})
175+
176+
177+
# -------- Optional Entry Point -------- #
178+
if __name__ == "__main__":
    # Run the extraction once and pretty-print the resulting dictionary.
    print(json.dumps(extract_action_and_memory_items(), indent=2))

0 commit comments

Comments
 (0)