1+ import fitz # PyMuPDF
2+ import json
3+ import tempfile
4+ from typing import Dict , Tuple , Any
5+ from langgraph .graph import StateGraph , START , END
6+ from langchain_core .messages import HumanMessage
7+ from pydantic import BaseModel
8+ from typing_extensions import TypedDict
9+
10+ from oci_models import get_llm # LLM loader
11+ from utils import remove_triple_backtics # Output cleaner
12+
13+ # Dummy API that simulates checking invoice value
def dummy_invoice_api_check(extracted_total: float) -> float:
    """Simulate a backend invoice lookup by echoing the given amount.

    Args:
        extracted_total: Total amount taken from the extracted claim data.

    Returns:
        The same value that was passed in, unchanged.
    """
    backend_total = extracted_total
    return backend_total
16+
17+ # --- Data Models ---
class ExtractedPDFData(BaseModel):
    """Model wrapping the key/value data extracted from a PDF document."""

    # Extracted fields keyed by name; values may be of any type.
    data: Dict[str, Any]

    def make_hashable(self):
        """Replace each top-level list value in ``data`` with a tuple, in place.

        Only values stored directly under a key are converted; lists nested
        inside other containers are left as they are.
        """
        # Collect the affected keys first, then rewrite those entries.
        list_keys = [name for name, item in self.data.items() if isinstance(item, list)]
        for name in list_keys:
            self.data[name] = tuple(self.data[name])
25+
class State(TypedDict):
    """State dictionary shared by the nodes of the expense-validation graph."""

    # Filesystem path of the PDF to analyse.
    pdf_path: str
    # Amount supplied by the caller; compared against the extracted total.
    declared_amount: float
    # Structured data produced by the extraction node (initialised to None
    # by process_expense_workflow before the graph runs).
    extracted_information: ExtractedPDFData
    # Human-readable result lines appended by each check node.
    validation_messages: list
    # Error text set by the extraction node; None when no error occurred.
    error: str
32+
33+ # --- Agent ---
class ExpenseValidationAgent:
    """Agent that extracts an expense claim from a PDF and validates it.

    The workflow built by :meth:`create_workflow` runs four nodes in order:
    extraction, policy check, category check and declared-amount check. Each
    check node appends one human-readable line to ``validation_messages``.
    """

    def extract_pdf_text(self, pdf_path: str) -> str:
        """Return the text of every page of the PDF at ``pdf_path``.

        Page texts are concatenated in document order and the result is
        stripped of leading and trailing whitespace.
        """
        with fitz.open(pdf_path) as doc:
            # Collect the pieces and join once instead of repeated ``+=``.
            pages = [page.get_text("text") + "\n " for page in doc]
        return "".join(pages).strip()

    def process_pdf(self, pdf_path: str) -> "ExtractedPDFData":
        """Extract structured claim data from a PDF using the LLM.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            An ``ExtractedPDFData`` whose ``data`` holds the parsed JSON
            object, with top-level lists converted to tuples.

        Raises:
            Exception: If the PDF yields no text, the LLM output is empty,
                does not start with ``{``, or cannot be parsed as JSON.
        """
        text = self.extract_pdf_text(pdf_path)

        # Fail early, before loading the LLM, when the PDF yields no text.
        if not text or text.strip() == "":
            raise Exception("❌ No readable text extracted from the uploaded PDF. It may be scanned badly or empty.")

        llm = get_llm()

        prompt = f"""
Extract ONLY a valid JSON object from the following document.
No explanations, no formatting, no triple backticks.

Required fields:
- employee_name (string)
- claim_date (string)
- items (list of dicts with keys: 'description' (string), 'amount' (float), 'category' (string))
- total_amount (float)

Output must be a single valid JSON object.

Document:
{text}
"""

        response = llm.invoke([{"role": "user", "content": prompt}])

        if not response or not response.content or not response.content.strip():
            raise Exception("❌ LLM returned an empty output. Cannot extract PDF information.")

        cleaned = remove_triple_backtics(response.content.strip())

        # The cleaner may leave nothing behind (e.g. output was only fences).
        if not cleaned or cleaned.strip() == "":
            raise Exception("❌ Cleaned LLM output is empty. No valid data to extract.")

        if not cleaned.startswith("{"):
            raise Exception(f"❌ LLM output does not start with a JSON object.\n Raw output:\n {cleaned} ")

        try:
            data = json.loads(cleaned)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            # Chain the cause so the original traceback is preserved.
            raise Exception(
                f"❌ Failed to parse LLM output as JSON.\n Raw output:\n {cleaned} \n Error: {e} "
            ) from e

        structured = ExtractedPDFData(data=data)
        structured.make_hashable()
        return structured

    def llm_extract_node(self, state: "State") -> Dict[str, Any]:
        """Graph node: extract structured data from ``state["pdf_path"]``.

        Returns a partial state update. ``process_pdf`` raises on most
        failures; the ``error`` branch here covers the remaining case of an
        empty extraction result (for example an empty JSON object).
        """
        pdf_path = state["pdf_path"]
        extracted_data = self.process_pdf(pdf_path)

        if not extracted_data or not extracted_data.data:
            return {"extracted_information": None, "error": "Failed to extract structured PDF content."}

        return {"extracted_information": extracted_data, "error": None}

    @staticmethod
    def _parse_check_result(raw: str, check_name: str) -> Tuple[bool, str]:
        """Parse an LLM pass/fail verdict into ``(passed, reason)``.

        Args:
            raw: Raw LLM output, optionally wrapped in Markdown code fences.
            check_name: Name of the check, used in the error message.

        Returns:
            ``passed`` is True only when the ``status`` field equals "pass"
            (case-insensitive). ``reason`` falls back to
            "No reason provided." when missing or empty.

        Raises:
            Exception: If the output is not valid JSON or is not a JSON object.
        """
        cleaned = raw.strip().replace("```json", "").replace("```", "").strip()

        try:
            result = json.loads(cleaned)
            if not isinstance(result, dict):
                # A list or scalar has no status/reason fields to read.
                raise ValueError(f"expected a JSON object, got {type(result).__name__}")
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            raise Exception(
                f"❌ LLM {check_name} check did not return valid JSON.\n Raw output:\n {cleaned} \n Error: {e} "
            ) from e

        # Coerce to str so a null or non-string value cannot crash the node.
        status = str(result.get("status", "")).strip().lower()
        reason = result.get("reason") or "No reason provided."
        return status == "pass", str(reason)

    def check_policy_node(self, state: "State") -> Dict[str, Any]:
        """Graph node: ask the LLM whether the claim conforms to company policy.

        Appends one "Policy Check" line to ``validation_messages``.

        Raises:
            Exception: If the LLM does not return a valid JSON object.
        """
        llm = get_llm(temperature=0.0)
        extracted = state["extracted_information"].data

        policy_text = """..."""
        prompt = f"""
Given the company policy:
{policy_text}

And the following expense claim:
{json.dumps(extracted, indent=2)}

Return a JSON object with:
- status: "pass" if the claim conforms, "fail" if it violates
- reason: 1-2 sentences explaining why

Respond ONLY with a valid JSON object. Do not add anything else.
"""

        response = llm.invoke([HumanMessage(content=prompt)])
        passed, reason = self._parse_check_result(response.content, "policy")

        label = "✅ Policy Check: " if passed else "❌ Policy Check: "
        return {
            "validation_messages": state.get("validation_messages", []) + [label + reason]
        }

    def check_category_node(self, state: "State") -> Dict[str, Any]:
        """Graph node: ask the LLM whether any expense item is miscategorised.

        Appends one "Category Check" line to ``validation_messages``.

        Raises:
            Exception: If the LLM does not return a valid JSON object.
        """
        llm = get_llm(temperature=0.0)
        extracted = state["extracted_information"].data

        prompt = f"""
Given this expense data:
{json.dumps(extracted, indent=2)}

Are any of the expense items clearly mismatched? For example, if 'Bread' is categorized under 'Travel'.

Return a JSON object with:
- status: "pass" if all items are categorized correctly, "fail" if there are mismatches
- reason: 1-2 sentences explaining if any mismatch exists.

Respond ONLY with a valid JSON object.
"""

        response = llm.invoke([HumanMessage(content=prompt)])
        passed, reason = self._parse_check_result(response.content, "category")

        label = "✅ Category Check: " if passed else "❌ Category Check: "
        return {
            "validation_messages": state.get("validation_messages", []) + [label + reason]
        }

    def check_declared_amount_node(self, state: "State") -> Dict[str, Any]:
        """Graph node: compare the declared amount with the invoice total.

        The extracted ``total_amount`` (0.0 when absent) is passed through
        ``dummy_invoice_api_check``; a difference greater than 0.1 is
        reported as a mismatch. A non-numeric extracted total is reported as
        a warning instead of raising.
        """
        messages = state.get("validation_messages", [])
        raw_total = state["extracted_information"].data.get("total_amount", 0.0)

        # The value comes from LLM-produced JSON and may be a string or null.
        try:
            extracted_total = float(raw_total)
        except (TypeError, ValueError):
            return {"validation_messages": messages + [
                f"⚠️ Declared Amount Check: extracted total amount is not numeric: {raw_total!r}"
            ]}

        api_total = dummy_invoice_api_check(extracted_total)
        declared = state["declared_amount"]

        if abs(api_total - declared) > 0.1:
            return {"validation_messages": messages + [
                f"⚠️ Declared amount mismatch. Declared: ${declared:.2f} , Backend Invoice: ${api_total:.2f} "
            ]}

        return {"validation_messages": messages + [
            "✅ Declared Amount Check: No significant mismatch"
        ]}

    @staticmethod
    def _route_after_extract(state: "State") -> str:
        """Return "halt" when extraction recorded an error, else "continue"."""
        return "halt" if state.get("error") else "continue"

    def create_workflow(self):
        """Build and compile the validation graph.

        Order: Extract -> PolicyCheck -> CategoryCheck -> AmountCheck. If the
        extraction node records an error, the graph ends right after Extract
        so the check nodes never see a missing ``extracted_information``.
        """
        graph = StateGraph(State)

        graph.add_node("Extract", self.llm_extract_node)
        graph.add_node("PolicyCheck", self.check_policy_node)
        graph.add_node("CategoryCheck", self.check_category_node)
        graph.add_node("AmountCheck", self.check_declared_amount_node)

        graph.add_edge(START, "Extract")
        # Skip the checks when extraction failed; the caller inspects "error".
        graph.add_conditional_edges(
            "Extract",
            self._route_after_extract,
            {"halt": END, "continue": "PolicyCheck"},
        )
        graph.add_edge("PolicyCheck", "CategoryCheck")
        graph.add_edge("CategoryCheck", "AmountCheck")
        graph.add_edge("AmountCheck", END)

        return graph.compile()
197+
198+ # --- Public API ---
def process_expense_workflow(pdf_bytes: bytes, declared_amount: float) -> Tuple[Dict[str, Any], list]:
    """Run the expense-validation workflow on an in-memory PDF.

    The bytes are written to a temporary ``.pdf`` file, the compiled graph is
    invoked on it, and the temporary file is removed afterwards whether or
    not the workflow succeeds.

    Args:
        pdf_bytes: Raw content of the PDF document.
        declared_amount: Amount to compare against the extracted total.

    Returns:
        A tuple ``(extracted_data, validation_messages)`` taken from the
        final graph state.

    Raises:
        Exception: If the final state carries an ``error`` value, or if any
            node of the workflow raises.
    """
    import os  # local import: only needed for temp-file cleanup

    # delete=False keeps the file on disk after closing so it can be
    # reopened by path; it is removed explicitly in the finally block.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with temp_file:
            temp_file.write(pdf_bytes)

        agent = ExpenseValidationAgent()
        workflow = agent.create_workflow()

        initial_state = {
            "pdf_path": temp_file.name,
            "declared_amount": declared_amount,
            "extracted_information": None,
            "validation_messages": [],
            "error": None,
        }

        final_state = workflow.invoke(initial_state)
    finally:
        # Best-effort cleanup: a failed delete must not mask the real outcome.
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass

    if final_state.get("error"):
        raise Exception(final_state["error"])

    return final_state["extracted_information"].data, final_state["validation_messages"]
0 commit comments