1
+ import fitz # PyMuPDF
2
+ import json
3
+ import tempfile
4
+ from typing import Dict , Tuple , Any
5
+ from langgraph .graph import StateGraph , START , END
6
+ from langchain_core .messages import HumanMessage
7
+ from pydantic import BaseModel
8
+ from typing_extensions import TypedDict
9
+
10
+ from oci_models import get_llm # LLM loader
11
+ from utils import remove_triple_backtics # Output cleaner
12
+
13
+ # Dummy API that simulates checking invoice value
14
def dummy_invoice_api_check(extracted_total: float) -> float:
    """Stand in for a backend invoice lookup.

    No external system is contacted: the total handed in is returned
    unchanged.
    """
    backend_total = extracted_total
    return backend_total
16
+
17
+ # --- Data Models ---
18
class ExtractedPDFData(BaseModel):
    """Pydantic wrapper around the key/value payload parsed from a PDF."""

    # Parsed payload, keyed by field name.
    data: Dict[str, Any]

    def make_hashable(self):
        """Swap every top-level list value in ``data`` for a tuple, in place.

        Only direct values of ``data`` are converted; anything nested inside
        them is left as it is.
        """
        list_keys = [name for name, item in self.data.items() if isinstance(item, list)]
        for name in list_keys:
            self.data[name] = tuple(self.data[name])
25
+
26
class State(TypedDict):
    """Shared state passed between the nodes of the validation graph."""

    # Filesystem path of the PDF to read.
    pdf_path: str
    # Amount the submitter declared for the claim.
    declared_amount: float
    # Structured data pulled from the PDF; the pipeline stores None here
    # before extraction has run and when extraction fails.
    extracted_information: ExtractedPDFData
    # Human-readable results appended by each check node.
    validation_messages: list
    # Failure description; the pipeline stores None here when there is none.
    error: str
32
+
33
+ # --- Agent ---
34
class ExpenseValidationAgent:
    """Extract an expense claim from a PDF and validate it with a LangGraph pipeline.

    The graph runs ``Extract`` first. If extraction reports an error the run
    stops there; otherwise ``PolicyCheck``, ``CategoryCheck`` and
    ``AmountCheck`` run in sequence, each appending one entry to
    ``validation_messages``.
    """

    def extract_pdf_text(self, pdf_path: str) -> str:
        """Return the concatenated text of every page of the PDF, stripped."""
        with fitz.open(pdf_path) as doc:
            # join() avoids rebuilding the string for every page; the final
            # strip() removes the separator left after the last page.
            return "".join(page.get_text("text") + "\n " for page in doc).strip()

    def process_pdf(self, pdf_path: str) -> ExtractedPDFData:
        """Ask the LLM to turn the PDF text into structured claim data.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            The parsed JSON object wrapped in ``ExtractedPDFData``, with
            top-level lists converted to tuples.

        Raises:
            Exception: If the PDF yields no text, or the LLM output is empty,
                does not start with ``{``, or is not valid JSON.
        """
        text = self.extract_pdf_text(pdf_path)

        # Early check if the PDF is unreadable; done before the LLM client is
        # created so no client is built for a document that cannot be used.
        if not text or not text.strip():
            raise Exception("❌ No readable text extracted from the uploaded PDF. It may be scanned badly or empty.")

        prompt = (
            "\n"
            "Extract ONLY a valid JSON object from the following document.\n"
            "No explanations, no formatting, no triple backticks.\n"
            "\n"
            "Required fields:\n"
            "- employee_name (string)\n"
            "- claim_date (string)\n"
            "- items (list of dicts with keys: 'description' (string), 'amount' (float), 'category' (string))\n"
            "- total_amount (float)\n"
            "\n"
            "Output must be a single valid JSON object.\n"
            "\n"
            "Document:\n"
            f"{text}\n"
        )

        response = get_llm().invoke([{"role": "user", "content": prompt}])

        if not response or not response.content or not response.content.strip():
            raise Exception("❌ LLM returned an empty output. Cannot extract PDF information.")

        cleaned = remove_triple_backtics(response.content.strip())
        # Normalise surrounding whitespace so a leading newline left by the
        # cleaner does not trip the "starts with {" check below.
        cleaned = cleaned.strip() if cleaned else ""

        # Early check if the LLM output is blank.
        if not cleaned:
            raise Exception("❌ Cleaned LLM output is empty. No valid data to extract.")

        if not cleaned.startswith("{"):
            raise Exception(f"❌ LLM output does not start with a JSON object.\n Raw output:\n {cleaned} ")

        try:
            data = json.loads(cleaned)
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError; chain it so the original
            # traceback is kept.
            raise Exception(f"❌ Failed to parse LLM output as JSON.\n Raw output:\n {cleaned} \n Error: {e} ") from e

        structured = ExtractedPDFData(data=data)
        structured.make_hashable()
        return structured

    def llm_extract_node(self, state: State) -> Dict[str, Any]:
        """Graph node: extract structured data from ``state["pdf_path"]``.

        Returns a state update holding either the extracted data (``error``
        set to None) or an error message (``extracted_information`` set to
        None) when the extraction produced no data.
        """
        extracted_data = self.process_pdf(state["pdf_path"])

        if not extracted_data or not extracted_data.data:
            return {"extracted_information": None, "error": "Failed to extract structured PDF content."}

        return {"extracted_information": extracted_data, "error": None}

    def _run_llm_check(self, state: State, prompt: str, check_name: str) -> Dict[str, Any]:
        """Send *prompt* to the LLM and turn its JSON verdict into a message.

        Args:
            state: Current graph state; its ``validation_messages`` are extended.
            prompt: Full prompt asking for a JSON object with ``status`` and
                ``reason``.
            check_name: Capitalised check name ("Policy", "Category") used in
                the message label and, lower-cased, in the error text.

        Returns:
            A state update with one more entry in ``validation_messages``.

        Raises:
            Exception: If the LLM reply is not a JSON object.
        """
        response = get_llm(temperature=0.0).invoke([HumanMessage(content=prompt)])
        raw = response.content.strip()
        cleaned = raw.replace("```json", "").replace("```", "").strip()
        error_prefix = f"❌ LLM {check_name.lower()} check did not return valid JSON.\n Raw output:\n {cleaned} \n Error: "

        try:
            result = json.loads(cleaned)
        except ValueError as e:
            raise Exception(f"{error_prefix}{e} ") from e

        # Valid JSON that is not an object (a list, string, number...) has no
        # status/reason keys; report it the same way as unparsable output.
        if not isinstance(result, dict):
            raise Exception(f"{error_prefix}expected a JSON object, got {type(result).__name__} ")

        # str() guards against a non-string status such as null or true.
        status = str(result.get("status", "")).strip().lower()
        # A missing, null or empty reason falls back to the default text.
        reason = result.get("reason") or "No reason provided."

        label = f"✅ {check_name} Check: " if status == "pass" else f"❌ {check_name} Check: "
        messages = list(state.get("validation_messages") or [])
        return {"validation_messages": messages + [f"{label}{reason}"]}

    def check_policy_node(self, state: State) -> Dict[str, Any]:
        """Graph node: ask the LLM whether the claim conforms to company policy."""
        extracted = state["extracted_information"].data

        policy_text = """..."""
        prompt = (
            "\n"
            "Given the company policy:\n"
            f"{policy_text}\n"
            "\n"
            "And the following expense claim:\n"
            f"{json.dumps(extracted, indent=2)}\n"
            "\n"
            "Return a JSON object with:\n"
            '- status: "pass" if the claim conforms, "fail" if it violates\n'
            "- reason: 1-2 sentences explaining why\n"
            "\n"
            "Respond ONLY with a valid JSON object. Do not add anything else.\n"
        )
        return self._run_llm_check(state, prompt, "Policy")

    def check_category_node(self, state: State) -> Dict[str, Any]:
        """Graph node: ask the LLM whether any expense item is miscategorised."""
        extracted = state["extracted_information"].data

        prompt = (
            "\n"
            "Given this expense data:\n"
            f"{json.dumps(extracted, indent=2)}\n"
            "\n"
            "Are any of the expense items clearly mismatched? For example, if 'Bread' is categorized under 'Travel'.\n"
            "\n"
            "Return a JSON object with:\n"
            '- status: "pass" if all items are categorized correctly, "fail" if there are mismatches\n'
            "- reason: 1-2 sentences explaining if any mismatch exists.\n"
            "\n"
            "Respond ONLY with a valid JSON object.\n"
        )
        return self._run_llm_check(state, prompt, "Category")

    def check_declared_amount_node(self, state: State) -> Dict[str, Any]:
        """Graph node: compare the declared amount with the extracted total.

        The extracted ``total_amount`` (0.0 when absent) is passed through
        ``dummy_invoice_api_check``; a difference above 0.1 is reported as a
        mismatch. A total that cannot be read as a number is reported as a
        validation message instead of raising.
        """
        messages = list(state.get("validation_messages") or [])
        raw_total = state["extracted_information"].data.get("total_amount", 0.0)

        # The total comes from LLM output, so it may be a string or null.
        try:
            extracted_total = float(raw_total)
        except (TypeError, ValueError):
            return {"validation_messages": messages + [
                f"⚠️ Declared Amount Check: extracted total_amount is not numeric ({raw_total!r})"
            ]}

        api_total = dummy_invoice_api_check(extracted_total)
        declared = state["declared_amount"]

        if abs(api_total - declared) > 0.1:
            return {"validation_messages": messages + [
                f"⚠️ Declared amount mismatch. Declared: ${declared:.2f} , Backend Invoice: ${api_total:.2f} "
            ]}
        return {"validation_messages": messages + [
            "✅ Declared Amount Check: No significant mismatch"
        ]}

    @staticmethod
    def _route_after_extract(state: State) -> str:
        """Return "stop" when extraction recorded an error, else "continue"."""
        return "stop" if state.get("error") else "continue"

    def create_workflow(self):
        """Build and compile the validation graph.

        ``Extract`` is followed by a conditional edge: when it recorded an
        error the graph ends immediately, so the check nodes never run with
        ``extracted_information`` set to None. Otherwise the checks run in the
        order PolicyCheck -> CategoryCheck -> AmountCheck.
        """
        graph = StateGraph(State)

        graph.add_node("Extract", self.llm_extract_node)
        graph.add_node("PolicyCheck", self.check_policy_node)
        graph.add_node("CategoryCheck", self.check_category_node)
        graph.add_node("AmountCheck", self.check_declared_amount_node)

        graph.add_edge(START, "Extract")
        graph.add_conditional_edges(
            "Extract",
            self._route_after_extract,
            {"continue": "PolicyCheck", "stop": END},
        )
        graph.add_edge("PolicyCheck", "CategoryCheck")
        graph.add_edge("CategoryCheck", "AmountCheck")
        graph.add_edge("AmountCheck", END)

        return graph.compile()
197
+
198
+ # --- Public API ---
199
def process_expense_workflow(pdf_bytes: bytes, declared_amount: float) -> Tuple[Dict[str, Any], list]:
    """Validate an uploaded expense PDF against the declared amount.

    The bytes are written to a temporary ``.pdf`` file, the validation graph
    is run on it, and the temporary file is removed afterwards whether or not
    the run succeeded.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.
        declared_amount: Amount the submitter declared for the claim.

    Returns:
        A ``(extracted_data, validation_messages)`` tuple.

    Raises:
        Exception: If the final graph state carries an ``error``; anything
            raised while the graph runs also propagates.
    """
    import os  # local import: only needed here, to remove the temporary file

    # delete=False because the file is reopened by path after being closed;
    # the `with` guarantees the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(pdf_bytes)
        pdf_path = temp_file.name

    try:
        workflow = ExpenseValidationAgent().create_workflow()
        final_state = workflow.invoke({
            "pdf_path": pdf_path,
            "declared_amount": declared_amount,
            "extracted_information": None,
            "validation_messages": [],
            "error": None,
        })
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result
        # or the original exception.
        try:
            os.unlink(pdf_path)
        except OSError:
            pass

    if final_state.get("error"):
        raise Exception(final_state["error"])

    return final_state["extracted_information"].data, final_state["validation_messages"]
0 commit comments