Skip to content

Commit 13dfea4

Browse files
feat: ✨ OpenAI parser (#245)
* feat: ✨ OpenAI parser * refinement in logic * logic refinement * Update README.md Co-authored-by: Glenn Matthews <[email protected]> * Update circuit_maintenance_parser/parser.py Co-authored-by: Glenn Matthews <[email protected]> * improve question * make more explicit the text parsing * Automate token management for local tests * Make openai library an extra * Adopt OpenAI library changes * fix mypy --------- Co-authored-by: Glenn Matthews <[email protected]>
1 parent e611c91 commit 13dfea4

File tree

12 files changed

+862
-379
lines changed

12 files changed

+862
-379
lines changed

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,33 @@ By default, there is a `GenericProvider` that support a `SimpleProcessor` using
8686

8787
> Note: Because these providers do not support the BCOP standard natively, maybe there are some gaps on the implemented parser that will be refined with new test cases. We encourage you to report related **issues**!
8888
89+
#### LLM-powered Parsers
90+
91+
The library supports optional parsers leveraging a Large Language Model (LLM) to provide best-effort parsing when the specific parsers have not been successful.
92+
93+
> Warning: Some of these integrations, such as OpenAI, require extra installation parameters. Check the [extras section](#extras).
94+
95+
When the appropriate environment variable(s) are set (see below), these LLM parsers are automatically appended after all existing processors for each defined Provider.
96+
97+
> These integrations may incur costs for API usage. Use them carefully! As an order of magnitude, parsing one email with the OpenAI gpt-3.5-turbo model costs about $0.004.
98+
99+
These are the currently supported LLM integrations:
100+
101+
- [OpenAI](https://openai.com/product), these are the supported ENVs:
102+
- `OPENAI_API_KEY` (Required): OpenAI API Key.
103+
- `OPENAI_MODEL` (Optional): Model to use, it defaults to "gpt-3.5-turbo".
104+
89105
## Installation
90106

91107
The library is available as a Python package in pypi and can be installed with pip:
92108
`pip install circuit-maintenance-parser`
93109

110+
### Extras
111+
112+
#### OpenAI
113+
114+
`pip install circuit-maintenance-parser[openai]`
115+
94116
## How to use it?
95117

96118
The library requires two things:
@@ -319,6 +341,7 @@ The project is following Network to Code software development guidelines and is
319341
...omitted debug logs...
320342
====================================================== 99 passed, 174 deselected, 17 warnings in 10.35s ======================================================
321343
```
344+
322345
7. Run some final CI tests locally to ensure that there is no linting/formatting issues with your changes. You should look to get a code score of 10/10. See the example below: `invoke tests --local`
323346

324347
```

circuit_maintenance_parser/parser.py

Lines changed: 188 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@
66
import quopri
77
from typing import Dict, List
88
from email.utils import parsedate_tz, mktime_tz
9+
import hashlib
910

1011
import bs4 # type: ignore
1112
from bs4.element import ResultSet # type: ignore
1213

13-
from pydantic import BaseModel, Extra
14+
from pydantic import BaseModel
1415
from icalendar import Calendar # type: ignore
1516

1617
from circuit_maintenance_parser.errors import ParserError
@@ -23,7 +24,7 @@
2324
logger = logging.getLogger(__name__)
2425

2526

26-
class Parser(BaseModel, extra=Extra.forbid):
27+
class Parser(BaseModel):
2728
"""Parser class.
2829
2930
A Parser handles one or more specific data type(s) (specified in `data_types`).
@@ -34,14 +35,15 @@ class Parser(BaseModel, extra=Extra.forbid):
3435
# _data_types are used to match the Parser to to each type of DataPart
3536
_data_types = ["text/plain", "plain"]
3637

38+
# TODO: move it to where it is used, Cogent parser
3739
_geolocator = Geolocator()
3840

3941
@classmethod
4042
def get_data_types(cls) -> List[str]:
4143
"""Return the expected data type."""
4244
return cls._data_types
4345

44-
def parser_hook(self, raw: bytes) -> List[Dict]:
46+
def parser_hook(self, raw: bytes, content_type: str) -> List[Dict]:
4547
"""Custom parser logic.
4648
4749
This method is used by the main `Parser` classes (such as `ICal` or `Html` parser) to define a shared
@@ -53,14 +55,14 @@ def parser_hook(self, raw: bytes) -> List[Dict]:
5355
"""
5456
raise NotImplementedError
5557

56-
def parse(self, raw: bytes) -> List[Dict]:
58+
def parse(self, raw: bytes, content_type: str) -> List[Dict]:
5759
"""Execute parsing.
5860
5961
Do not override this method!
6062
Instead, each main `Parser` class should implement its own custom logic within the `parser_hook` method.
6163
"""
6264
try:
63-
result = self.parser_hook(raw)
65+
result = self.parser_hook(raw, content_type)
6466
except Exception as exc:
6567
raise ParserError from exc
6668
if any(not partial_result for partial_result in result):
@@ -86,7 +88,7 @@ class ICal(Parser):
8688

8789
_data_types = ["text/calendar", "ical", "icalendar"]
8890

89-
def parser_hook(self, raw: bytes):
91+
def parser_hook(self, raw: bytes, content_type: str):
9092
"""Execute parsing."""
9193
# iCalendar data sometimes comes encoded with base64
9294
# TODO: add a test case
@@ -163,7 +165,7 @@ def remove_hex_characters(string):
163165
"""Convert any hex characters to standard ascii."""
164166
return string.encode("ascii", errors="ignore").decode("utf-8")
165167

166-
def parser_hook(self, raw: bytes):
168+
def parser_hook(self, raw: bytes, content_type: str):
167169
"""Execute parsing."""
168170
result = []
169171
soup = bs4.BeautifulSoup(quopri.decodestring(raw), features="lxml")
@@ -195,7 +197,7 @@ class EmailDateParser(Parser):
195197

196198
_data_types = [EMAIL_HEADER_DATE]
197199

198-
def parser_hook(self, raw: bytes):
200+
def parser_hook(self, raw: bytes, content_type: str):
199201
"""Execute parsing."""
200202
parsed_date = parsedate_tz(raw.decode())
201203
if parsed_date:
@@ -208,7 +210,7 @@ class EmailSubjectParser(Parser):
208210

209211
_data_types = [EMAIL_HEADER_SUBJECT]
210212

211-
def parser_hook(self, raw: bytes):
213+
def parser_hook(self, raw: bytes, content_type: str):
212214
"""Execute parsing."""
213215
result = []
214216
for data in self.parse_subject(self.bytes_to_string(raw).replace("\r", "").replace("\n", "")):
@@ -230,7 +232,7 @@ class Csv(Parser):
230232

231233
_data_types = ["application/csv", "text/csv", "application/octet-stream"]
232234

233-
def parser_hook(self, raw: bytes):
235+
def parser_hook(self, raw: bytes, content_type: str):
234236
"""Execute parsing."""
235237
result = []
236238
for data in self.parse_csv(raw):
@@ -249,7 +251,7 @@ class Text(Parser):
249251

250252
_data_types = ["text/plain"]
251253

252-
def parser_hook(self, raw: bytes):
254+
def parser_hook(self, raw: bytes, content_type: str):
253255
"""Execute parsing."""
254256
result = []
255257
text = self.get_text_hook(raw)
@@ -265,3 +267,178 @@ def get_text_hook(raw: bytes) -> str:
265267
def parse_text(self, text) -> List[Dict]:
266268
"""Custom text parsing."""
267269
raise NotImplementedError
270+
271+
272+
class LLM(Parser):
    """Base class for parsers that delegate extraction to a Large Language Model (LLM).

    Subclasses must implement `get_llm_response` to send `content` to a concrete
    LLM backend and return the generated JSON (as a dict), or a falsy value on failure.
    The `_get_*` helpers then map the loosely-structured LLM output onto the
    normalized maintenance fields, using fuzzy key matching because the model may
    not reproduce the requested key names exactly.
    """

    _data_types = ["text/html", "html", "text/plain"]

    # Prompt sent as the "system" message; asks the model for a bare JSON object
    # following this schema (timestamps as EPOCH integers).
    _llm_question = """Please, could you extract a JSON form without any other comment,
with the following JSON schema (timestamps in EPOCH):
{
    "type": "object",
    "properties": {
        "start": {
            "type": "int",
        },
        "end": {
            "type": "int",
        },
        "account": {
            "type": "string",
        },
        "summary": {
            "type": "string",
        },
        "maintenance_id": {
            "type": "string",
        },
        "status": {
            "type": "string",
        },
        "impact": {
            "type": "string",
        },
        "circuit_ids": {
            "type": "array",
            "items": {
                "type": "string",
            }
        }
    }
}
More context:
* Circuit IDs are also known as service or order
* Status could be confirmed, ongoing, cancelled, completed or rescheduled
"""

    def parser_hook(self, raw: bytes, content_type: str):
        """Execute parsing.

        Extracts plain text from the raw payload (stripping HTML when the
        content type is HTML) and hands it to `parse_content`.
        """
        result = []
        if content_type in ["html", "text/html"]:
            soup = bs4.BeautifulSoup(quopri.decodestring(raw), features="lxml")
            content = soup.text
        elif content_type in ["text/plain"]:
            content = self.get_text_hook(raw)
        else:
            # Unsupported content type for this parser: nothing to extract.
            # (Previously `content` was left unbound here, raising UnboundLocalError.)
            return result

        for data in self.parse_content(content):
            result.append(data)
        return result

    @staticmethod
    def get_text_hook(raw: bytes) -> str:
        """Can be overwritten by subclasses."""
        return raw.decode()

    @staticmethod
    def get_key_with_string(dictionary: dict, string: str):
        """Return the first key in `dictionary` that contains `string`, or None."""
        for key in dictionary.keys():
            if string in key:
                return key
        return None

    def get_llm_response(self, content):
        """Method to retrieve the response from the LLM for some content."""
        raise NotImplementedError

    def _get_impact(self, generated_json: dict):
        """Method to get a general Impact for all Circuits.

        Defaults to OUTAGE when no impact key is present or unrecognized.
        """
        impact_key = self.get_key_with_string(generated_json, "impact")
        if impact_key:
            impact_value = generated_json[impact_key].lower()
            if "no impact" in impact_value:
                return Impact.NO_IMPACT
            if "partial" in impact_value:
                return Impact.DEGRADED

        return Impact.OUTAGE

    def _get_circuit_ids(self, generated_json: dict, impact: Impact):
        """Method to get the Circuit IDs and use a general Impact."""
        circuits = []
        circuits_ids_key = self.get_key_with_string(generated_json, "circuit")
        if not circuits_ids_key:
            # No circuit key found at all; previously this indexed
            # `generated_json[None]` and raised a KeyError.
            return circuits
        for circuit in generated_json[circuits_ids_key]:
            if isinstance(circuit, str):
                circuits.append(CircuitImpact(circuit_id=circuit, impact=impact))
            elif isinstance(circuit, dict):
                circuit_key = self.get_key_with_string(circuit, "circuit")
                circuits.append(CircuitImpact(circuit_id=circuit[circuit_key], impact=impact))

        return circuits

    def _get_start(self, generated_json: dict):
        """Method to get the Start Time."""
        return generated_json[self.get_key_with_string(generated_json, "start")]

    def _get_end(self, generated_json: dict):
        """Method to get the End Time."""
        return generated_json[self.get_key_with_string(generated_json, "end")]

    def _get_summary(self, generated_json: dict):
        """Method to get the Summary."""
        return generated_json[self.get_key_with_string(generated_json, "summary")]

    def _get_status(self, generated_json: dict):
        """Method to get the Status.

        Defaults to CONFIRMED when the status is missing or unrecognized.
        """
        status_key = self.get_key_with_string(generated_json, "status")
        if status_key:
            status_value = generated_json[status_key].lower()
            if "rescheduled" in status_value:
                return Status.RE_SCHEDULED
            if "cancelled" in status_value:
                return Status.CANCELLED
            if "ongoing" in status_value:
                return Status.IN_PROCESS
            if "completed" in status_value:
                return Status.COMPLETED

        return Status.CONFIRMED

    def _get_account(self, generated_json: dict):
        """Method to get the Account."""
        account = generated_json[self.get_key_with_string(generated_json, "account")]
        if not account:
            return "Not found"

        return account

    def _get_maintenance_id(self, generated_json: dict, start, end, circuits):
        """Method to get the Maintenance ID.

        Falls back to an MD5 of start/end/circuits when the LLM did not
        provide a usable maintenance identifier.
        """
        maintenance_key = self.get_key_with_string(generated_json, "maintenance")
        # Use the fuzzy-matched key consistently; previously this indexed the
        # hard-coded "maintenance_id" key, which may not exist.
        if maintenance_key and generated_json[maintenance_key] != "N/A":
            return generated_json[maintenance_key]

        # Stringify the CircuitImpact objects before joining; joining the
        # objects directly raised a TypeError.
        maintenance_id = str(start) + str(end) + "".join(str(circuit) for circuit in circuits)
        return hashlib.md5(maintenance_id.encode("utf-8")).hexdigest()  # nosec

    def parse_content(self, content):
        """Parse content via LLM and build the normalized maintenance dict."""
        generated_json = self.get_llm_response(content)
        if not generated_json:
            return []

        impact = self._get_impact(generated_json)

        data = {
            "circuits": self._get_circuit_ids(generated_json, impact),
            "start": int(self._get_start(generated_json)),
            "end": int(self._get_end(generated_json)),
            "summary": str(self._get_summary(generated_json)),
            "status": self._get_status(generated_json),
            "account": str(self._get_account(generated_json)),
        }

        data["maintenance_id"] = str(
            self._get_maintenance_id(
                generated_json,
                data["start"],
                data["end"],
                data["circuits"],
            )
        )

        return [data]
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""OpenAI Parser."""
2+
import os
3+
import logging
4+
import json
5+
from typing import List, Optional
6+
7+
try:
8+
from openai import OpenAI # type: ignore
9+
except ImportError:
10+
_HAS_OPENAI = False
11+
else:
12+
_HAS_OPENAI = True
13+
14+
from circuit_maintenance_parser.parser import LLM
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class OpenAIParser(LLM):
20+
"""Notifications Parser powered by OpenAI ChatGPT."""
21+
22+
def get_llm_response(self, content) -> Optional[List]:
23+
"""Get LLM processing from OpenAI."""
24+
if not _HAS_OPENAI:
25+
raise ImportError("openai extra is required to use OpenAIParser.")
26+
27+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
28+
model = os.getenv("OPENAI_MODEL", "gpt-3.5-turbo")
29+
try:
30+
response = client.chat.completions.create(
31+
model=model,
32+
messages=[
33+
{ # type: ignore
34+
"role": "system",
35+
"content": self._llm_question,
36+
},
37+
{ # type: ignore
38+
"role": "user",
39+
"content": content,
40+
},
41+
],
42+
)
43+
44+
# TODO: Maybe asking again about the generated response could refine it
45+
46+
except Exception as err: # pylint: disable=broad-exception-caught
47+
logger.error(err)
48+
return None
49+
50+
logger.info("Used OpenAI tokens: %s", response.usage)
51+
generated_text = response.choices[0].message.content
52+
logger.info("Response from LLM: %s", generated_text)
53+
try:
54+
return json.loads(generated_text) # type: ignore
55+
except ValueError as err:
56+
logger.error(err)
57+
return None
58+
59+
return None

0 commit comments

Comments
 (0)