Skip to content

Commit 270d869

Browse files
codebytes and pamelafox authored
Add JsonFileParser to FileStrategy (#1195)
* Add JsonFileParser to FileStrategy
* Refactor JSON parser in prepdocs.py
* fixed linting errors with ruff
* Fix formatting in filestrategy.py and test_jsonparser.py
* Added new textsplitter and tests
* Added File processors and refactor of prepdocs.py
* fix ruff formatting issues
* fix linting errors
* Update scripts/prepdocslib/jsonparser.py Co-authored-by: Pamela Fox <[email protected]>
* Update scripts/prepdocslib/parser.py Co-authored-by: Pamela Fox <[email protected]>
* Added sample json data, fixed bug in file extension
* Fix file extension retrieval in File class
* Refactor prepdocs.py script
* renamed data examples, added test
* Fix offset, add tests
* Add pragma no cover
* Use the whole version of dataclass
* Run ruff on imports
* Reformatting
---------
Co-authored-by: Pamela Fox <[email protected]>
Co-authored-by: Pamela Fox <[email protected]>
1 parent 232a6e0 commit 270d869

18 files changed

+531
-70
lines changed

data/Json_Examples/2189.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"AreaPath": "SmartHotel360",
3+
"AssignedTo": null,
4+
"Categories": null,
5+
"ChangedDate": "2023-12-13T23:08:38.69Z",
6+
"ClosedDate": null,
7+
"CreatedDate": "2023-12-13T23:08:38.69Z",
8+
"Description": "As a customer, I would like to reserve a conference room such that:<div><br> </div><div>1. It should display available date and time slots </div><div>2. Give an option to reserve a conference room for X hours </div><div>3. One can reserve a conference room for max 4 hours per day </div>",
9+
"Id": 2189,
10+
"State": "New",
11+
"StateChangeDate": "2023-12-13T23:08:38.69Z",
12+
"Tags": "Reservation",
13+
"Title": "As a customer, I would like to reserve a conference room"
14+
}

data/Json_Examples/2190.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"AreaPath": "SmartHotel360",
3+
"AssignedTo": null,
4+
"Categories": null,
5+
"ChangedDate": "2023-12-13T23:08:38.997Z",
6+
"ClosedDate": null,
7+
"CreatedDate": "2023-12-13T23:08:38.997Z",
8+
"Description": "<p class=MsoNormal><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;,sans-serif;color:#222222;background:white;\">Enter the&nbsp;</span><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;,sans-serif;color:#222222;\">guest's<span style=\"background:white;\">&nbsp;name to whom you&nbsp;</span></span>would\nlike to send<span style=\"background:white;\">&nbsp;a&nbsp;</span>confirmation,&nbsp;<span style=\"background:white;\">display the company, contact, source\nand&nbsp;</span>agent<span style=\"background:white;\">&nbsp;associated\nwith the&nbsp;</span>reservation<span style=\"background:white;\">.</span> </p>",
9+
"Id": 2190,
10+
"State": "New",
11+
"StateChangeDate": "2023-12-13T23:08:38.997Z",
12+
"Tags": "Notification",
13+
"Title": "As a reservation agent, I would like to send confirmations to guest"
14+
}

data/Json_Examples/2191.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"AreaPath": "SmartHotel360",
3+
"AssignedTo": null,
4+
"Categories": null,
5+
"ChangedDate": "2023-12-13T23:08:39.17Z",
6+
"ClosedDate": null,
7+
"CreatedDate": "2023-12-13T23:08:39.17Z",
8+
"Description": "<div><p class=MsoNormal><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;, sans-serif;background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\">If you have not picked up\nyour&nbsp;</span><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;, sans-serif;\">vehicle<span style=\"background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\">&nbsp;you can remove or cancel your&nbsp;</span></span>reservation<span style=\"background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\">&nbsp;by clicking here.</span> </p><p class=MsoNormal><span style=\"background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\"><br></span> </p> </div><div>1. Car reserved should have an option to cancel the request </div><div>2. Car driver should receive a notification about cancellation </div>",
9+
"Id": 2191,
10+
"State": "New",
11+
"StateChangeDate": "2023-12-13T23:08:39.17Z",
12+
"Tags": "Reservation",
13+
"Title": "As a customer, I should be able to remove a car reservation "
14+
}

data/Json_Examples/2192.json

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"AreaPath": "SmartHotel360",
3+
"AssignedTo": null,
4+
"Categories": null,
5+
"ChangedDate": "2023-12-13T23:08:39.383Z",
6+
"ClosedDate": null,
7+
"CreatedDate": "2023-12-13T23:08:39.383Z",
8+
"Description": "<span style=\"font-family:&quot;Segoe UI&quot;, sans-serif;font-size:10.5pt;\">As a courtesy, grant an\nextra hour or two to leave&nbsp;the&nbsp;room, especially if it isn't booked\nfor&nbsp;the upcoming evening. But customer must&nbsp;call the&nbsp;front desk\nin advance and&nbsp;request&nbsp;a&nbsp;late checkout.</span><p class=MsoNormal><span style=\"font-size:10.5pt;line-height:107%;font-family:&quot;Segoe UI&quot;, sans-serif;background-image:initial;background-position:initial;background-size:initial;background-repeat:initial;background-attachment:initial;background-origin:initial;background-clip:initial;\"></span> </p><p class=MsoNormal><span style=\"font-size:12.0pt;line-height:107%;\"></span> </p><div><div><br> </div><div>1. Late Check-in time should be displayed </div><div>2. Request should be sent to front-desk&nbsp; </div><div>3. Any extra charge should be displayed </div> </div>",
9+
"Id": 2192,
10+
"State": "New",
11+
"StateChangeDate": "2023-12-13T23:08:39.383Z",
12+
"Tags": "Front-desk; Members; Reservation",
13+
"Title": "As a customer, I should be able to request hotel for late Check-out"
14+
}

data/Json_Examples/query.json

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
[
2+
{
3+
"fields": {
4+
"System.Id": 2348,
5+
"System.State": "New",
6+
"System.Title": "Provide related items or frequently bought together section when people browse or search",
7+
"System.WorkItemType": "Product Backlog Item"
8+
},
9+
"id": 2348,
10+
"relations": null,
11+
"rev": 1,
12+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2348"
13+
},
14+
{
15+
"fields": {
16+
"System.Id": 2349,
17+
"System.State": "New",
18+
"System.Title": "As tester, I need to test the website on all the relevant broswers and devices and be sure that it can handle our load.",
19+
"System.WorkItemType": "Product Backlog Item"
20+
},
21+
"id": 2349,
22+
"relations": null,
23+
"rev": 1,
24+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2349"
25+
},
26+
{
27+
"fields": {
28+
"System.Id": 2350,
29+
"System.State": "New",
30+
"System.Title": "As a customer, I should be able to put items to shopping cart",
31+
"System.WorkItemType": "Product Backlog Item"
32+
},
33+
"id": 2350,
34+
"relations": null,
35+
"rev": 1,
36+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2350"
37+
},
38+
{
39+
"fields": {
40+
"System.Id": 2351,
41+
"System.State": "New",
42+
"System.Title": "As a customer, I should be able to print my purchase order",
43+
"System.WorkItemType": "Product Backlog Item"
44+
},
45+
"id": 2351,
46+
"relations": null,
47+
"rev": 1,
48+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2351"
49+
},
50+
{
51+
"fields": {
52+
"System.Id": 2352,
53+
"System.State": "New",
54+
"System.Title": "As a customer, I would like to have a sort capabaility by price and customer ratings",
55+
"System.WorkItemType": "Product Backlog Item"
56+
},
57+
"id": 2352,
58+
"relations": null,
59+
"rev": 1,
60+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2352"
61+
},
62+
{
63+
"fields": {
64+
"System.Id": 2353,
65+
"System.State": "New",
66+
"System.Title": "Recommended products must be based on customer purchase pattern history",
67+
"System.WorkItemType": "Product Backlog Item"
68+
},
69+
"id": 2353,
70+
"relations": null,
71+
"rev": 1,
72+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2353"
73+
},
74+
{
75+
"fields": {
76+
"System.Id": 2354,
77+
"System.State": "New",
78+
"System.Title": "As a customer, I would like to save my addresses so that I can easily select the address for delivery",
79+
"System.WorkItemType": "Product Backlog Item"
80+
},
81+
"id": 2354,
82+
"relations": null,
83+
"rev": 1,
84+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2354"
85+
},
86+
{
87+
"fields": {
88+
"System.Id": 2355,
89+
"System.State": "New",
90+
"System.Title": "As marketer, I want to run an A|B test on alternative Web Sites using Application Insights.",
91+
"System.WorkItemType": "Product Backlog Item"
92+
},
93+
"id": 2355,
94+
"relations": null,
95+
"rev": 1,
96+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2355"
97+
},
98+
{
99+
"fields": {
100+
"System.AssignedTo": {
101+
"_links": {
102+
"avatar": {
103+
"href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0"
104+
}
105+
},
106+
"descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
107+
"displayName": "Chris Ayers",
108+
"id": "cd8258ec-ad87-4c0d-9026-e5e343447185",
109+
"imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
110+
"uniqueName": "[email protected]",
111+
"url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185"
112+
},
113+
"System.Id": 2356,
114+
"System.State": "Done",
115+
"System.Title": "Provide customers the ability to track status of the package",
116+
"System.WorkItemType": "Product Backlog Item"
117+
},
118+
"id": 2356,
119+
"relations": null,
120+
"rev": 1,
121+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2356"
122+
},
123+
{
124+
"fields": {
125+
"System.AssignedTo": {
126+
"_links": {
127+
"avatar": {
128+
"href": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0"
129+
}
130+
},
131+
"descriptor": "aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
132+
"displayName": "Chris Ayers",
133+
"id": "cd8258ec-ad87-4c0d-9026-e5e343447185",
134+
"imageUrl": "https://dev.azure.com/codebytes/_apis/GraphProfile/MemberAvatars/aad.ZDlhOGEyZjktMGZmZS03YjY4LTlkYjctNjk1ZWZiNGY2Nzg0",
135+
"uniqueName": "[email protected]",
136+
"url": "https://spsprodeus27.vssps.visualstudio.com/A6b854e9d-a8be-405d-a4cc-5eb8e7027155/_apis/Identities/cd8258ec-ad87-4c0d-9026-e5e343447185"
137+
},
138+
"System.Id": 2357,
139+
"System.State": "Done",
140+
"System.Title": "As a customer, I would like to have the ability to send my items as gift",
141+
"System.WorkItemType": "Product Backlog Item"
142+
},
143+
"id": 2357,
144+
"relations": null,
145+
"rev": 2,
146+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2357"
147+
},
148+
{
149+
"fields": {
150+
"System.Id": 2358,
151+
"System.State": "Committed",
152+
"System.Title": "As a customer, I would like to store my credit card details securely",
153+
"System.WorkItemType": "Product Backlog Item"
154+
},
155+
"id": 2358,
156+
"relations": null,
157+
"rev": 1,
158+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2358"
159+
},
160+
{
161+
"fields": {
162+
"System.Id": 2359,
163+
"System.State": "Committed",
164+
"System.Title": "As a customer, I should be able to select different shipping option",
165+
"System.WorkItemType": "Product Backlog Item"
166+
},
167+
"id": 2359,
168+
"relations": null,
169+
"rev": 1,
170+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2359"
171+
},
172+
{
173+
"fields": {
174+
"System.Id": 2360,
175+
"System.State": "Committed",
176+
"System.Title": "As developer, I want to use Azure Machine Learning to provide a recommendations engine behind the website.",
177+
"System.WorkItemType": "Product Backlog Item"
178+
},
179+
"id": 2360,
180+
"relations": null,
181+
"rev": 1,
182+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2360"
183+
},
184+
{
185+
"fields": {
186+
"System.Id": 2361,
187+
"System.State": "Committed",
188+
"System.Title": "Provide tentative duration for shipping.",
189+
"System.WorkItemType": "Product Backlog Item"
190+
},
191+
"id": 2361,
192+
"relations": null,
193+
"rev": 1,
194+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2361"
195+
},
196+
{
197+
"fields": {
198+
"System.Id": 2362,
199+
"System.State": "Approved",
200+
"System.Title": "Notify the user about any changes made to the order",
201+
"System.WorkItemType": "Product Backlog Item"
202+
},
203+
"id": 2362,
204+
"relations": null,
205+
"rev": 1,
206+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2362"
207+
},
208+
{
209+
"fields": {
210+
"System.Id": 2363,
211+
"System.State": "Approved",
212+
"System.Title": "As a admin, I should be able to update prices on ad-hoc condition",
213+
"System.WorkItemType": "Product Backlog Item"
214+
},
215+
"id": 2363,
216+
"relations": null,
217+
"rev": 1,
218+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2363"
219+
},
220+
{
221+
"fields": {
222+
"System.Id": 2364,
223+
"System.State": "Approved",
224+
"System.Title": "As a customer, I would like to provide my feedback on items that I have purchased",
225+
"System.WorkItemType": "Product Backlog Item"
226+
},
227+
"id": 2364,
228+
"relations": null,
229+
"rev": 1,
230+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2364"
231+
},
232+
{
233+
"fields": {
234+
"System.Id": 2365,
235+
"System.State": "Approved",
236+
"System.Title": "As a customer, I would like to have a wishlist where I can add items for future purchase",
237+
"System.WorkItemType": "Product Backlog Item"
238+
},
239+
"id": 2365,
240+
"relations": null,
241+
"rev": 1,
242+
"url": "https://dev.azure.com/codebytes/_apis/wit/workItems/2365"
243+
}
244+
]

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool.ruff]
22
target-version = "py38"
3-
select = ["E", "F", "I", "UP"]
4-
ignore = ["E501", "E701"] # line too long, multiple statements on one line
3+
lint.select = ["E", "F", "I", "UP"]
4+
lint.ignore = ["E501", "E701"] # line too long, multiple statements on one line
55
src = ["app/backend", "scripts"]
66

77
[tool.ruff.isort]

scripts/prepdocs.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@
1414
OpenAIEmbeddings,
1515
OpenAIEmbeddingService,
1616
)
17+
from prepdocslib.fileprocessor import FileProcessor
1718
from prepdocslib.filestrategy import DocumentAction, FileStrategy
19+
from prepdocslib.jsonparser import JsonParser
1820
from prepdocslib.listfilestrategy import (
1921
ADLSGen2ListFileStrategy,
2022
ListFileStrategy,
2123
LocalListFileStrategy,
2224
)
23-
from prepdocslib.pdfparser import DocumentAnalysisPdfParser, LocalPdfParser, PdfParser
25+
from prepdocslib.parser import Parser
26+
from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser
2427
from prepdocslib.strategy import SearchInfo, Strategy
25-
from prepdocslib.textsplitter import TextSplitter
28+
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
2629

2730

2831
def is_key_empty(key):
@@ -52,25 +55,29 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi
5255
verbose=args.verbose,
5356
)
5457

55-
pdf_parser: PdfParser
56-
if args.localpdfparser:
57-
pdf_parser = LocalPdfParser()
58-
else:
59-
# check if Azure Document Intelligence credentials are provided
60-
if args.formrecognizerservice is None:
61-
print(
62-
"Error: Azure Document Intelligence service is not provided. Please provide --formrecognizerservice or use --localpdfparser for local pypdf parser."
63-
)
64-
exit(1)
58+
pdf_parser: Parser
59+
doc_int_parser: DocumentAnalysisParser
60+
61+
# check if Azure Document Intelligence credentials are provided
62+
if args.formrecognizerservice is not None:
6563
formrecognizer_creds: Union[AsyncTokenCredential, AzureKeyCredential] = (
6664
credential if is_key_empty(args.formrecognizerkey) else AzureKeyCredential(args.formrecognizerkey)
6765
)
68-
pdf_parser = DocumentAnalysisPdfParser(
66+
doc_int_parser = DocumentAnalysisParser(
6967
endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/",
7068
credential=formrecognizer_creds,
7169
verbose=args.verbose,
7270
)
73-
71+
if args.localpdfparser or args.formrecognizerservice is None:
72+
pdf_parser = LocalPdfParser()
73+
else:
74+
pdf_parser = doc_int_parser
75+
sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=args.searchimages)
76+
file_processors = {
77+
".pdf": FileProcessor(pdf_parser, sentence_text_splitter),
78+
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
79+
".docx": FileProcessor(doc_int_parser, sentence_text_splitter),
80+
}
7481
use_vectors = not args.novectors
7582
embeddings: Optional[OpenAIEmbeddings] = None
7683
if use_vectors and args.openaihost != "openai":
@@ -128,8 +135,7 @@ async def setup_file_strategy(credential: AsyncTokenCredential, args: Any) -> Fi
128135
return FileStrategy(
129136
list_file_strategy=list_file_strategy,
130137
blob_manager=blob_manager,
131-
pdf_parser=pdf_parser,
132-
text_splitter=TextSplitter(has_image_embeddings=args.searchimages),
138+
file_processors=file_processors,
133139
document_action=document_action,
134140
embeddings=embeddings,
135141
image_embeddings=image_embeddings,
scripts/prepdocslib/fileprocessor.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from dataclasses import dataclass
2+
3+
from .parser import Parser
4+
from .textsplitter import TextSplitter
5+
6+
7+
@dataclass(frozen=True)
8+
class FileProcessor:
9+
parser: Parser
10+
splitter: TextSplitter

0 commit comments

Comments
 (0)