Skip to content

Commit 76cf582

Browse files
committed
fix crash while parsing CAS without mobile number in address.
1 parent e338b45 commit 76cf582

File tree

4 files changed

+38
-24
lines changed

4 files changed

+38
-24
lines changed

casparser/parsers/mupdf.py

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -125,15 +125,24 @@ def parse_investor_info(page_dict) -> InvestorInfo:
125125
if name is None:
126126
name = txt
127127
else:
128-
if m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
129-
mobile = m.group(1).strip()
130-
address_lines.append(txt)
131-
if mobile is not None:
128+
if (
129+
re.search(
130+
r"Date\s+Transaction|Folio\s+No|^Date\s*$",
131+
txt,
132+
re.I | re.MULTILINE,
133+
)
134+
or mobile is not None
135+
):
132136
return InvestorInfo(
133-
email=email, name=name, mobile=mobile, address="\n".join(address_lines)
137+
email=email,
138+
name=name,
139+
mobile=mobile or "",
140+
address="\n".join(address_lines),
134141
)
135-
if email is None or mobile is None:
136-
raise CASParseError("Unable to parse investor data")
142+
elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
143+
mobile = m.group(1).strip()
144+
address_lines.append(txt)
145+
raise CASParseError("Unable to parse investor data")
137146

138147

139148
def group_similar_rows(elements_list: List[Iterator[Any]]):

casparser/parsers/pdfminer.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -42,15 +42,17 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
4242
if name is None:
4343
name = txt
4444
else:
45-
if m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
46-
mobile = m.group(1).strip()
47-
address_lines.append(txt)
48-
if mobile is not None:
45+
if (
46+
re.search(r"Date\s+Transaction|Folio\s+No|^Date\s*$", txt, re.I | re.MULTILINE)
47+
or mobile is not None
48+
):
4949
return InvestorInfo(
50-
email=email, name=name, mobile=mobile, address="\n".join(address_lines)
50+
email=email, name=name, mobile=mobile or "", address="\n".join(address_lines)
5151
)
52-
if email is None or mobile is None:
53-
raise CASParseError("Unable to parse investor data")
52+
elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
53+
mobile = m.group(1).strip()
54+
address_lines.append(txt)
55+
raise CASParseError("Unable to parse investor data")
5456

5557

5658
def detect_pdf_source(document) -> FileType:

tests/base.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -22,26 +22,27 @@ def setup_class(cls):
2222
cls.cams_password = os.getenv("CAMS_CAS_PASSWORD")
2323
cls.kfintech_password = os.getenv("KFINTECH_CAS_PASSWORD")
2424

25+
cls.pdf_files = [
26+
(cls.cams_file_name, cls.cams_password),
27+
(cls.kfintech_file_name, cls.kfintech_password),
28+
]
29+
2530
def read_pdf(self, filename, password, output="dict"):
2631
use_pdfminer = self.mode == "pdfminer"
2732
return read_cas_pdf(filename, password, output=output, force_pdfminer=use_pdfminer)
2833

2934
def test_read_summary(self):
3035
data = self.read_pdf(self.cams_summary_file_name, self.cams_password)
3136
assert len(data.get("folios", [])) == 4
37+
assert data.get("investor_info", {}).get("mobile") not in (None, "")
3238
assert data["cas_type"] == "SUMMARY"
3339

3440
def test_read_dict(self):
3541
from casparser.cli import cli
3642

37-
pdf_files = [
38-
(self.cams_file_name, self.cams_password),
39-
(self.kfintech_file_name, self.kfintech_password),
40-
]
41-
4243
runner = CliRunner()
4344

44-
for pdf_file, pdf_password in pdf_files:
45+
for pdf_file, pdf_password in self.pdf_files:
4546
args = [pdf_file, "-p", pdf_password]
4647
if self.mode != "mupdf":
4748
args.append("--force-pdfminer")

tests/test_mupdf.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,12 @@ class TestMuPDF(BaseTestClass):
1313
"""Test PyMuPDF parser."""
1414

1515
def test_output_json(self):
16-
json_data = self.read_pdf(self.cams_file_name, self.cams_password, output="json")
17-
data = json.loads(json_data)
18-
assert len(data.get("folios", [])) == 10
19-
assert data["cas_type"] == "DETAILED"
16+
for filename, password in self.pdf_files:
17+
json_data = self.read_pdf(filename, password, output="json")
18+
data = json.loads(json_data)
19+
assert len(data.get("folios", [])) == 10
20+
assert data.get("investor_info", {}).get("mobile") not in (None, "")
21+
assert data["cas_type"] == "DETAILED"
2022

2123
def test_output_csv(self):
2224
output = self.read_pdf(self.cams_file_name, self.cams_password, output="csv")

0 commit comments

Comments
 (0)