Skip to content

Commit 5cc8541

Browse files
committed
Add test cases for NSDL statements
1 parent 7692508 commit 5cc8541

File tree

7 files changed

+73
-56
lines changed

7 files changed

+73
-56
lines changed

casparser/parsers/pdfminer.py

Lines changed: 52 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -22,50 +22,50 @@
2222

2323
from .utils import is_close
2424

25-
26-
def parse_investor_info_nsdl(layout, width, height) -> InvestorInfo:
27-
"""Parse investor info."""
28-
text_elements = sorted(
29-
[
30-
x
31-
for x in layout
32-
if isinstance(x, LTTextBoxHorizontal)
33-
# and x.x1 < width / 2
34-
# and x.y1 > height / 2
35-
and x.get_text().strip() != ""
36-
],
37-
key=lambda x: -x.y1,
38-
)
39-
cas_id_found = False
40-
address_lines = []
41-
email = ""
42-
mobile = None
43-
name = None
44-
for el in text_elements:
45-
txt = el.get_text().strip()
46-
if not cas_id_found:
47-
if m := re.search(r"[CAS|NSDL]\s+ID\s*:\s*(.+?)(?:\s|$)", txt, re.I):
48-
# email = m.group(1).strip()
49-
cas_id_found = True
50-
continue
51-
if name is None:
52-
name = txt
53-
else:
54-
if (
55-
re.search(
56-
r"Statement\s+for\s+the\s+period|Your\s+demat\s+account\s+and\s+mutual\s+fund",
57-
txt,
58-
re.I | re.MULTILINE,
59-
)
60-
or mobile is not None
61-
):
62-
return InvestorInfo(
63-
email=email, name=name, mobile=mobile or "", address="\n".join(address_lines)
64-
)
65-
elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
66-
mobile = m.group(1).strip()
67-
address_lines.append(txt)
68-
raise CASParseError("Unable to parse investor data")
25+
# def parse_investor_info_nsdl(layout, width, height) -> InvestorInfo:
26+
# """Parse investor info."""
27+
# text_elements = sorted(
28+
# [
29+
# x
30+
# for x in layout
31+
# if isinstance(x, LTTextBoxHorizontal)
32+
# # and x.x1 < width / 2
33+
# # and x.y1 > height / 2
34+
# and x.get_text().strip() != ""
35+
# ],
36+
# key=lambda x: -x.y1,
37+
# )
38+
# cas_id_found = False
39+
# address_lines = []
40+
# email = ""
41+
# mobile = None
42+
# name = None
43+
# for el in text_elements:
44+
# txt = el.get_text().strip()
45+
# if not cas_id_found:
46+
# if m := re.search(r"[CAS|NSDL]\s+ID\s*:\s*(.+?)(?:\s|$)", txt, re.I):
47+
# # email = m.group(1).strip()
48+
# cas_id_found = True
49+
# continue
50+
# if name is None:
51+
# name = txt
52+
# else:
53+
# if (
54+
# re.search(
55+
# r"Statement\s+for\s+the\s+period|Your\s+demat\s+"
56+
# r"account\s+and\s+mutual\s+fund",
57+
# txt,
58+
# re.I | re.MULTILINE,
59+
# )
60+
# or mobile is not None
61+
# ):
62+
# return InvestorInfo(
63+
# email=email, name=name, mobile=mobile or "", address="\n".join(address_lines)
64+
# )
65+
# elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
66+
# mobile = m.group(1).strip()
67+
# address_lines.append(txt)
68+
# raise CASParseError("Unable to parse investor data")
6969

7070

7171
def parse_investor_info_mf(layout, width, height) -> InvestorInfo:
@@ -250,14 +250,14 @@ def extract_text_elements(layout: LTContainer) -> Iterator[LTTextBox]:
250250
raise CASParseError(
251251
"pdfminer does not support this file type. Install pymupdf dependency"
252252
)
253-
if investor_info is None:
254-
if file_type in (FileType.CAMS, FileType.KFINTECH):
255-
investor_info = parse_investor_info_mf(text_elements, *page.mediabox[2:])
256-
elif file_type in (FileType.NSDL, FileType.CDSL) and page_num == 1:
257-
investor_info = parse_investor_info_nsdl(text_elements, *page.mediabox[2:])
258-
if file_type == FileType.NSDL and page_num == 0:
259-
# Ignore first page. no useful data
260-
continue
253+
# if investor_info is None:
254+
# if file_type in (FileType.CAMS, FileType.KFINTECH):
255+
# investor_info = parse_investor_info_mf(text_elements, *page.mediabox[2:])
256+
# elif file_type in (FileType.NSDL, FileType.CDSL) and page_num == 1:
257+
# investor_info = parse_investor_info_nsdl(text_elements, *page.mediabox[2:])
258+
# if file_type == FileType.NSDL and page_num == 0:
259+
# # Ignore first page. no useful data
260+
# continue
261261
pages.append(text_elements)
262262
lines = group_similar_rows(pages)
263263
return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)

casparser/process/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from ..types import ProcessedCASData
66
from .cas_detailed import process_detailed_text
77
from .cas_summary import process_summary_text
8-
from .dp_statement import process_depository_text
8+
from .nsdl_statement import process_nsdl_text
99
from .regex import CAS_TYPE_RE
1010

1111

@@ -26,7 +26,7 @@ def process_cas_text(text, file_type: FileType = FileType.UNKNOWN) -> ProcessedC
2626
:return:
2727
"""
2828
if file_type in (FileType.CDSL, FileType.NSDL):
29-
return process_depository_text(text)
29+
return process_nsdl_text(text)
3030
cas_statement_type = detect_cas_type(text[:1000])
3131
if cas_statement_type == CASFileType.DETAILED:
3232
return process_detailed_text(text)

casparser/process/dp_statement.py renamed to casparser/process/nsdl_statement.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def parse_header(text):
3232
raise HeaderParseError("Error parsing CAS header")
3333

3434

35-
def process_depository_text(text):
35+
def process_nsdl_text(text):
3636
hdr_data = parse_header(text[:1000])
3737
statement_period = StatementPeriod(from_=hdr_data["from"], to=hdr_data["to"])
3838
accounts = re.findall(
@@ -192,7 +192,9 @@ def process_depository_text(text):
192192
}
193193
)
194194

195-
return NSDLCASData(
195+
cas_data = NSDLCASData(
196196
statement_period=statement_period,
197197
accounts=list(demat.values()),
198198
)
199+
200+
return cas_data

tests/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def setup_class(cls):
2727
cls.bad_file_name = os.getenv("BAD_CAS_FILE")
2828
cls.cams_password = os.getenv("CAMS_CAS_PASSWORD")
2929
cls.kfintech_password = os.getenv("KFINTECH_CAS_PASSWORD")
30+
cls.nsdl_file_name = os.getenv("NSDL_CAS_FILE_1")
3031

3132
cls.pdf_files = [
3233
(cls.cams_file_name, cls.cams_password, 10, 14),

tests/files.enc

284 KB
Binary file not shown.

tests/test_mupdf.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,14 @@ def test_bad_file_type(self):
9797

9898
file_type = parse_file_type([])
9999
assert file_type == FileType.UNKNOWN
100+
101+
def test_nsdl_statement(self):
102+
from casparser.cli import cli
103+
104+
runner = CliRunner()
105+
result = runner.invoke(cli, [self.nsdl_file_name, "-p", "", "-a"])
106+
assert result.exit_code == 0
107+
clean_output = self.ansi_cleaner.sub("", result.output)
108+
109+
assert re.search(r"Matched\s+:\s+3\s+accounts", clean_output) is not None
110+
assert re.search(r"Error\s+:\s+0\s+accounts", clean_output) is not None

tests/test_pdfminer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,6 @@ def test_invalid_file_type(self):
5959
with pytest.raises(CASParseError) as exc_info:
6060
read_cas_pdf(1, "", force_pdfminer=True)
6161
assert "Invalid input" in str(exc_info)
62+
63+
with pytest.raises(CASParseError) as exc_info:
64+
read_cas_pdf(self.nsdl_file_name, "", force_pdfminer=True)

0 commit comments

Comments
 (0)