Add test cases for NSDL statements

codereverser · codereverser · commit 5cc8541fb792 · 2025-02-26T14:22:18.000+05:30
diff --git a/casparser/parsers/pdfminer.py b/casparser/parsers/pdfminer.py
@@ -22,50 +22,50 @@
 
 from .utils import is_close
 
-
-def parse_investor_info_nsdl(layout, width, height) -> InvestorInfo:
-    """Parse investor info."""
-    text_elements = sorted(
-        [
-            x
-            for x in layout
-            if isinstance(x, LTTextBoxHorizontal)
-            # and x.x1 < width / 2
-            # and x.y1 > height / 2
-            and x.get_text().strip() != ""
-        ],
-        key=lambda x: -x.y1,
-    )
-    cas_id_found = False
-    address_lines = []
-    email = ""
-    mobile = None
-    name = None
-    for el in text_elements:
-        txt = el.get_text().strip()
-        if not cas_id_found:
-            if m := re.search(r"[CAS|NSDL]\s+ID\s*:\s*(.+?)(?:\s|$)", txt, re.I):
-                # email = m.group(1).strip()
-                cas_id_found = True
-            continue
-        if name is None:
-            name = txt
-        else:
-            if (
-                re.search(
-                    r"Statement\s+for\s+the\s+period|Your\s+demat\s+account\s+and\s+mutual\s+fund",
-                    txt,
-                    re.I | re.MULTILINE,
-                )
-                or mobile is not None
-            ):
-                return InvestorInfo(
-                    email=email, name=name, mobile=mobile or "", address="\n".join(address_lines)
-                )
-            elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
-                mobile = m.group(1).strip()
-            address_lines.append(txt)
-    raise CASParseError("Unable to parse investor data")
+# def parse_investor_info_nsdl(layout, width, height) -> InvestorInfo:
+#     """Parse investor info."""
+#     text_elements = sorted(
+#         [
+#             x
+#             for x in layout
+#             if isinstance(x, LTTextBoxHorizontal)
+#             # and x.x1 < width / 2
+#             # and x.y1 > height / 2
+#             and x.get_text().strip() != ""
+#         ],
+#         key=lambda x: -x.y1,
+#     )
+#     cas_id_found = False
+#     address_lines = []
+#     email = ""
+#     mobile = None
+#     name = None
+#     for el in text_elements:
+#         txt = el.get_text().strip()
+#         if not cas_id_found:
+#             if m := re.search(r"[CAS|NSDL]\s+ID\s*:\s*(.+?)(?:\s|$)", txt, re.I):
+#                 # email = m.group(1).strip()
+#                 cas_id_found = True
+#             continue
+#         if name is None:
+#             name = txt
+#         else:
+#             if (
+#                 re.search(
+#                     r"Statement\s+for\s+the\s+period|Your\s+demat\s+"
+#                     r"account\s+and\s+mutual\s+fund",
+#                     txt,
+#                     re.I | re.MULTILINE,
+#                 )
+#                 or mobile is not None
+#             ):
+#                 return InvestorInfo(
+#                     email=email, name=name, mobile=mobile or "", address="\n".join(address_lines)
+#                 )
+#             elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I):
+#                 mobile = m.group(1).strip()
+#             address_lines.append(txt)
+#     raise CASParseError("Unable to parse investor data")
 
 
 def parse_investor_info_mf(layout, width, height) -> InvestorInfo:
@@ -250,14 +250,14 @@ def extract_text_elements(layout: LTContainer) -> Iterator[LTTextBox]:
                 raise CASParseError(
                     "pdfminer does not support this file type. Install pymupdf dependency"
                 )
-            if investor_info is None:
-                if file_type in (FileType.CAMS, FileType.KFINTECH):
-                    investor_info = parse_investor_info_mf(text_elements, *page.mediabox[2:])
-                elif file_type in (FileType.NSDL, FileType.CDSL) and page_num == 1:
-                    investor_info = parse_investor_info_nsdl(text_elements, *page.mediabox[2:])
-            if file_type == FileType.NSDL and page_num == 0:
-                # Ignore first page. no useful data
-                continue
+            # if investor_info is None:
+            #     if file_type in (FileType.CAMS, FileType.KFINTECH):
+            #         investor_info = parse_investor_info_mf(text_elements, *page.mediabox[2:])
+            #     elif file_type in (FileType.NSDL, FileType.CDSL) and page_num == 1:
+            #         investor_info = parse_investor_info_nsdl(text_elements, *page.mediabox[2:])
+            # if file_type == FileType.NSDL and page_num == 0:
+            #     # Ignore first page. no useful data
+            #     continue
             pages.append(text_elements)
         lines = group_similar_rows(pages)
         return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)
diff --git a/casparser/process/__init__.py b/casparser/process/__init__.py
@@ -5,7 +5,7 @@
 from ..types import ProcessedCASData
 from .cas_detailed import process_detailed_text
 from .cas_summary import process_summary_text
-from .dp_statement import process_depository_text
+from .nsdl_statement import process_nsdl_text
 from .regex import CAS_TYPE_RE
 
 
@@ -26,7 +26,7 @@ def process_cas_text(text, file_type: FileType = FileType.UNKNOWN) -> ProcessedC
     :return:
     """
     if file_type in (FileType.CDSL, FileType.NSDL):
-        return process_depository_text(text)
+        return process_nsdl_text(text)
     cas_statement_type = detect_cas_type(text[:1000])
     if cas_statement_type == CASFileType.DETAILED:
         return process_detailed_text(text)
diff --git a/casparser/process/nsdl_statement.py b/casparser/process/nsdl_statement.py
@@ -32,7 +32,7 @@ def parse_header(text):
     raise HeaderParseError("Error parsing CAS header")
 
 
-def process_depository_text(text):
+def process_nsdl_text(text):
     hdr_data = parse_header(text[:1000])
     statement_period = StatementPeriod(from_=hdr_data["from"], to=hdr_data["to"])
     accounts = re.findall(
@@ -192,7 +192,9 @@ def process_depository_text(text):
                     }
                 )
 
-    return NSDLCASData(
+    cas_data = NSDLCASData(
         statement_period=statement_period,
         accounts=list(demat.values()),
     )
+
+    return cas_data
diff --git a/tests/base.py b/tests/base.py
@@ -27,6 +27,7 @@ def setup_class(cls):
         cls.bad_file_name = os.getenv("BAD_CAS_FILE")
         cls.cams_password = os.getenv("CAMS_CAS_PASSWORD")
         cls.kfintech_password = os.getenv("KFINTECH_CAS_PASSWORD")
+        cls.nsdl_file_name = os.getenv("NSDL_CAS_FILE_1")
 
         cls.pdf_files = [
             (cls.cams_file_name, cls.cams_password, 10, 14),
diff --git a/tests/files.enc b/tests/files.enc
diff --git a/tests/test_mupdf.py b/tests/test_mupdf.py
@@ -97,3 +97,14 @@ def test_bad_file_type(self):
 
         file_type = parse_file_type([])
         assert file_type == FileType.UNKNOWN
+
+    def test_nsdl_statement(self):
+        from casparser.cli import cli
+
+        runner = CliRunner()
+        result = runner.invoke(cli, [self.nsdl_file_name, "-p", "", "-a"])
+        assert result.exit_code == 0
+        clean_output = self.ansi_cleaner.sub("", result.output)
+
+        assert re.search(r"Matched\s+:\s+3\s+accounts", clean_output) is not None
+        assert re.search(r"Error\s+:\s+0\s+accounts", clean_output) is not None
diff --git a/tests/test_pdfminer.py b/tests/test_pdfminer.py
@@ -59,3 +59,6 @@ def test_invalid_file_type(self):
         with pytest.raises(CASParseError) as exc_info:
             read_cas_pdf(1, "", force_pdfminer=True)
         assert "Invalid input" in str(exc_info)
+
+        with pytest.raises(CASParseError) as exc_info:
+            read_cas_pdf(self.nsdl_file_name, "", force_pdfminer=True)

Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ def parse_header(text):`
`32`	`32`	`raise HeaderParseError("Error parsing CAS header")`
`33`	`33`
`34`	`34`
`35`		`-def process_depository_text(text):`
	`35`	`+def process_nsdl_text(text):`
`36`	`36`	`hdr_data = parse_header(text[:1000])`
`37`	`37`	`statement_period = StatementPeriod(from_=hdr_data["from"], to=hdr_data["to"])`
`38`	`38`	`accounts = re.findall(`
`@@ -192,7 +192,9 @@ def process_depository_text(text):`
`192`	`192`	`}`
`193`	`193`	`)`
`194`	`194`
`195`		`- return NSDLCASData(`
	`195`	`+ cas_data = NSDLCASData(`
`196`	`196`	`statement_period=statement_period,`
`197`	`197`	`accounts=list(demat.values()),`
`198`	`198`	`)`
	`199`	`+`
	`200`	`+ return cas_data`