|
22 | 22 |
|
23 | 23 | from .utils import is_close |
24 | 24 |
|
25 | | - |
def parse_investor_info_nsdl(layout, width, height) -> InvestorInfo:
    """Parse investor info from the text boxes of an NSDL/CDSL statement page.

    The non-empty horizontal text boxes are visited in descending order of
    their ``y1`` coordinate (presumably top-to-bottom on the page). Everything
    up to and including the box holding the "CAS ID" / "NSDL ID" label is
    skipped; the next box is taken as the investor name, and the boxes after
    it are collected as address lines until either a mobile number has been
    read or a statement header line is reached.

    Args:
        layout: Iterable of page layout objects. Only ``LTTextBoxHorizontal``
            instances with non-blank text are considered.
        width: Page width. Currently unused; kept so the signature matches
            the other ``parse_investor_info_*`` parsers.
        height: Page height. Currently unused, see ``width``.

    Returns:
        An ``InvestorInfo`` with ``name``, ``mobile`` (empty string when none
        was found) and ``address`` (lines joined by newlines). ``email`` is
        always the empty string because this parser never extracts it.

    Raises:
        CASParseError: If the text boxes run out before a mobile number or a
            statement header line is found.
    """
    # A real alternation: the earlier ``[CAS|NSDL]`` was a character class
    # that matched any single one of those letters. CDSL is accepted as well
    # because CDSL files are dispatched to this parser too and the old
    # character class matched "CDSL ID" by accident -- TODO confirm the label
    # actually used in CDSL statements.
    id_label_re = re.compile(r"(?:CAS|NSDL|CDSL)\s+ID\s*:\s*(.+?)(?:\s|$)", re.I)
    # Lines that mark the end of the investor address block.
    end_of_block_re = re.compile(
        r"Statement\s+for\s+the\s+period|Your\s+demat\s+"
        r"account\s+and\s+mutual\s+fund",
        re.I | re.MULTILINE,
    )
    # ``\s`` (not a literal "s") so a number followed by whitespace matches.
    mobile_re = re.compile(r"mobile\s*:\s*([+\d]+)(?:\s|$)", re.I)

    text_elements = sorted(
        (
            x
            for x in layout
            if isinstance(x, LTTextBoxHorizontal) and x.get_text().strip() != ""
        ),
        key=lambda x: -x.y1,
    )

    cas_id_found = False
    address_lines = []
    mobile = None
    name = None

    def build_info() -> InvestorInfo:
        """Assemble the result from whatever has been collected so far."""
        return InvestorInfo(
            email="",
            name=name,
            mobile=mobile or "",
            address="\n".join(address_lines),
        )

    for el in text_elements:
        txt = el.get_text().strip()

        if not cas_id_found:
            # Skip every box up to and including the one with the ID label.
            cas_id_found = id_label_re.search(txt) is not None
            continue

        if name is None:
            # First box after the ID label is the investor name.
            name = txt
            continue

        # The box following the mobile number, or a statement header line,
        # ends the investor block.
        if mobile is not None or end_of_block_re.search(txt):
            return build_info()

        if m := mobile_re.search(txt):
            mobile = m.group(1).strip()
        # The line carrying the mobile number is kept in the address as well,
        # matching the previous behaviour.
        address_lines.append(txt)

    if mobile is not None:
        # The mobile line was the last text box on the page: the data is
        # complete, so return it instead of failing.
        return build_info()
    raise CASParseError("Unable to parse investor data")
| 25 | +# def parse_investor_info_nsdl(layout, width, height) -> InvestorInfo: |
| 26 | +# """Parse investor info.""" |
| 27 | +# text_elements = sorted( |
| 28 | +# [ |
| 29 | +# x |
| 30 | +# for x in layout |
| 31 | +# if isinstance(x, LTTextBoxHorizontal) |
| 32 | +# # and x.x1 < width / 2 |
| 33 | +# # and x.y1 > height / 2 |
| 34 | +# and x.get_text().strip() != "" |
| 35 | +# ], |
| 36 | +# key=lambda x: -x.y1, |
| 37 | +# ) |
| 38 | +# cas_id_found = False |
| 39 | +# address_lines = [] |
| 40 | +# email = "" |
| 41 | +# mobile = None |
| 42 | +# name = None |
| 43 | +# for el in text_elements: |
| 44 | +# txt = el.get_text().strip() |
| 45 | +# if not cas_id_found: |
| 46 | +# if m := re.search(r"[CAS|NSDL]\s+ID\s*:\s*(.+?)(?:\s|$)", txt, re.I): |
| 47 | +# # email = m.group(1).strip() |
| 48 | +# cas_id_found = True |
| 49 | +# continue |
| 50 | +# if name is None: |
| 51 | +# name = txt |
| 52 | +# else: |
| 53 | +# if ( |
| 54 | +# re.search( |
| 55 | +# r"Statement\s+for\s+the\s+period|Your\s+demat\s+" |
| 56 | +# r"account\s+and\s+mutual\s+fund", |
| 57 | +# txt, |
| 58 | +# re.I | re.MULTILINE, |
| 59 | +# ) |
| 60 | +# or mobile is not None |
| 61 | +# ): |
| 62 | +# return InvestorInfo( |
| 63 | +# email=email, name=name, mobile=mobile or "", address="\n".join(address_lines) |
| 64 | +# ) |
| 65 | +# elif m := re.search(r"mobile\s*:\s*([+\d]+)(?:s|$)", txt, re.I): |
| 66 | +# mobile = m.group(1).strip() |
| 67 | +# address_lines.append(txt) |
| 68 | +# raise CASParseError("Unable to parse investor data") |
69 | 69 |
|
70 | 70 |
|
71 | 71 | def parse_investor_info_mf(layout, width, height) -> InvestorInfo: |
@@ -250,14 +250,14 @@ def extract_text_elements(layout: LTContainer) -> Iterator[LTTextBox]: |
250 | 250 | raise CASParseError( |
251 | 251 | "pdfminer does not support this file type. Install pymupdf dependency" |
252 | 252 | ) |
253 | | - if investor_info is None: |
254 | | - if file_type in (FileType.CAMS, FileType.KFINTECH): |
255 | | - investor_info = parse_investor_info_mf(text_elements, *page.mediabox[2:]) |
256 | | - elif file_type in (FileType.NSDL, FileType.CDSL) and page_num == 1: |
257 | | - investor_info = parse_investor_info_nsdl(text_elements, *page.mediabox[2:]) |
258 | | - if file_type == FileType.NSDL and page_num == 0: |
259 | | - # Ignore first page. no useful data |
260 | | - continue |
| 253 | + # if investor_info is None: |
| 254 | + # if file_type in (FileType.CAMS, FileType.KFINTECH): |
| 255 | + # investor_info = parse_investor_info_mf(text_elements, *page.mediabox[2:]) |
| 256 | + # elif file_type in (FileType.NSDL, FileType.CDSL) and page_num == 1: |
| 257 | + # investor_info = parse_investor_info_nsdl(text_elements, *page.mediabox[2:]) |
| 258 | + # if file_type == FileType.NSDL and page_num == 0: |
| 259 | + # # Ignore first page. no useful data |
| 260 | + # continue |
261 | 261 | pages.append(text_elements) |
262 | 262 | lines = group_similar_rows(pages) |
263 | 263 | return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines) |
0 commit comments