|
1 | | -from collections import namedtuple |
2 | 1 | import io |
3 | | -import json |
4 | 2 | from operator import itemgetter |
5 | 3 | import re |
6 | 4 | from typing import List, Iterator, Union, Any |
7 | 5 |
|
8 | 6 | # noinspection PyPackageRequirements |
9 | 7 | import fitz |
10 | 8 |
|
11 | | -from casparser.encoder import CASDataEncoder |
12 | 9 | from casparser.enums import FileType |
13 | 10 | from casparser.exceptions import CASParseError |
14 | | -from casparser.process import process_cas_text |
15 | | -from .utils import isclose |
16 | | - |
17 | | -InvestorInfo = namedtuple("InvestorInfo", ["name", "email", "address", "mobile"]) |
| 11 | +from .utils import isclose, InvestorInfo, PartialCASData |
18 | 12 |
|
19 | 13 |
|
20 | 14 | def extract_blocks(page_dict): |
@@ -138,14 +132,13 @@ def group_similar_rows(elements_list: List[Iterator[Any]]): |
138 | 132 | return lines |
139 | 133 |
|
140 | 134 |
|
141 | | -def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict"): |
| 135 | +def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData: |
142 | 136 | """ |
143 | 137 | Parse CAS pdf and returns line data. |
144 | 138 |
|
145 | 139 | :param filename: CAS pdf file (CAMS or Kfintech) |
146 | 140 | :param password: CAS pdf password |
147 | | - :param output: Output format (json,dict) [default: dict] |
148 | | - :return: array of lines from the CAS. |
| 141 | + :return: partial cas data with FileType, InvestorInfo and lines of data |
149 | 142 | """ |
150 | 143 | file_type: FileType = FileType.UNKNOWN |
151 | 144 |
|
@@ -183,13 +176,4 @@ def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict"): |
183 | 176 | investor_info = parse_investor_info(page_dict) |
184 | 177 | pages.append(sorted_blocks) |
185 | 178 | lines = group_similar_rows(pages) |
186 | | - processed_data = process_cas_text("\u2029".join(lines)) |
187 | | - processed_data.update( |
188 | | - { |
189 | | - "file_type": file_type.name, |
190 | | - "investor_info": investor_info._asdict(), |
191 | | - } |
192 | | - ) |
193 | | - if output == "dict": |
194 | | - return processed_data |
195 | | - return json.dumps(processed_data, cls=CASDataEncoder) |
| 179 | + return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines) |
0 commit comments