Skip to content

Commit 4287c9f

Browse files
committed
code refactor
1 parent 969043c commit 4287c9f

File tree

7 files changed

+57
-61
lines changed

7 files changed

+57
-61
lines changed

casparser/__init__.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
1-
try:
2-
from .parsers.mupdf import read_cas_pdf
3-
except (ImportError, ModuleNotFoundError):
4-
from .parsers.pdfminer import read_cas_pdf
5-
1+
from .parsers import read_cas_pdf
62
from .types import CASParserDataType
73
from .__version__ import __version__
84

casparser/cli.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,7 @@
77

88
from .__version__ import __version__
99

10-
try:
11-
from .parsers.mupdf import read_cas_pdf
12-
except ImportError:
13-
from .parsers.pdfminer import read_cas_pdf
10+
from . import read_cas_pdf
1411
from .encoder import CASDataEncoder
1512
from .exceptions import ParserException
1613

@@ -109,12 +106,7 @@ def cli(output, summary, password, force_pdfminer, filename):
109106
click.echo("No output file provided. Printing summary")
110107
summary = True
111108
try:
112-
if force_pdfminer:
113-
from .parsers.pdfminer import read_cas_pdf as read_cas_pdf_pm
114-
115-
data = read_cas_pdf_pm(filename, password)
116-
else:
117-
data = read_cas_pdf(filename, password)
109+
data = read_cas_pdf(filename, password, force_pdfminer=force_pdfminer)
118110
except ParserException as exc:
119111
click.echo("Error parsing pdf file :: " + click.style(str(exc), bold=True, fg="red"))
120112
sys.exit(1)

casparser/parsers/__init__.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import io
2+
import json
3+
from typing import Union
4+
5+
from casparser.encoder import CASDataEncoder
6+
from casparser.process import process_cas_text
7+
8+
9+
def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict", force_pdfminer=False):
10+
"""
11+
Parse CAS pdf and return the processed data (dict or JSON string).
12+
13+
:param filename: CAS pdf file (CAMS or Kfintech)
14+
:param password: CAS pdf password
15+
:param output: Output format (json,dict) [default: dict]
16+
:param force_pdfminer: Force pdfminer parser even if mupdf is detected
17+
"""
18+
if force_pdfminer:
19+
from .pdfminer import cas_pdf_to_text
20+
else:
21+
try:
22+
from .mupdf import cas_pdf_to_text
23+
except (ImportError, ModuleNotFoundError):
24+
from .pdfminer import cas_pdf_to_text
25+
26+
partial_cas_data = cas_pdf_to_text(filename, password)
27+
28+
processed_data = process_cas_text("\u2029".join(partial_cas_data.lines))
29+
# noinspection PyProtectedMember
30+
processed_data.update(
31+
{
32+
"file_type": partial_cas_data.file_type.name,
33+
"investor_info": partial_cas_data.investor_info._asdict(),
34+
}
35+
)
36+
if output == "dict":
37+
return processed_data
38+
return json.dumps(processed_data, cls=CASDataEncoder)

casparser/parsers/mupdf.py

Lines changed: 4 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,14 @@
1-
from collections import namedtuple
21
import io
3-
import json
42
from operator import itemgetter
53
import re
64
from typing import List, Iterator, Union, Any
75

86
# noinspection PyPackageRequirements
97
import fitz
108

11-
from casparser.encoder import CASDataEncoder
129
from casparser.enums import FileType
1310
from casparser.exceptions import CASParseError
14-
from casparser.process import process_cas_text
15-
from .utils import isclose
16-
17-
InvestorInfo = namedtuple("InvestorInfo", ["name", "email", "address", "mobile"])
11+
from .utils import isclose, InvestorInfo, PartialCASData
1812

1913

2014
def extract_blocks(page_dict):
@@ -138,14 +132,13 @@ def group_similar_rows(elements_list: List[Iterator[Any]]):
138132
return lines
139133

140134

141-
def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict"):
135+
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
142136
"""
143137
Parse CAS pdf and return line data.
144138
145139
:param filename: CAS pdf file (CAMS or Kfintech)
146140
:param password: CAS pdf password
147-
:param output: Output format (json,dict) [default: dict]
148-
:return: array of lines from the CAS.
141+
:return: partial cas data with FileType, InvestorInfo and lines of data
149142
"""
150143
file_type: FileType = FileType.UNKNOWN
151144

@@ -183,13 +176,4 @@ def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict"):
183176
investor_info = parse_investor_info(page_dict)
184177
pages.append(sorted_blocks)
185178
lines = group_similar_rows(pages)
186-
processed_data = process_cas_text("\u2029".join(lines))
187-
processed_data.update(
188-
{
189-
"file_type": file_type.name,
190-
"investor_info": investor_info._asdict(),
191-
}
192-
)
193-
if output == "dict":
194-
return processed_data
195-
return json.dumps(processed_data, cls=CASDataEncoder)
179+
return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)

casparser/parsers/pdfminer.py

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
from collections import namedtuple
21
import io
3-
import json
42
import re
53
from typing import List, Optional, Iterator, Union
64

@@ -12,13 +10,9 @@
1210
from pdfminer.pdfpage import PDFPage
1311
from pdfminer.layout import LTTextBoxHorizontal, LTTextBoxVertical
1412

15-
from casparser.encoder import CASDataEncoder
1613
from casparser.enums import FileType
1714
from casparser.exceptions import CASParseError
18-
from casparser.process import process_cas_text
19-
from .utils import isclose
20-
21-
InvestorInfo = namedtuple("InvestorInfo", ["name", "email", "address", "mobile"])
15+
from .utils import isclose, InvestorInfo, PartialCASData
2216

2317

2418
def parse_investor_info(layout, width, height) -> InvestorInfo:
@@ -103,13 +97,12 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]):
10397
return lines
10498

10599

106-
def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict"):
100+
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
107101
"""
108102
Parse CAS pdf and return line data.
109103
110104
:param filename: CAS pdf file (CAMS or Kfintech)
111105
:param password: CAS pdf password
112-
:param output: Output format (json,dict) [default: dict]
113106
:return: partial cas data with FileType, InvestorInfo and lines of data
114107
"""
115108
file_type: Optional[FileType] = None
@@ -159,14 +152,4 @@ def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict"):
159152
pages.append(text_elements)
160153

161154
lines = group_similar_rows(pages)
162-
processed_data = process_cas_text("\u2029".join(lines))
163-
processed_data.update(
164-
{
165-
"file_type": file_type.name,
166-
"investor_info": investor_info._asdict(),
167-
}
168-
)
169-
170-
if output == "dict":
171-
return processed_data
172-
return json.dumps(processed_data, cls=CASDataEncoder)
155+
return PartialCASData(file_type=file_type, investor_info=investor_info, lines=lines)

casparser/parsers/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
from collections import namedtuple
2+
3+
InvestorInfo = namedtuple("InvestorInfo", ["name", "email", "address", "mobile"])
4+
PartialCASData = namedtuple("PartialCASData", ["file_type", "investor_info", "lines"])
5+
6+
17
def isclose(a0, a1, tol=1.0e-4):
28
"""
39
Check if two elements are almost equal with a tolerance.

tests/base.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import pytest
55

6+
from casparser import read_cas_pdf
67
from casparser.exceptions import CASParseError
78

89

@@ -19,12 +20,8 @@ def setup_class(cls):
1920
cls.kfintech_password = os.getenv("KFINTECH_CAS_PASSWORD")
2021

2122
def read_pdf(self, filename, password, output="dict"):
22-
if self.mode == "pdfminer":
23-
from casparser.parsers.pdfminer import read_cas_pdf
24-
else:
25-
from casparser.parsers.mupdf import read_cas_pdf
26-
27-
return read_cas_pdf(filename, password, output=output)
23+
use_pdfminer = (self.mode == "pdfminer")
24+
return read_cas_pdf(filename, password, output=output, force_pdfminer=use_pdfminer)
2825

2926
def test_read_dict(self):
3027
self.read_pdf(self.cams_file_name, self.cams_password)

0 commit comments

Comments
 (0)