Skip to content

Commit 96c4df1

Browse files
authored
Merge pull request #13 from codereverser/feature/transaction-types
0.3.9 release - Transaction classification support + more... - Transaction classification - csv output support
2 parents 3166185 + 3579cb9 commit 96c4df1

File tree

19 files changed

+213
-75
lines changed

19 files changed

+213
-75
lines changed

.codecov.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
coverage:
22
precision: 2
33
round: down
4-
range: "70...90"
4+
range: "80...90"
55
status:
66
project:
77
default:

.deepsource.toml

Lines changed: 0 additions & 13 deletions
This file was deleted.

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## 0.3.9 - 2021-01-01
4+
5+
- Support for classifying transactions
6+
37
## 0.3.8 - 2020-12-29
48

59
- Support for parsing folios without PAN/KYC details
@@ -55,7 +59,7 @@
5559

5660
## 0.2.0 - 2020-10-15
5761

58-
- removed support for python < 3.8 versions
62+
- removed support for python versions <3.8
5963
- Better investor info parser
6064

6165
## 0.1.2 - 2020-10-14

README.md

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,16 @@ pip install casparser[mupdf]
2525

2626
## Usage
2727

28-
```
28+
```python
2929
import casparser
30-
data = casparser.read_cas_pdf('/path/to/cas/pdf/file.pdf', 'password')
30+
data = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password")
31+
32+
# Get data in json format
33+
json_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="json")
34+
35+
# Get transactions data in csv string format
36+
csv_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="csv")
37+
3138
```
3239

3340
### Data structure
@@ -73,9 +80,8 @@ data = casparser.read_cas_pdf('/path/to/cas/pdf/file.pdf', 'password')
7380
"units": "number",
7481
"nav": "number",
7582
"balance": "number",
76-
"is_dividend_payout": "boolean",
77-
"is_dividend_reinvestment": "boolean",
78-
"dividend_rate": null
83+
"type": "string",
84+
"dividend_rate": "number"
7985
}
8086
]
8187
}
@@ -84,8 +90,22 @@ data = casparser.read_cas_pdf('/path/to/cas/pdf/file.pdf', 'password')
8490
]
8591
}
8692
```
87-
88-
93+
Notes:
94+
- Transaction `type` can be any value from the following
95+
- `PURCHASE`
96+
- `PURCHASE_SIP`
97+
- `REDEMPTION`
98+
- `SWITCH_IN`
99+
- `SWITCH_IN_MERGER`
100+
- `SWITCH_OUT`
101+
- `SWITCH_OUT_MERGER`
102+
- `DIVIDEND_PAYOUT`
103+
- `DIVIDEND_REINVEST`
104+
- `TAX`
105+
- `MISC`
106+
- `dividend_rate` is applicable only for `DIVIDEND_PAYOUT` and
107+
`DIVIDEND_REINVEST` transactions.
108+
89109
### CLI
90110

91111
casparser also comes with a command-line interface that prints a summary of parsed
@@ -108,6 +128,10 @@ Usage: casparser [-o output_file.json] [-p password] [-s type] [-a] CAS_PDF_FILE
108128
-h, --help Show this message and exit.
109129
```
110130
131+
**Note:** `casparser cli` supports two special output file formats [-o _file.json_ / _file.csv_]
132+
1. `json` - complete parsed data is exported in json format (including investor info)
133+
2. `csv` - transactions with AMC, Folio and Scheme info are exported into csv format.
134+
111135
#### Demo
112136
113137
![demo](https://raw.githubusercontent.com/codereverser/casparser/main/assets/demo.jpg)

casparser/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.3.8
1+
0.3.9

casparser/cli.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from decimal import Decimal
2-
import json
32
import os
43
import re
54
import shutil
@@ -15,9 +14,8 @@
1514
from .__version__ import __version__
1615

1716
from . import read_cas_pdf
18-
from .encoder import CASDataEncoder
1917
from .exceptions import ParserException
20-
from .parsers.utils import isclose
18+
from .parsers.utils import is_close, cas2json, cas2csv
2119

2220
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
2321

@@ -94,7 +92,7 @@ def print_summary(data, tablefmt="fancy_grid", output_filename=None, include_zer
9492

9593
# Check if calculated close (i.e. open + units from all transactions) is same as
9694
# reported close and also the scheme valuation = nav * calculated close.
97-
if calc_close != scheme["close"] or not isclose(
95+
if calc_close != scheme["close"] or not is_close(
9896
valuation["nav"] * calc_close, valuation["value"], tol=2
9997
):
10098
err += 1
@@ -201,7 +199,7 @@ def cli(output, summary, password, include_all, force_pdfminer, filename):
201199
if output is not None:
202200
output_ext = os.path.splitext(output)[-1].lower()
203201

204-
if not (summary or output_ext == ".json"):
202+
if not (summary or output_ext in (".csv", ".json")):
205203
summary = "fancy_grid"
206204

207205
try:
@@ -214,11 +212,12 @@ def cli(output, summary, password, include_all, force_pdfminer, filename):
214212
data,
215213
tablefmt=summary,
216214
include_zero_folios=include_all,
217-
output_filename=None if output_ext == ".json" else output,
215+
output_filename=None if output_ext in (".csv", ".json") else output,
218216
)
219-
if output_ext == ".json":
217+
if output_ext in (".csv", ".json"):
218+
conv_fn = cas2json if output_ext == ".json" else cas2csv
220219
with open(output, "w") as fp:
221-
json.dump(data, fp, cls=CASDataEncoder, indent=2)
220+
fp.write(conv_fn(data))
222221
click.echo("File saved : " + click.style(output, bold=True))
223222

224223

casparser/enums.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,24 @@
1-
import enum
1+
from enum import Enum, IntEnum, auto
22

33

4-
class FileType(enum.IntEnum):
4+
class FileType(IntEnum):
55
"""Enum for CAS file source."""
66

77
UNKNOWN = 0
88
CAMS = 1
99
KFINTECH = 2
10+
11+
12+
class TransactionType(Enum):
13+
PURCHASE = auto()
14+
PURCHASE_SIP = auto()
15+
REDEMPTION = auto()
16+
DIVIDEND_PAYOUT = auto()
17+
DIVIDEND_REINVEST = auto()
18+
SWITCH_IN = auto()
19+
SWITCH_IN_MERGER = auto()
20+
SWITCH_OUT = auto()
21+
SWITCH_OUT_MERGER = auto()
22+
TAX = auto()
23+
MISC = auto()
24+
UNKNOWN = auto()

casparser/parsers/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import io
2-
import json
32
from typing import Union
43

5-
from casparser.encoder import CASDataEncoder
64
from casparser.process import process_cas_text
5+
from .utils import cas2json
76

87

98
def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict", force_pdfminer=False):
@@ -35,4 +34,4 @@ def read_cas_pdf(filename: Union[str, io.IOBase], password, output="dict", force
3534
)
3635
if output == "dict":
3736
return processed_data
38-
return json.dumps(processed_data, cls=CASDataEncoder)
37+
return cas2json(processed_data)

casparser/parsers/mupdf.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from casparser.enums import FileType
1010
from casparser.exceptions import CASParseError
11-
from .utils import isclose, InvestorInfo, PartialCASData
11+
from .utils import is_close, InvestorInfo, PartialCASData
1212

1313

1414
def extract_blocks(page_dict):
@@ -26,7 +26,7 @@ def extract_blocks(page_dict):
2626
y0, y1 = bbox[1], bbox[3]
2727
for line in sorted(block["lines"], key=lambda x: x["bbox"][1]):
2828
if len(items) > 0 and not (
29-
isclose(y0, line["bbox"][1], tol=3) or isclose(y1, line["bbox"][3], tol=3)
29+
is_close(y0, line["bbox"][1], tol=3) or is_close(y1, line["bbox"][3], tol=3)
3030
):
3131
full_text = "\t\t".join(
3232
[x[0].strip() for x in sorted(items, key=lambda x: x[1][0]) if x[0].strip()]
@@ -119,7 +119,7 @@ def group_similar_rows(elements_list: List[Iterator[Any]]):
119119
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
120120
items = []
121121
for el in sorted_elements:
122-
if len(items) > 0 and not (isclose(el[3], y1, tol=3) or isclose(el[1], y0, tol=3)):
122+
if len(items) > 0 and not (is_close(el[3], y1, tol=3) or is_close(el[1], y0, tol=3)):
123123
line = "\t\t".join(
124124
[x[4].strip() for x in sorted(items, key=lambda x: x[0]) if x[4].strip()]
125125
)
@@ -143,9 +143,7 @@ def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData
143143

144144
if isinstance(filename, str):
145145
fp = open(filename, "rb")
146-
elif isinstance(filename, io.IOBase):
147-
fp = filename
148-
elif hasattr(filename, "read"): # compatibility for Django UploadedFile
146+
elif hasattr(filename, "read") and hasattr(filename, "close"): # file-like object
149147
fp = filename
150148
else:
151149
raise CASParseError("Invalid input. filename should be a string or a file like object")

casparser/parsers/pdfminer.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from casparser.enums import FileType
1414
from casparser.exceptions import CASParseError
15-
from .utils import isclose, InvestorInfo, PartialCASData
15+
from .utils import is_close, InvestorInfo, PartialCASData
1616

1717

1818
def parse_investor_info(layout, width, height) -> InvestorInfo:
@@ -86,7 +86,7 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]):
8686
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
8787
items = []
8888
for el in sorted_elements:
89-
if len(items) > 0 and not (isclose(el.y1, y1, tol=3) or isclose(el.y0, y0, tol=3)):
89+
if len(items) > 0 and not (is_close(el.y1, y1, tol=3) or is_close(el.y0, y0, tol=3)):
9090
line = "\t\t".join(
9191
[x.get_text().strip() for x in sorted(items, key=lambda x: x.x0)]
9292
)
@@ -110,9 +110,7 @@ def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData
110110

111111
if isinstance(filename, str):
112112
fp = open(filename, "rb")
113-
elif isinstance(filename, io.IOBase):
114-
fp = filename
115-
elif hasattr(filename, "read"): # compatibility for Django UploadedFile
113+
elif hasattr(filename, "read") and hasattr(filename, "close"): # file-like object
116114
fp = filename
117115
else:
118116
raise CASParseError("Invalid input. filename should be a string or a file like object")

0 commit comments

Comments
 (0)