Skip to content

Commit f859c1e

Browse files
authored
support v003 features
support v003 features
2 parents 3dda72d + f4dc683 commit f859c1e

File tree

20 files changed

+523
-196
lines changed

20 files changed

+523
-196
lines changed

docker/entrypoint.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ export MANPATH=/opt/texlive/2023/texmf-dist/doc/man:$MANPATH
1111
export INFOPATH=/opt/texlive/2023/texmf-dist/doc/info:$INFOPATH
1212
export PATH=/opt/pandoc/pandoc-3.1.9/bin:$PATH
1313

14-
uvicorn --host 0.0.0.0 --port 10001 bisheng_unstructured.api.main:app
14+
uvicorn --host 0.0.0.0 --port 10001 --workers 8 bisheng_unstructured.api.main:app

docker/run.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ function upload_image() {
3535

3636
function run_dev_docker() {
3737
image="dataelement/bisheng-unstructured:0.0.2"
38-
cnt_name="bisheng_uns_v002_dev"
38+
cnt_name="bisheng_uns_v002_rd_dev"
3939
MOUNT="-v $HOME:$HOME -v /public:/public"
4040
pushd $(cd $(dirname $0); pwd)
41-
docker run -p 20001:10001 -itd --name ${cnt_name} $MOUNT $image bash
41+
docker run -p 22001:10001 -itd --name ${cnt_name} $MOUNT $image bash
4242
}
4343

4444

src/bisheng_unstructured/api/main.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,14 @@
88
from fastapi.middleware.cors import CORSMiddleware
99
from fastapi.responses import ORJSONResponse
1010

11+
from bisheng_unstructured.common import Timer, get_logger
12+
1113
from .pipeline import Pipeline
1214
from .types import ConfigInput, UnstructuredInput, UnstructuredOutput
1315

16+
logger = get_logger("BishengUns", "/app/log/bisheng-uns.log")
17+
18+
1419
# Fastapi App
1520

1621

@@ -90,23 +95,36 @@ async def etl4_llm(inp: UnstructuredInput):
9095
file_type = filename.rsplit(".", 1)[1].lower()
9196

9297
if not inp.b64_data and not inp.url:
98+
logger.error(f"url or b64_data at least one must be given filename=[{inp.filename}]")
9399
raise Exception("url or b64_data at least one must be given")
94100

101+
logger.info(f"start etl4llm with mode=[{inp.mode}] filename=[{inp.filename}]")
102+
timer = Timer()
103+
95104
with tempfile.TemporaryDirectory() as tmpdir:
96105
file_path = os.path.join(tmpdir, filename)
97106
if b64_data:
98-
with open(file_path, "wb") as fout:
99-
fout.write(base64.b64decode(b64_data[0]))
107+
try:
108+
with open(file_path, "wb") as fout:
109+
fout.write(base64.b64decode(b64_data[0]))
110+
except Exception:
111+
logger.error(f"b64_data is damaged filename=[{inp.filename}]")
112+
return Exception(f"b64_data is damaged")
100113
else:
101114
headers = inp.parameters.get("headers", {})
102115
ssl_verify = inp.parameters.get("ssl_verify", True)
103116
response = requests.get(inp.url, headers=headers, verify=ssl_verify)
104117
if not response.ok:
105-
raise Exception(f"URL return an error: {response.status_code}")
118+
raise Exception(f"url data is damaged: {response.status_code}")
119+
106120
with open(file_path, "wb") as fout:
107121
fout.write(response.text)
108122

109123
inp.file_path = file_path
110124
inp.file_type = file_type
111125

112-
return pipeline.predict(inp)
126+
timer.toc()
127+
outp = pipeline.predict(inp)
128+
timer.toc()
129+
logger.info(f"succ etl4llm with filename=[{inp.filename}] elapses=[{timer.get()}]]")
130+
return outp

src/bisheng_unstructured/api/pipeline.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
from typing import Dict
33

4+
from bisheng_unstructured.common import get_logger
45
from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
56
from bisheng_unstructured.documents.pdf_parser.image import ImageDocument
67
from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument
@@ -19,6 +20,8 @@
1920
from .any2pdf import Any2PdfCreator
2021
from .types import UnstructuredInput, UnstructuredOutput
2122

23+
logger = get_logger("BishengUns", "/app/log/bisheng-uns.log")
24+
2225

2326
def partition_pdf(filename, model_params, **kwargs):
2427
doc = PDFDocument(file=filename, model_params=model_params, **kwargs)
@@ -72,6 +75,7 @@ def to_pdf(self, inp: UnstructuredInput) -> UnstructuredOutput:
7275
result = UnstructuredOutput(b64_pdf=output)
7376
return result
7477
except Exception as e:
78+
logger.error(f"error in topdf filename=[{inp.filename}] err=[{e}]")
7579
return UnstructuredOutput(status_code=400, status_message=str(e))
7680

7781
def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
@@ -80,6 +84,7 @@ def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
8084

8185
if inp.file_type not in PARTITION_MAP:
8286
raise Exception(f"file type[{inp.file_type}] not supported")
87+
8388
filename = inp.file_path
8489
file_type = inp.file_type
8590

@@ -90,8 +95,6 @@ def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
9095
part_inp.update({"model_params": self.pdf_model_params})
9196
try:
9297
elements = part_func(**part_inp)
93-
# for e in elements:
94-
# print("e", e.to_dict())
9598
mode = inp.mode
9699
if mode == "partition":
97100
isd = convert_to_isd(elements)
@@ -105,4 +108,5 @@ def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
105108

106109
return result
107110
except Exception as e:
111+
logger.error(f"error in partition filename=[{inp.filename}] err=[{e}]")
108112
return UnstructuredOutput(status_code=400, status_message=str(e))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .logger import get_logger
2+
from .timer import Timer
3+
4+
__all__ = ["Timer", "get_logger"]
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import logging
2+
import os
3+
4+
logger_initialized = {}
5+
global_log_file = []
6+
7+
8+
def get_logger(name, log_file=None, log_level=logging.INFO):
9+
"""Initialize and get a logger by name.
10+
11+
If the logger has not been initialized, this method will initialize the
12+
logger by adding one or two handlers, otherwise the initialized logger will
13+
be directly returned. During initialization, a StreamHandler will always be
14+
added. If `log_file` is specified and the process rank is 0, a FileHandler
15+
will also be added.
16+
17+
Args:
18+
name (str): Logger name.
19+
log_file (str | None): The log filename. If specified, a FileHandler
20+
will be added to the logger.
21+
log_level (int): The logger level. Note that only the process of
22+
rank 0 is affected, and other processes will set the level to
23+
"Error" thus be silent most of the time.
24+
25+
Returns:
26+
logging.Logger: The expected logger.
27+
"""
28+
logger = logging.getLogger(name)
29+
if name in logger_initialized:
30+
return logger
31+
# handle hierarchical names
32+
# e.g., logger "a" is initialized, then logger "a.b" will skip the
33+
# initialization since it is a child of "a".
34+
for logger_name in logger_initialized:
35+
if name.startswith(logger_name):
36+
return logger
37+
38+
# stream_handler = logging.StreamHandler()
39+
# handlers = [stream_handler]
40+
handlers = []
41+
42+
rank = 0
43+
44+
if log_file is not None and len(global_log_file) == 0:
45+
log_path = os.path.dirname(log_file)
46+
if not os.path.exists(log_path):
47+
os.makedirs(log_path)
48+
49+
global_log_file.append(log_file)
50+
51+
if log_file is None and len(global_log_file) > 0:
52+
log_file = global_log_file[0]
53+
54+
# only rank 0 will add a FileHandler
55+
if rank == 0 and log_file is not None:
56+
file_handler = logging.FileHandler(log_file, "a")
57+
handlers.append(file_handler)
58+
59+
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
60+
for handler in handlers:
61+
handler.setFormatter(formatter)
62+
handler.setLevel(log_level)
63+
logger.addHandler(handler)
64+
65+
if rank == 0:
66+
logger.setLevel(log_level)
67+
else:
68+
logger.setLevel(logging.ERROR)
69+
70+
logger_initialized[name] = True
71+
72+
return logger
73+
74+
75+
def PCHECK(expr, msg, *args):
76+
if not expr:
77+
get_logger("main").error(msg, *args)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import time
2+
3+
4+
class Timer(object):
5+
def __init__(self):
6+
self.tic()
7+
self.elapses = []
8+
9+
def tic(self):
10+
self.tstart = time.time()
11+
12+
def toc(self, reset=True, memorize=True):
13+
elapse = round(time.time() - self.tstart, 3)
14+
if memorize:
15+
self.elapses.append(elapse)
16+
17+
if reset:
18+
self.tic()
19+
20+
def get(self):
21+
n = round(sum(self.elapses), 3)
22+
elapses = self.elapses + [
23+
n,
24+
]
25+
return elapses

src/bisheng_unstructured/documents/pdf_parser/image.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from .blob import Blob
88
from .pdf import PDFDocument
99

10+
# from bisheng_unstructured.common import Timer
11+
1012

1113
class ImageDocument(PDFDocument):
1214
def __init__(
@@ -16,8 +18,10 @@ def __init__(
1618
with_columns: bool = False,
1719
text_elem_sep: str = "\n",
1820
enhance_table: bool = True,
21+
keep_text_in_image: bool = True,
1922
lang: str = "zh",
2023
verbose: bool = False,
24+
n_parallel: int = 10,
2125
**kwargs
2226
) -> None:
2327
self.layout_agent = LayoutAgent(**model_params)
@@ -35,18 +39,26 @@ def __init__(
3539
self.is_scan = True
3640
self.support_rotate = False
3741
self.is_join_table = False
42+
self.keep_text_in_image = keep_text_in_image
43+
self.n_parallel = n_parallel
3844

3945
super(PDFDocument, self).__init__()
4046

4147
def load(self) -> List[Page]:
4248
"""Load given path as pages."""
49+
# timer = Timer()
4350
blob = Blob.from_path(self.file)
4451
groups = []
4552
b64_data = base64.b64encode(blob.as_bytes()).decode()
4653
layout_inp = {"b64_image": b64_data}
54+
# timer.toc()
55+
4756
layout = self.layout_agent.predict(layout_inp)
57+
# timer.toc()
58+
4859
page_inds = []
4960
blocks = self._allocate_semantic(None, layout, b64_data, self.is_scan, self.lang)
61+
# timer.toc()
5062

5163
if blocks:
5264
if self.with_columns:
@@ -58,6 +70,11 @@ def load(self) -> List[Page]:
5870
groups.append(blocks)
5971
page_inds.append(1)
6072

73+
# timer.toc()
6174
groups = self._allocate_continuous(groups, self.lang)
75+
76+
# timer.toc()
6277
pages = self._save_to_pages(groups, page_inds, self.lang)
78+
# timer.toc()
79+
# print('timers', timer.get())
6380
return pages

0 commit comments

Comments
 (0)