diff --git a/main.py b/main.py index dde927a..a2a5da5 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,14 @@ -from piper.services import TestMessageAdder, StringValue, TesseractRecognizer, SpacyNER +# from piper.services import TestMessageAdder +# from piper.services import StringValue +# from piper.services import TesseractRecognizer +# from piper.services import SpacyNER +from piper.services import FaceRecognizer from piper.envs import CurrentEnv, DockerEnv from piper.configurations import get_configuration import time import asyncio import sys -from piper.utils import tesrct_utils as tu +# from piper.utils import tesrct_utils as tu from loguru import logger logger.add("file.log", level="INFO", backtrace=True, diagnose=True, rotation='5 MB') @@ -29,7 +33,9 @@ loop = asyncio.get_event_loop() with DockerEnv() as env: # object created - recognizer = TesseractRecognizer(port=cfg.docker_app_port) + # recognizer = TesseractRecognizer(port=cfg.docker_app_port) + recognizer = FaceRecognizer(port=cfg.docker_app_port) + result = loop.run_until_complete(recognizer()) logger.info(f'result of recognition is {result}') diff --git a/piper/base/backend/templates/fast-api.j2 b/piper/base/backend/templates/fast-api.j2 index 6c8ae52..a933690 100644 --- a/piper/base/backend/templates/fast-api.j2 +++ b/piper/base/backend/templates/fast-api.j2 @@ -1,25 +1,48 @@ import time -from fastapi import FastAPI, Request, status -from piper.envs import CurrentEnv +from fastapi import FastAPI, Request, status, Response, File, UploadFile +from piper.envs import CurrentEnv, DockerEnv +from piper.configurations import get_configuration + +from loguru import logger +logger.add("file_.log", level="INFO", backtrace=True, diagnose=True, rotation='5 MB') {% for script_name in scripts.keys() %} from {{ script_name }} import * {% endfor %} +cfg = get_configuration() app = FastAPI(debug=True) @app.post('/health_check', status_code = status.HTTP_200_OK) async def health_check(): + logger.info('fast_api.j2 test health_check') return {"message": "health check"} + +logger.debug(f'CurrentEnv is {CurrentEnv}') +logger.debug(f'cfg.env is {cfg.env}') + + +# with CurrentEnv(): +# if True: +# with DockerEnv(): with CurrentEnv(): - service = {{ service_class }}( {% for k, v in service_kwargs.items() %} {{ k }}={{ v }}, {% endfor %} ) + logger.info(f'CurrentEnv') + service = {{ service_class }}({% for k, v in service_kwargs.items() %} {{ k }}={{ v }}, {% endfor %}) + logger.info(f'service {service}') @app.post('/{{ function_name }}') async def {{ function_name }}( - request_model: {{ request_model }}, + #request_model: {{ request_model }}, + file: UploadFile = File(...) ): - result = await service.{{ function_name }}(request_model) + data_b = await file.read() + logger.info('{{ function_name }} POST request ') + result = await service.{{ function_name }}(data_b) - return result.dict() \ No newline at end of file + logger.info(f'fast_api.j2 result is {result}') + try: + return result + except Exception as e: + logger.error(f'fast_api.j2 error while recognize {e}') diff --git a/piper/base/docker/__init__.py b/piper/base/docker/__init__.py index 99e6e4b..aa5b2fa 100644 --- a/piper/base/docker/__init__.py +++ b/piper/base/docker/__init__.py @@ -24,6 +24,29 @@ def render(self): return template.render(cmd=self.cmd, python_docker_version=self.python_docker_version, run_command_lines=self.run_rows, post_install_lines=self.post_install_lines) +class TensorFlowImage: + + def __init__(self, tag, python_docker_version, cmd, template_file, run_rows, post_install_lines): + self.tag = tag + self.python_docker_version = python_docker_version + self.cmd = cmd + self.template_file = template_file + self.run_rows = run_rows + self.post_install_lines = post_install_lines + + def render(self): + """ + Render docker template + """ + template_dir = os.path.join(os.path.dirname(__file__), 'images') + jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir), + trim_blocks=True, + lstrip_blocks=True) + template = jinja_env.get_template(self.template_file) + return template.render(cmd=self.cmd, ddocker_name=self.tag, ddocker_version=self.python_docker_version, run_command_lines=self.run_rows, post_install_lines=self.post_install_lines) + + + # class PythonTesseractImage: # def __init__(self, tag, python_docker_version, cmd): diff --git a/piper/base/docker/images/default-general.j2 b/piper/base/docker/images/default-general.j2 new file mode 100644 index 0000000..1b0203c --- /dev/null +++ b/piper/base/docker/images/default-general.j2 @@ -0,0 +1,15 @@ +FROM {{ddocker_name}}:{{ ddocker_version }} + +{{ run_command_lines }} + +WORKDIR /app + +COPY requirements.txt ./requirements.txt +RUN PYTHONPATH=/usr/bin/python3 pip3 install -r requirements.txt + +{{ post_install_lines }} + +COPY ./ ./ +RUN chmod +x ./run.sh + +ENTRYPOINT ["{{ cmd }}"] \ No newline at end of file diff --git a/piper/base/executors.py b/piper/base/executors.py index ee6ab47..0a8330c 100644 --- a/piper/base/executors.py +++ b/piper/base/executors.py @@ -8,9 +8,10 @@ import aiohttp from loguru import logger import docker +from docker.errors import APIError, BuildError from pydantic import BaseModel #, BytesObject, ListOfStringsObject -from piper.base.docker import PythonImage +from piper.base.docker import PythonImage, TensorFlowImage # from piper.base.docker import PythonTesseractImage from piper.base.backend.utils import render_fast_api_backend, render_fast_api_tsrct_backend from piper.envs import is_docker_env, is_current_env, get_env @@ -29,6 +30,7 @@ class LocalExecutor: def is_known(obj): + logger.info(f'test object {obj}') basic = obj.__class__.__name__ in {'dict', 'list', 'tuple', 'str', 'int', 'float', 'bool'} models = isinstance(obj, (BaseModel,)) return basic or models @@ -109,10 +111,20 @@ def build_image(path: str, docker_image): with open(f"{path}/Dockerfile", "w") as output: output.write(image) - image, logs = client.images.build(path=path, - tag=docker_image.tag, - quiet=False, - timeout=20) + logger.info('build start!') + + try: + image, logs = client.images.build(path=path, + tag=docker_image.tag, + quiet=False, + timeout=120) + except (BuildError, APIError) as e: + logger.error('error while build_image:') + for line in e.build_log: + if 'stream' in line: + logger.error(line['stream'].strip()) + sys.exit() + for log in logs: logger.info(f'executor build_image: {log}') logger.info(f'image is {image}') @@ -145,10 +157,11 @@ def wait_for_fast_api_app_start(host, external_port, wait_on_iter, n_iters): if r.status_code == 200: break except Exception as e: + logger.error(f'resive health_check answer {e}') time.sleep(wait_on_iter) if i == n_iters: - logger.error('FastAPI app can`t start or n_iters too small') + logger.error(f'FastAPI app can`t start or wait_on_iter: {wait_on_iter} or n_iters: {n_iters} too small') sys.exit() i += 1 @@ -267,6 +280,99 @@ def __init__(self, port: int = 8080, **service_kwargs): super().__init__('localhost', port, self.base_handler) + +class FastAPIFaceRecognExecutor(HTTPExecutor): + # basic requements + requirements = ["gunicorn", "fastapi", "uvicorn", "aiohttp", "docker", "Jinja2", "pydantic", "loguru", "numpy", "opencv-python", "python-multipart", ] + + # executor specific requements + requirements.extend( + [ + # 'python3-opencv' + 'tensorflow', + 'mtcnn', + ] + ) + + # basic packages + packages_list = ['apt-utils', 'tree', 'cmake', 'mc'] + + # executor specific packages + packages_list.extend( + [ + 'libgl1', + 'ffmpeg', + 'libsm6', + 'libxext6', + ] + ) + + base_handler = "recognize" + + def __init__(self, port: int = 8080, **service_kwargs): + self.container = None + # self.image_tag = 'piper:latest'\ + self.image_tag = 'tensorflow/tensorflow' + self.container_name = "piper_FastAPI_FaceRecogn" + + if is_docker_env(): + docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock') + cfg = get_configuration() + project_output_path = cfg.path + + copy_piper(project_output_path) + copy_scripts(project_output_path, self.scripts()) + + run_rows = '' + run_rows += add_row('RUN apt update && apt install -y apt-transport-https') + run_rows += add_row('RUN apt install -y software-properties-common') + run_rows += add_packages_to_install(self.packages_list) + run_rows += add_row('RUN pip3 install --upgrade pip') + + post_install_lines = "" + + docker_image = TensorFlowImage(self.image_tag, 'latest-gpu-jupyter', cmd=f"./run.sh", template_file='default-general.j2', run_rows=run_rows, post_install_lines=post_install_lines) + logger.info('Docker file created') + + write_requirements(project_output_path, self.requirements) + logger.info('python requirements file created') + + logger.info('build_image') + build_image(project_output_path, docker_image) + + logger.info('image builded') + self.create_fast_api_files(project_output_path, **service_kwargs) + + # create and run docker container + # if container exits it will be recreated! + logger.info('create image and container started') + container = du.create_image_and_container_by_dockerfile( + docker_client, + project_output_path, + self.image_tag, + self.container_name, + port + ) + + logger.info('waiting for FastApi service start') + if container: + output = container.attach(stdout=True, stderr=True, stream=False, logs=True) + for line in output: + logger.info(str(line)) + #TODO test FastAPI errors by other way + if 'Traceback' in str(line): + logger.error('FastAPI can`t start') + sys.exit() + # logger.info(container.stats(decode=False, stream=False)) + + wait_for_fast_api_app_start('localhost', cfg.docker_app_port, cfg.wait_on_iter, cfg.n_iters) + else: + # TODO: Local ENVIRONMENT checks + pass + + super().__init__('localhost', port, self.base_handler) + + def rm_container(self): if self.container: self.container.remove(force=True) @@ -278,21 +384,19 @@ def create_fast_api_files(self, path: str, **service_kwargs): cfg = get_configuration() # TODO add support more than one functions - backend = render_fast_api_tsrct_backend( + backend = render_fast_api_backend( service_class=self.__class__.__name__, service_kwargs=dict(service_kwargs), scripts=self.scripts(), function_name=self.base_handler, - # request_model="BytesObject", - # response_model="ListOfStringsObject" + request_model="BytesObject", + response_model="ListOfStringsObject" ) with open(f"{path}/main.py", "w") as output: output.write(backend) - write_requirements(path, self.requirements) - gunicorn = "#!/bin/bash \n" \ - f"gunicorn -b 0.0.0.0:8080 --workers {cfg.n_gunicorn_workers} main:app --worker-class uvicorn.workers.UvicornWorker --preload --timeout 120" + f"gunicorn -b 0.0.0.0:8080 --workers {cfg.n_gunicorn_workers} main:app --worker-class uvicorn.workers.UvicornWorker --preload --timeout 240" with open(f"{path}/run.sh", "w") as output: output.write(gunicorn) diff --git a/piper/configurations.py b/piper/configurations.py index a5dd151..4f10eaf 100644 --- a/piper/configurations.py +++ b/piper/configurations.py @@ -8,14 +8,14 @@ class Configuration: env = None # start time and counter - wait_on_iter = 0.5 - n_iters = 10 + wait_on_iter = 5 + n_iters = 20 # docker start time and counter - docker_wait_on_iter = 0.5 + docker_wait_on_iter = 4 docker_n_iters = 20 - n_gunicorn_workers = 1 + n_gunicorn_workers = 4 image_suffixes = set(['jpg', 'jpeg', 'png']) pdf_suffixes = set(['pdf']) diff --git a/piper/envs/__init__.py b/piper/envs/__init__.py index de363d3..7050c4f 100644 --- a/piper/envs/__init__.py +++ b/piper/envs/__init__.py @@ -1,4 +1,6 @@ +import imp from piper.configurations import get_configuration +from loguru import logger cfg = get_configuration() @@ -18,7 +20,7 @@ def get_env(): def set_env(env): - print("Setting environment to: {}".format(env)) + logger.info("Setting environment to: {}".format(env)) cfg.env = env @@ -28,12 +30,12 @@ def __init__(self): pass def __enter__(self): - print("Entering DockerEnv") + logger.info("Docker context management __enter__") self._old_environment = get_env() set_env(self) def __exit__(self, *args, **kws): - print("Exiting DockerEnv") + logger.info("Docker context management __exit__") set_env(self._old_environment) @@ -43,12 +45,12 @@ def __init__(self): pass def __enter__(self): - print("Entering CurrentEnv") + logger.info("CurrentEnv context management __enter__") self._old_environment = get_env() set_env(self) def __exit__(self, *args, **kws): - print("Exiting CurrentEnv") + logger.info("CurrentEnv context management __exit__") set_env(self._old_environment) diff --git a/piper/services/__init__.py b/piper/services/__init__.py index 62c7810..81a60cb 100644 --- a/piper/services/__init__.py +++ b/piper/services/__init__.py @@ -1,13 +1,14 @@ -from piper.base.executors import FastAPIExecutor, FastAPITesseractExecutor +from piper.base.executors import FastAPIExecutor, FastAPITesseractExecutor, FastAPIFaceRecognExecutor from fastapi.responses import JSONResponse from pydantic import BaseModel from loguru import logger import json -import spacy +# import spacy import sys from piper.configurations import get_configuration -from piper.utils import tesrct_utils as tu +# from piper.utils import tesrct_utils as tu +from piper.utils import face_recogn_utils as fru logger.add("file.log", level="INFO", backtrace=True, diagnose=True, rotation='5 MB') @@ -32,6 +33,23 @@ async def run(self, message: StringValue) -> StringValue: return StringValue(value=(message.value + self.appender)) +class FaceRecognizer(FastAPIFaceRecognExecutor): + ''' + FaceRecognizer implementation service + ''' + def __init__(self, **kwargs): + self.face_recogizer = fru.FaceRecognizer() + super().__init__(**kwargs) + + + async def recognize(self, file_content : BytesObject) -> ListOfStringsObject: + logger.info(f'face_recogizer recive {type(file_content)} object') + text_dict = self.face_recogizer.bytes_handler(file_content) + logger.info(f'face_recogizer img_bytes_handler return {(text_dict)} object') + return JSONResponse(content=text_dict) + # return JSONResponse(content={"1":"1"}) + + class TesseractRecognizer(FastAPITesseractExecutor): ''' Tesseract OCR implementation service @@ -82,54 +100,54 @@ async def ner(self, txt: str): # logger.exception(msg) -class SpacyNER(): - ''' - Spacy NER service - ''' - def __init__(self): - cfg = get_configuration() - self.available_models = set() - self.nlp = None - - try: - for cur_model in cfg.spacy_models: - logger.info(f'try to download model {cur_model} to {cfg.model_path}') - # spacy.util.set_data_path(cfg.model_path) - res = spacy.cli.download(cur_model) - logger.info(f'result of spacy.cli.download is {res}') - self.available_models.add(cur_model) - except Exception as e: - logger.error(f'catch exception {e}') - sys.exit() - - - def set_model(self, cur_model): - if cur_model not in self.available_models: - logger.error(f'there is not {cur_model} in available_models set: {self.available_models}') - self.nlp = None - raise ValueError(f'there is not {cur_model} in available_models set: {self.available_models}') - - try: - nlp = spacy.load(cur_model) - # nlp = spacy.load('en_default') - logger.info('spacy nlp object created with model {cur_model}') - except Exception as e: - logger.error(f'catch exception {e}') - if isinstance(e, OSError): - logger.error(f'you must download spacy model {cur_model}') - nlp = None - logger.info('spacy nlp object DID NOT create') +# class SpacyNER(): +# ''' +# Spacy NER service +# ''' +# def __init__(self): +# cfg = get_configuration() +# self.available_models = set() +# self.nlp = None + +# try: +# for cur_model in cfg.spacy_models: +# logger.info(f'try to download model {cur_model} to {cfg.model_path}') +# # spacy.util.set_data_path(cfg.model_path) +# res = spacy.cli.download(cur_model) +# logger.info(f'result of spacy.cli.download is {res}') +# self.available_models.add(cur_model) +# except Exception as e: +# logger.error(f'catch exception {e}') +# sys.exit() + + +# def set_model(self, cur_model): +# if cur_model not in self.available_models: +# logger.error(f'there is not {cur_model} in available_models set: {self.available_models}') +# self.nlp = None +# raise ValueError(f'there is not {cur_model} in available_models set: {self.available_models}') + +# try: +# nlp = spacy.load(cur_model) +# # nlp = spacy.load('en_default') +# logger.info('spacy nlp object created with model {cur_model}') +# except Exception as e: +# logger.error(f'catch exception {e}') +# if isinstance(e, OSError): +# logger.error(f'you must download spacy model {cur_model}') +# nlp = None +# logger.info('spacy nlp object DID NOT create') - self.nlp = nlp - - - def extract_named_ents(self, txt: str): - logger.debug(f'got data type {type(txt)} and data <<{txt}>> for NER') - if self.nlp: - res = [] - doc = self.nlp(txt) - for ent in doc.ents: - res.append((ent.text, ent.label_)) - return JSONResponse(content=res) - else: - logger.error(f'nlp object didn`t create. you should use set_model(model_name)') +# self.nlp = nlp + + +# def extract_named_ents(self, txt: str): +# logger.debug(f'got data type {type(txt)} and data <<{txt}>> for NER') +# if self.nlp: +# res = [] +# doc = self.nlp(txt) +# for ent in doc.ents: +# res.append((ent.text, ent.label_)) +# return JSONResponse(content=res) +# else: +# logger.error(f'nlp object didn`t create. you should use set_model(model_name)') diff --git a/piper/utils/docker_utils.py b/piper/utils/docker_utils.py index 57975c6..c6821dd 100644 --- a/piper/utils/docker_utils.py +++ b/piper/utils/docker_utils.py @@ -172,6 +172,8 @@ def create_image_and_container_by_dockerfile(docker_client, path, image_tag, con sys.exit() time.sleep(cfg.docker_wait_on_iter) + return container + except docker.errors.APIError as api_e: logger.error(f'eroror while run container {container_name}') diff --git a/piper/utils/face_recogn_utils.py b/piper/utils/face_recogn_utils.py new file mode 100644 index 0000000..068efda --- /dev/null +++ b/piper/utils/face_recogn_utils.py @@ -0,0 +1,35 @@ +import numpy as np +import cv2 +import sys + +from mtcnn import MTCNN +from loguru import logger + +class FaceRecognizer(): + + def __init__(self): + # doesnt work from docker + # self.detector = MTCNN() + self.detector = None + logger.info('FaceRecognizer model is MTCNN') + + def bytes_handler(self, img_bytes): + logger.info(f'bytes_handler with arg {type(img_bytes)} and len {sys.getsizeof(img_bytes)}') + np_array = np.asarray(bytearray(img_bytes), dtype="uint8") + logger.info(f'converted image is type of {type(np_array)} and size {np_array.shape}') + img = cv2.imdecode(np_array, cv2.IMREAD_COLOR) + + if img is not None: + logger.info(f'converted to cv2 image with shape {img.shape}') + if img is not None: + h, w, _ = img.shape + if h > 0 and w > 0: + detector = MTCNN() + # logger.info(f'detector is {self.detector}') + logger.info('start detect faces') + detections = detector.detect_faces(img) + # detections = self.detector.detect_faces(img) + logger.info(f'detections is {detections}') + return detections + else: + logger.error('can not convert bytes to cv2 image') \ No newline at end of file diff --git a/piper/utils/tesrct_utils.py b/piper/utils/tesrct_utils._py similarity index 100% rename from piper/utils/tesrct_utils.py rename to piper/utils/tesrct_utils._py diff --git a/tests/face_recogn_test.py b/tests/face_recogn_test.py new file mode 100644 index 0000000..1c19907 --- /dev/null +++ b/tests/face_recogn_test.py @@ -0,0 +1,105 @@ +import os +import sys +import asyncio +import requests +root_dir = os.path.join(os.path.realpath(os.path.pardir), 'piper') +sys.path.insert(1, root_dir) + +# from piper.utils import docker_utils as du +# from piper.utils import tesrct_utils as tus + +# from piper.envs import DockerEnv +# from piper.envs import is_docker_env +# from piper.configurations import get_configuration +# from piper.services import TesseractRecognizer, StringValue +from pathlib import Path +import os +import pytest +from loguru import logger +import cv2 +import base64 +import numpy as np +import json + +def base64_str_to_cv2_image(b64_str): + image = base64.b64decode(bytes(b64_str, "utf-8")) + nparr = np.asarray(bytearray(image), dtype="uint8") + cv2_image = cv2.imdecode(nparr, cv2.IMREAD_COLOR) + return cv2_image + +def get_opencv_format_image(data_json): + if 'image' in data_json.keys(): + image_base64 = data_json.get('image') + if image_base64: + cv2_image = base64_str_to_cv2_image(image_base64) + return cv2_image + return None + +def cv2_image_to_base64(cv2_img): + img_str = cv2.imencode('.jpg', cv2_img)[1].tobytes() + encoded_pic = str(base64.b64encode(img_str), 'utf-8') + return encoded_pic + + +def send_file_to_service(url, file_path): + multipart_form_data = { + 'file': open(file_path, 'rb') + } + + logger.info(f'url: {url}') + logger.info(f'data: {multipart_form_data}') + + try: + + # возврат excepiton + result = requests.post(url, files=multipart_form_data, verify=False) + return result + + except requests.exceptions.ConnectionError as ce: + logger.error(f'exeption while connect to {url}') + logger.error(ce) + +main_app_url = f'http://localhost:8788' +file_path = Path(__file__).parent + +# curl -X POST -w "%{http_code}" -F "image=@tests/faces.jpg" http://localhost:8788/recognize +# curl -X POST -w "%{http_code}" -H "Content-Type: application/json" -d "data=2" http://localhost:8788/health_check +# pytest -vs tests/face_recogn_test.py::TestFaceRecogn::test_recognizer +class TestFaceRecogn(): + ''' + Docker container API test. Methods: + test_recognizer_jpg + test_recognizer_pdf + health_check + ''' + + def test_recognizer(self): + ''' + jpg file recognize test + ''' + fn = file_path.joinpath('faces.jpg') + # fn = file_path.joinpath('ocr_data.jpg') + url = f'{main_app_url}/recognize' + + received_data = send_file_to_service(url, fn) + + logger.info(f'received_data.json {received_data.json()}') + assert received_data.status_code == 200 + try: + data = received_data.json() + logger.info('data', data) + assert len(data) != 0 + except Exception as e: + pytest.raises(Exception) + + + def test_health_check(self): + ''' + health check test + ''' + url = f'{main_app_url}/health_check' + print(url) + # убрать параметры + result = requests.post(url, data=json.dumps({"1":"2"}), headers= {'Content-Type': 'application/json'}) + logger.info('health_check test') + assert result.status_code == 200 diff --git a/tests/faces.jpg b/tests/faces.jpg new file mode 100644 index 0000000..5d87e2a Binary files /dev/null and b/tests/faces.jpg differ diff --git a/tests/use_case_folder_processing_po.py b/tests/use_case_folder_processing_po.py new file mode 100644 index 0000000..bcc523c --- /dev/null +++ b/tests/use_case_folder_processing_po.py @@ -0,0 +1,83 @@ +import os +import sys +import asyncio +import requests +root_dir = os.path.join(os.path.realpath(os.path.pardir), 'piper') +sys.path.insert(1, root_dir) + +from piper.utils import docker_utils as du +from piper.utils import tesrct_utils as tu + +from piper.envs import DockerEnv +from piper.envs import is_docker_env +from piper.configurations import get_configuration +from piper.services import TesseractRecognizer, StringValue +from pathlib import Path +import os +import pytest +import json + +from PiperOperator import * + +# service urls +headers = {"Content-Type": "application/json"} +main_app_url = f'http://localhost:8788' + +url_tsrct_cfg = f'{main_app_url}/set_config' +url_rcg = f'{main_app_url}/recognize' + +url_spacy_all_models = f'{main_app_url}/get_ner_models' +url_spacy_set_model = f'{main_app_url}/set_ner_model' +url_spacy_get_NE = f'{main_app_url}/extract_named_ents' + +# folder info +file_path = Path(__file__).parent +# fn = file_path.joinpath('ocr_data.jpg') + +SOURCE_FOLDER = file_path +OUTPUT_FOLDER = file_path.joinpath('out') + + +if __name__ == '__main__': + + piper_worker = PiperNLPWorker('http://localhost:8788') + + available_models = piper_worker.get_available_ner_models() + print(f'get_ner_models {available_models}') + + ts_conf = dict() + ts_conf['ts_lang'] = 'eng' + + for v in [6, 8, 11]: + # change tesseract config + ts_conf['ts_config_row'] = rf'--oem 1 --psm {v}' + piper_worker.set_tesseract_config(ts_conf) + print(f"\ttesseract config changed to {ts_conf['ts_config_row']}") + + for model in available_models: + # change spacy model + piper_worker.set_current_spacy_model(model) + print(f"\t\tspacy model changed to {model}") + + # create output folder + cur_dir = OUTPUT_FOLDER.joinpath(f'ts_{v}_{model}') + cur_dir.mkdir(parents=True, exist_ok=True) + for fn in file_path.iterdir(): + if fn.suffix[1:] in ['jpg', 'jpeg', 'png', 'pdf']: + # folder processing + txt = piper_worker.get_text_from_file(fn) + txt = ' '.join(txt) + print(f'\t\t\textracted text {txt}') + + named_ents = piper_worker.get_named_ent_from_text(txt) + + if named_ents: + named_ents_str = "\n".join(f'\t\t\t{x}' for x in named_ents) + # print(type(named_ents)) + print(f'\t\t\textract_named_ents {named_ents_str}') + + out_fn = cur_dir.joinpath(f'res_{fn.stem}.txt') + with open(out_fn, 'w') as f: + f.write(txt) + f.write('\t') + f.write(named_ents_str)