Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.0.82

* Bump to `unstructured` 0.16.11
* No longer attempts to download the NLTK asset from S3, which could result in a 403 error

## 0.0.81

* Update `strategy` parameter to allow `'` and `"` as input surrounding the value.
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.81",
version="0.0.82",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
55 changes: 2 additions & 53 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
elements_from_json,
)
from unstructured_inference.models.base import UnknownModelException
from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES

app = FastAPI()
router = APIRouter()
Expand Down Expand Up @@ -214,37 +213,6 @@ def partition_pdf_splits(
return results


# Module-level busy flag: True while a Chipper call is running inside the guard below.
is_chipper_processing = False


class ChipperMemoryProtection:
    """Context manager that admits only one Chipper call at a time.

    Chipper calls are expensive, and right now we can only do one call at a time.

    If the model is in use, return a 503 error. The API should scale up and the user can try again
    on a different server.

    The guard state lives in the module-level ``is_chipper_processing`` flag: ``__enter__`` sets
    it and ``__exit__`` clears it.
    """

    def __enter__(self) -> None:
        """Mark Chipper as busy, or refuse entry when it is already busy.

        Raises:
            HTTPException: with status code 503 when the busy flag is already set.
        """
        global is_chipper_processing
        # NOTE(review): the check and the assignment below are separate steps with no lock
        # around them -- confirm whether concurrent callers can reach this at the same time.
        if is_chipper_processing:
            # Log here so we can track how often it happens
            logger.error("Chipper is already in use")
            raise HTTPException(
                status_code=503, detail="Server is under heavy load. Please try again later."
            )

        is_chipper_processing = True

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Clear the busy flag.

        Returns None, so any exception raised inside the ``with`` body still propagates.
        """
        global is_chipper_processing
        is_chipper_processing = False


def pipeline_api(
file: IO[bytes],
request: Request,
Expand Down Expand Up @@ -331,7 +299,6 @@ def pipeline_api(
if file_content_type == "application/pdf":
_check_pdf(file)

hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
strategy = _validate_strategy(strategy)
pdf_infer_table_structure = _set_pdf_infer_table_structure(
pdf_infer_table_structure,
Expand Down Expand Up @@ -417,9 +384,6 @@ def pipeline_api(
coordinates=coordinates,
**partition_kwargs, # type: ignore # pyright: ignore[reportGeneralTypeIssues]
)
elif hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES:
with ChipperMemoryProtection():
elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues]
else:
elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues]

Expand Down Expand Up @@ -533,21 +497,6 @@ def _validate_strategy(strategy: str) -> str:
return strategy


def _validate_hi_res_model_name(
hi_res_model_name: Optional[str], show_coordinates: bool
) -> Optional[str]:
# Make sure chipper aliases to the latest model
if hi_res_model_name and hi_res_model_name == "chipper":
hi_res_model_name = "chipperv2"

if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates:
raise HTTPException(
status_code=400,
detail=f"coordinates aren't available when using the {hi_res_model_name} model type",
)
return hi_res_model_name


def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[str]:
"""Raise on `chunking_strategy` is not a valid chunking strategy name.

Expand Down Expand Up @@ -653,7 +602,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.81/general", include_in_schema=False)
@router.get("/general/v0.0.82/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -668,7 +617,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.81/general", include_in_schema=False)
@router.post("/general/v0.0.82/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.81
version: 0.0.82
Loading
Loading