Skip to content

Commit e6092aa

Browse files
JorjMcKie authored and julian-smith-artifex-com committed
Corrections for 4186
Extracting JPEG-CMYK images consistently needs inverting the colors. We have taken this opportunity to reuse, as much as possible, the code that creates the image dictionaries in 'Document.extract_image' and the image block in the Python version of text extraction.
1 parent 653617c commit e6092aa

File tree

2 files changed

+65
-107
lines changed

2 files changed

+65
-107
lines changed

src/__init__.py

Lines changed: 51 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -4196,8 +4196,7 @@ def extract_image(self, xref):
41964196
raise ValueError("document closed or encrypted")
41974197

41984198
pdf = _as_pdf_document(self)
4199-
img_type = 0
4200-
smask = 0
4199+
42014200
if not _INRANGE(xref, 1, mupdf.pdf_xref_len(pdf)-1):
42024201
raise ValueError( MSG_BAD_XREF)
42034202

@@ -4210,65 +4209,15 @@ def extract_image(self, xref):
42104209
o = mupdf.pdf_dict_geta(obj, PDF_NAME('SMask'), PDF_NAME('Mask'))
42114210
if o.m_internal:
42124211
smask = mupdf.pdf_to_num(o)
4213-
4214-
if mupdf.pdf_is_jpx_image(obj):
4215-
img_type = mupdf.FZ_IMAGE_JPX
4216-
res = mupdf.pdf_load_stream(obj)
4217-
ext = "jpx"
4218-
if JM_is_jbig2_image(obj):
4219-
img_type = mupdf.FZ_IMAGE_JBIG2
4220-
res = mupdf.pdf_load_stream(obj)
4221-
ext = "jb2"
4222-
res = mupdf.pdf_load_raw_stream(obj)
4223-
if img_type == mupdf.FZ_IMAGE_UNKNOWN:
4224-
res = mupdf.pdf_load_raw_stream(obj)
4225-
_, c = mupdf.fz_buffer_storage(res)
4226-
#log( '{=_ c}')
4227-
img_type = mupdf.fz_recognize_image_format(c)
4228-
ext = JM_image_extension(img_type)
4229-
if img_type == mupdf.FZ_IMAGE_UNKNOWN:
4230-
res = None
4231-
img = mupdf.pdf_load_image(pdf, obj)
4232-
ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal)
4233-
if (ll_cbuf
4234-
and ll_cbuf.params.type not in (
4235-
mupdf.FZ_IMAGE_RAW,
4236-
mupdf.FZ_IMAGE_FAX,
4237-
mupdf.FZ_IMAGE_FLATE,
4238-
mupdf.FZ_IMAGE_LZW,
4239-
mupdf.FZ_IMAGE_RLD,
4240-
)
4241-
):
4242-
img_type = ll_cbuf.params.type
4243-
ext = JM_image_extension(img_type)
4244-
res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer))
4245-
else:
4246-
res = mupdf.fz_new_buffer_from_image_as_png(
4247-
img,
4248-
mupdf.FzColorParams(mupdf.fz_default_color_params),
4249-
)
4250-
ext = "png"
42514212
else:
4252-
img = mupdf.fz_new_image_from_buffer(res)
4253-
4254-
xres, yres = mupdf.fz_image_resolution(img)
4255-
width = img.w()
4256-
height = img.h()
4257-
colorspace = img.n()
4258-
bpc = img.bpc()
4259-
cs_name = mupdf.fz_colorspace_name(img.colorspace())
4213+
smask = 0
42604214

4215+
# load the image
4216+
img = mupdf.pdf_load_image(pdf, obj)
42614217
rc = dict()
4262-
rc[ dictkey_ext] = ext
4263-
rc[ dictkey_smask] = smask
4264-
rc[ dictkey_width] = width
4265-
rc[ dictkey_height] = height
4266-
rc[ dictkey_colorspace] = colorspace
4267-
rc[ dictkey_bpc] = bpc
4268-
rc[ dictkey_xres] = xres
4269-
rc[ dictkey_yres] = yres
4270-
rc[ dictkey_cs_name] = cs_name
4271-
rc[ dictkey_image] = JM_BinFromBuffer(res)
4218+
_make_image_dict(img, rc)
4219+
rc[dictkey_smask] = smask
4220+
rc[dictkey_cs_name] = mupdf.fz_colorspace_name(img.colorspace())
42724221
return rc
42734222

42744223
def ez_save(
@@ -16323,19 +16272,6 @@ def JM_irect_from_py(r):
1632316272
f[i] = FZ_MAX_INF_RECT
1632416273
return mupdf.fz_make_irect(f[0], f[1], f[2], f[3])
1632516274

16326-
16327-
def JM_is_jbig2_image(dict_):
16328-
# fixme: should we remove this function?
16329-
return 0
16330-
#filter_ = pdf_dict_get(ctx, dict_, PDF_NAME(Filter));
16331-
#if (pdf_name_eq(ctx, filter_, PDF_NAME(JBIG2Decode)))
16332-
# return 1;
16333-
#n = pdf_array_len(ctx, filter_);
16334-
#for (i = 0; i < n; i++)
16335-
# if (pdf_name_eq(ctx, pdf_array_get(ctx, filter_, i), PDF_NAME(JBIG2Decode)))
16336-
# return 1;
16337-
#return 0;
16338-
1633916275
def JM_listbox_value( annot):
1634016276
'''
1634116277
ListBox retrieve value
@@ -16533,38 +16469,52 @@ def __str__(self):
1653316469
line_dict[dictkey_spans] = span_list
1653416470
return line_rect
1653516471

16472+
def _make_image_dict(img, img_dict):
16473+
"""Populate a dictionary with information extracted from a given image.
1653616474

16537-
def JM_make_image_block(block, block_dict):
16538-
image = block.i_image()
16539-
n = mupdf.fz_colorspace_n(image.colorspace())
16540-
w = image.w()
16541-
h = image.h()
16542-
type_ = mupdf.FZ_IMAGE_UNKNOWN
16543-
# fz_compressed_image_buffer() is not available because
16544-
# `fz_compressed_buffer` is not copyable.
16545-
ll_fz_compressed_buffer = mupdf.ll_fz_compressed_image_buffer(image.m_internal)
16546-
if ll_fz_compressed_buffer:
16547-
type_ = ll_fz_compressed_buffer.params.type
16548-
if type_ < mupdf.FZ_IMAGE_BMP or type_ == mupdf.FZ_IMAGE_JBIG2:
16549-
type_ = mupdf.FZ_IMAGE_UNKNOWN
16550-
bytes_ = None
16551-
if ll_fz_compressed_buffer and type_ != mupdf.FZ_IMAGE_UNKNOWN:
16552-
buf = mupdf.FzBuffer( mupdf.ll_fz_keep_buffer( ll_fz_compressed_buffer.buffer))
16553-
ext = JM_image_extension(type_)
16554-
else:
16555-
buf = mupdf.fz_new_buffer_from_image_as_png(image, mupdf.FzColorParams())
16475+
Used by 'Document.extract_image' and by 'JM_make_image_block'.
16476+
Both of these functions will add some more specific information.
16477+
"""
16478+
img_type = img.fz_compressed_image_type()
16479+
ext = JM_image_extension(img_type)
16480+
16481+
# compressed image buffer if present, else None
16482+
ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal)
16483+
16484+
if (0
16485+
or not ll_cbuf
16486+
or img_type in (mupdf.FZ_IMAGE_JBIG2, mupdf.FZ_IMAGE_UNKNOWN)
16487+
or img_type < mupdf.FZ_IMAGE_BMP
16488+
):
16489+
# not an image with a compressed buffer: convert to PNG
16490+
res = mupdf.fz_new_buffer_from_image_as_png(
16491+
img,
16492+
mupdf.FzColorParams(mupdf.fz_default_color_params),
16493+
)
1655616494
ext = "png"
16557-
bytes_ = JM_BinFromBuffer(buf)
16558-
block_dict[ dictkey_width] = w
16559-
block_dict[ dictkey_height] = h
16560-
block_dict[ dictkey_ext] = ext
16561-
block_dict[ dictkey_colorspace] = n
16562-
block_dict[ dictkey_xres] = image.xres()
16563-
block_dict[ dictkey_yres] = image.yres()
16564-
block_dict[ dictkey_bpc] = image.bpc()
16565-
block_dict[ dictkey_matrix] = JM_py_from_matrix(block.i_transform())
16566-
block_dict[ dictkey_size] = len(bytes_)
16567-
block_dict[ dictkey_image] = bytes_
16495+
elif ext == "jpeg" and img.n() == 4:
16496+
# JPEG with CMYK: invert colors
16497+
res = mupdf.fz_new_buffer_from_image_as_jpeg(
16498+
img, mupdf.FzColorParams(mupdf.fz_default_color_params), 95, 1)
16499+
else:
16500+
# copy the compressed buffer
16501+
res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer))
16502+
16503+
bytes_ = JM_BinFromBuffer(res)
16504+
img_dict[dictkey_width] = img.w()
16505+
img_dict[dictkey_height] = img.h()
16506+
img_dict[dictkey_ext] = ext
16507+
img_dict[dictkey_colorspace] = img.n()
16508+
img_dict[dictkey_xres] = img.xres()
16509+
img_dict[dictkey_yres] = img.yres()
16510+
img_dict[dictkey_bpc] = img.bpc()
16511+
img_dict[dictkey_size] = len(bytes_)
16512+
img_dict[dictkey_image] = bytes_
16513+
16514+
def JM_make_image_block(block, block_dict):
16515+
img = block.i_image()
16516+
_make_image_dict(img, block_dict)
16517+
block_dict[dictkey_matrix] = JM_py_from_matrix(block.i_transform())
1656816518

1656916519

1657016520
def JM_make_text_block(block, block_dict, raw, buff, tp_rect):

src/extra.i

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3511,22 +3511,30 @@ void JM_make_image_block(fz_stext_block *block, PyObject *block_dict)
35113511
int n = fz_colorspace_n(ctx, image->colorspace);
35123512
int w = image->w;
35133513
int h = image->h;
3514-
const char *ext = NULL;
3514+
const char *ext = "";
35153515
int type = FZ_IMAGE_UNKNOWN;
3516-
if (buffer)
3516+
if (buffer) {
35173517
type = buffer->params.type;
3518+
ext = JM_image_extension(type);
3519+
}
35183520
if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2)
35193521
type = FZ_IMAGE_UNKNOWN;
35203522
PyObject *bytes = NULL;
35213523
fz_var(bytes);
35223524
fz_try(ctx) {
3523-
if (buffer && type != FZ_IMAGE_UNKNOWN) {
3524-
buf = buffer->buffer;
3525-
ext = JM_image_extension(type);
3526-
} else {
3525+
if (!buffer || type == FZ_IMAGE_UNKNOWN)
3526+
{
35273527
buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params);
35283528
ext = "png";
35293529
}
3530+
else if (n == 4 && strcmp(ext, "jpeg") == 0) // JPEG CMYK needs another step
3531+
{
3532+
buf = freebuf = fz_new_buffer_from_image_as_jpeg(ctx, image, fz_default_color_params, 95, 1);
3533+
}
3534+
else
3535+
{
3536+
buf = buffer->buffer;
3537+
}
35303538
bytes = JM_BinFromBuffer(buf);
35313539
}
35323540
fz_always(ctx) {

0 commit comments

Comments
 (0)