Skip to content

Commit 82fb3a5

Browse files
committed
Corrections for 4186
Extracting JPEG-CMYK images consistently requires inverting the colors. We have taken this opportunity to share as much code as possible between the creation of the image dictionary in 'Document.extract_image' and the creation of the image block in the Python version of text extraction.
1 parent 17e538a commit 82fb3a5

File tree

2 files changed

+65
-107
lines changed

2 files changed

+65
-107
lines changed

src/__init__.py

Lines changed: 51 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -4196,8 +4196,7 @@ def extract_image(self, xref):
41964196
raise ValueError("document closed or encrypted")
41974197

41984198
pdf = _as_pdf_document(self)
4199-
img_type = 0
4200-
smask = 0
4199+
42014200
if not _INRANGE(xref, 1, mupdf.pdf_xref_len(pdf)-1):
42024201
raise ValueError( MSG_BAD_XREF)
42034202

@@ -4210,65 +4209,15 @@ def extract_image(self, xref):
42104209
o = mupdf.pdf_dict_geta(obj, PDF_NAME('SMask'), PDF_NAME('Mask'))
42114210
if o.m_internal:
42124211
smask = mupdf.pdf_to_num(o)
4213-
4214-
if mupdf.pdf_is_jpx_image(obj):
4215-
img_type = mupdf.FZ_IMAGE_JPX
4216-
res = mupdf.pdf_load_stream(obj)
4217-
ext = "jpx"
4218-
if JM_is_jbig2_image(obj):
4219-
img_type = mupdf.FZ_IMAGE_JBIG2
4220-
res = mupdf.pdf_load_stream(obj)
4221-
ext = "jb2"
4222-
res = mupdf.pdf_load_raw_stream(obj)
4223-
if img_type == mupdf.FZ_IMAGE_UNKNOWN:
4224-
res = mupdf.pdf_load_raw_stream(obj)
4225-
_, c = mupdf.fz_buffer_storage(res)
4226-
#log( '{=_ c}')
4227-
img_type = mupdf.fz_recognize_image_format(c)
4228-
ext = JM_image_extension(img_type)
4229-
if img_type == mupdf.FZ_IMAGE_UNKNOWN:
4230-
res = None
4231-
img = mupdf.pdf_load_image(pdf, obj)
4232-
ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal)
4233-
if (ll_cbuf
4234-
and ll_cbuf.params.type not in (
4235-
mupdf.FZ_IMAGE_RAW,
4236-
mupdf.FZ_IMAGE_FAX,
4237-
mupdf.FZ_IMAGE_FLATE,
4238-
mupdf.FZ_IMAGE_LZW,
4239-
mupdf.FZ_IMAGE_RLD,
4240-
)
4241-
):
4242-
img_type = ll_cbuf.params.type
4243-
ext = JM_image_extension(img_type)
4244-
res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer))
4245-
else:
4246-
res = mupdf.fz_new_buffer_from_image_as_png(
4247-
img,
4248-
mupdf.FzColorParams(mupdf.fz_default_color_params),
4249-
)
4250-
ext = "png"
42514212
else:
4252-
img = mupdf.fz_new_image_from_buffer(res)
4253-
4254-
xres, yres = mupdf.fz_image_resolution(img)
4255-
width = img.w()
4256-
height = img.h()
4257-
colorspace = img.n()
4258-
bpc = img.bpc()
4259-
cs_name = mupdf.fz_colorspace_name(img.colorspace())
4213+
smask = 0
42604214

4215+
# load the image
4216+
img = mupdf.pdf_load_image(pdf, obj)
42614217
rc = dict()
4262-
rc[ dictkey_ext] = ext
4263-
rc[ dictkey_smask] = smask
4264-
rc[ dictkey_width] = width
4265-
rc[ dictkey_height] = height
4266-
rc[ dictkey_colorspace] = colorspace
4267-
rc[ dictkey_bpc] = bpc
4268-
rc[ dictkey_xres] = xres
4269-
rc[ dictkey_yres] = yres
4270-
rc[ dictkey_cs_name] = cs_name
4271-
rc[ dictkey_image] = JM_BinFromBuffer(res)
4218+
make_image_dict(img, rc)
4219+
rc[dictkey_smask] = smask
4220+
rc[dictkey_cs_name] = mupdf.fz_colorspace_name(img.colorspace())
42724221
return rc
42734222

42744223
def ez_save(
@@ -16313,19 +16262,6 @@ def JM_irect_from_py(r):
1631316262
f[i] = FZ_MAX_INF_RECT
1631416263
return mupdf.fz_make_irect(f[0], f[1], f[2], f[3])
1631516264

16316-
16317-
def JM_is_jbig2_image(dict_):
16318-
# fixme: should we remove this function?
16319-
return 0
16320-
#filter_ = pdf_dict_get(ctx, dict_, PDF_NAME(Filter));
16321-
#if (pdf_name_eq(ctx, filter_, PDF_NAME(JBIG2Decode)))
16322-
# return 1;
16323-
#n = pdf_array_len(ctx, filter_);
16324-
#for (i = 0; i < n; i++)
16325-
# if (pdf_name_eq(ctx, pdf_array_get(ctx, filter_, i), PDF_NAME(JBIG2Decode)))
16326-
# return 1;
16327-
#return 0;
16328-
1632916265
def JM_listbox_value( annot):
1633016266
'''
1633116267
ListBox retrieve value
@@ -16500,38 +16436,52 @@ def __str__(self):
1650016436
line_dict[dictkey_spans] = span_list
1650116437
return line_rect
1650216438

16439+
def make_image_dict(img, img_dict):
16440+
"""Populate a dictionary with information extracted from a given image.
1650316441

16504-
def JM_make_image_block(block, block_dict):
16505-
image = block.i_image()
16506-
n = mupdf.fz_colorspace_n(image.colorspace())
16507-
w = image.w()
16508-
h = image.h()
16509-
type_ = mupdf.FZ_IMAGE_UNKNOWN
16510-
# fz_compressed_image_buffer() is not available because
16511-
# `fz_compressed_buffer` is not copyable.
16512-
ll_fz_compressed_buffer = mupdf.ll_fz_compressed_image_buffer(image.m_internal)
16513-
if ll_fz_compressed_buffer:
16514-
type_ = ll_fz_compressed_buffer.params.type
16515-
if type_ < mupdf.FZ_IMAGE_BMP or type_ == mupdf.FZ_IMAGE_JBIG2:
16516-
type_ = mupdf.FZ_IMAGE_UNKNOWN
16517-
bytes_ = None
16518-
if ll_fz_compressed_buffer and type_ != mupdf.FZ_IMAGE_UNKNOWN:
16519-
buf = mupdf.FzBuffer( mupdf.ll_fz_keep_buffer( ll_fz_compressed_buffer.buffer))
16520-
ext = JM_image_extension(type_)
16521-
else:
16522-
buf = mupdf.fz_new_buffer_from_image_as_png(image, mupdf.FzColorParams())
16442+
Used by 'Document.extract_image' and by 'JM_make_image_block'.
16443+
Both of these functions will add some more specific information.
16444+
"""
16445+
img_type = img.fz_compressed_image_type()
16446+
ext = JM_image_extension(img_type)
16447+
16448+
# compressed image buffer if present, else None
16449+
ll_cbuf = mupdf.ll_fz_compressed_image_buffer(img.m_internal)
16450+
16451+
if (0
16452+
or not ll_cbuf
16453+
or img_type in (mupdf.FZ_IMAGE_JBIG2, mupdf.FZ_IMAGE_UNKNOWN)
16454+
or img_type < mupdf.FZ_IMAGE_BMP
16455+
):
16456+
# not an image with a compressed buffer: convert to PNG
16457+
res = mupdf.fz_new_buffer_from_image_as_png(
16458+
img,
16459+
mupdf.FzColorParams(mupdf.fz_default_color_params),
16460+
)
1652316461
ext = "png"
16524-
bytes_ = JM_BinFromBuffer(buf)
16525-
block_dict[ dictkey_width] = w
16526-
block_dict[ dictkey_height] = h
16527-
block_dict[ dictkey_ext] = ext
16528-
block_dict[ dictkey_colorspace] = n
16529-
block_dict[ dictkey_xres] = image.xres()
16530-
block_dict[ dictkey_yres] = image.yres()
16531-
block_dict[ dictkey_bpc] = image.bpc()
16532-
block_dict[ dictkey_matrix] = JM_py_from_matrix(block.i_transform())
16533-
block_dict[ dictkey_size] = len(bytes_)
16534-
block_dict[ dictkey_image] = bytes_
16462+
elif ext == "jpeg" and img.n() == 4:
16463+
# JPEG with CMYK: invert colors
16464+
res = mupdf.fz_new_buffer_from_image_as_jpeg(
16465+
img, mupdf.FzColorParams(mupdf.fz_default_color_params), 95, 1)
16466+
else:
16467+
# copy the compressed buffer
16468+
res = mupdf.FzBuffer(mupdf.ll_fz_keep_buffer(ll_cbuf.buffer))
16469+
16470+
bytes_ = JM_BinFromBuffer(res)
16471+
img_dict[dictkey_width] = img.w()
16472+
img_dict[dictkey_height] = img.h()
16473+
img_dict[dictkey_ext] = ext
16474+
img_dict[dictkey_colorspace] = img.n()
16475+
img_dict[dictkey_xres] = img.xres()
16476+
img_dict[dictkey_yres] = img.yres()
16477+
img_dict[dictkey_bpc] = img.bpc()
16478+
img_dict[dictkey_size] = len(bytes_)
16479+
img_dict[dictkey_image] = bytes_
16480+
16481+
def JM_make_image_block(block, block_dict):
16482+
img = block.i_image()
16483+
make_image_dict(img, block_dict)
16484+
block_dict[dictkey_matrix] = JM_py_from_matrix(block.i_transform())
1653516485

1653616486

1653716487
def JM_make_text_block(block, block_dict, raw, buff, tp_rect):

src/extra.i

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3467,22 +3467,30 @@ void JM_make_image_block(fz_stext_block *block, PyObject *block_dict)
34673467
int n = fz_colorspace_n(ctx, image->colorspace);
34683468
int w = image->w;
34693469
int h = image->h;
3470-
const char *ext = NULL;
3470+
const char *ext = "";
34713471
int type = FZ_IMAGE_UNKNOWN;
3472-
if (buffer)
3472+
if (buffer) {
34733473
type = buffer->params.type;
3474+
ext = JM_image_extension(type);
3475+
}
34743476
if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2)
34753477
type = FZ_IMAGE_UNKNOWN;
34763478
PyObject *bytes = NULL;
34773479
fz_var(bytes);
34783480
fz_try(ctx) {
3479-
if (buffer && type != FZ_IMAGE_UNKNOWN) {
3480-
buf = buffer->buffer;
3481-
ext = JM_image_extension(type);
3482-
} else {
3481+
if (!buffer || type == FZ_IMAGE_UNKNOWN)
3482+
{
34833483
buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params);
34843484
ext = "png";
34853485
}
3486+
else if (n == 4 && strcmp(ext, "jpeg") == 0) // JPEG CMYK needs another step
3487+
{
3488+
buf = freebuf = fz_new_buffer_from_image_as_jpeg(ctx, image, fz_default_color_params, 95, 1);
3489+
}
3490+
else
3491+
{
3492+
buf = buffer->buffer;
3493+
}
34863494
bytes = JM_BinFromBuffer(buf);
34873495
}
34883496
fz_always(ctx) {

0 commit comments

Comments
 (0)