Getting text remove issue from pdf after search and redaction #2457

ashifaliclientpoint · 2023-06-06T14:20:33Z

ashifaliclientpoint
Jun 6, 2023

Hello
I am using PyMuPDF (1.19.6) to search for strings in a PDF file and then redact the matches. However, the redaction also removes text that is located just above each match.
Please help me for this strange issue.
I am also attaching the original file and converted file.
DRAFT_Executive.pdf
highlighted_file.pdf

Steps to reproduce
import re
import fitz
import sys, json
# Find bracketed tags such as "[s:a:o]" on every page of a PDF, record their
# positions, redact them, and save the result as "highlighted_file.pdf".
#
# Outputs left at module level:
#   resultOutput - list of every tag entry found (dicts, see _tag_entry)
#   tagsPerPage  - {0-based page number: [tag entries on that page]}
#   addedTags    - set of coordinate keys used to skip duplicate hits

file_path = "DRAFT_Executive.pdf"

# The square brackets are literal characters and must be escaped; unescaped,
# the leading "[" opens a character class and the pattern does not compile.
# Inside a character class "|" is a literal pipe, not alternation, so the
# classes are written without it.
pattern = r'\[\s*([scdit]):([a-z]):([or])\s*\]'  # Replace with your desired regex pattern
tag_regex = re.compile(pattern, re.IGNORECASE | re.MULTILINE | re.DOTALL)

# Redaction removes every character whose bbox overlaps the redaction
# rectangle. With the default (line-height based) glyph boxes, a search hit
# usually overlaps the boxes of the line above, so that text is removed too.
# Switching to small glyph heights *before* searching makes the hit
# rectangles (and character boxes) only as tall as the font size.
# NOTE: this is a global PyMuPDF setting, not a per-document one.
fitz.TOOLS.set_small_glyph_heights(True)


def _tag_entry(tag, x1, y1, x2, y2, page_no):
    """Build one result record; *page_no* is the 1-based page number."""
    return {'y2': y2, 'y1': y1, 'tag': tag, 'x2': x2, 'x1': x1, 'page': page_no}


def _coord_key(entry, page_index):
    """Return the de-duplication key for *entry* on 0-based *page_index*."""
    return f"{entry['y2']}_{entry['y1']}_{entry['x2']}_{entry['x1']}_{page_index}"


doc = fitz.open(file_path)
resultOutput = []
tagsPerPage = {}
addedTags = set()

# Pass 1: locate the tags.
for page in doc:
    text = page.get_text()
    page_height = page.rect.height
    page_tags = tagsPerPage[page.number] = []

    for match in tag_regex.finditer(text):
        tag = match.group()
        tempDict = {}       # fragment accumulated so far for a split tag
        needleStarted = 0   # 1 while between a "[" fragment and its "]"

        for rect in page.search_for(tag):
            x1, top, x2, bottom = rect
            # Positions are stored measured from the bottom of the page;
            # they are flipped back before redacting in pass 2.
            y1 = page_height - bottom
            y2 = page_height - top

            currText = page.get_text("text", clip=rect)
            currTextTrimmed = ''.join(currText.split())

            if tag_regex.match(currTextTrimmed):
                # The hit rectangle holds a complete tag.
                entry = _tag_entry(tag, x1, y1, x2, y2, page.number + 1)
                currCoordsStr = _coord_key(entry, page.number)
                if currCoordsStr not in addedTags:
                    addedTags.add(currCoordsStr)
                    page_tags.append(entry)
                    resultOutput.append(dict(entry))
            elif needleStarted == 0 and '[' in currTextTrimmed:
                # First fragment of a tag that is split over several hits.
                needleStarted = 1
                tempDict = _tag_entry(tag, x1, y1, x2, y2, page.number + 1)
            elif needleStarted == 1 and ']' in currTextTrimmed:
                # Closing fragment: emit the merged entry unless the
                # fragment collected so far was already recorded.
                needleStarted = 0
                currCoordsStr = _coord_key(tempDict, page.number)
                if currCoordsStr not in addedTags:
                    # NOTE(review): only the horizontal extent is merged;
                    # the vertical extent is that of this last fragment.
                    # Confirm this is intended for tags wrapped over lines.
                    x2 = max(x2, tempDict['x2'])
                    x1 = min(x1, tempDict['x1'])
                    entry = _tag_entry(tag, x1, y1, x2, y2, page.number + 1)
                    addedTags.add(currCoordsStr)
                    page_tags.append(entry)
                    resultOutput.append(entry)
                tempDict = {}
            else:
                # Middle (or stray) fragment: widen the pending fragment
                # horizontally and move it to the current line.
                x2 = max(x2, tempDict.get('x2', x2))
                x1 = min(x1, tempDict.get('x1', x1))
                tempDict = _tag_entry(tag, x1, y1, x2, y2, page.number + 1)

# Pass 2: redact the recorded tags, applying the redactions once per page
# after all annotations for that page have been added.
for page in doc:
    page_tags = tagsPerPage[page.number]
    if not page_tags:
        continue
    page_height = page.rect.height
    for item in page_tags:
        # Flip the stored bottom-up values back to PyMuPDF's top-down y axis.
        top = page_height - item['y2']
        bottom = page_height - item['y1']
        page.add_redact_annot(
            fitz.Rect(item['x1'], top, item['x2'], bottom),
            text_color=(0, 0, 0),
            cross_out=True,
        )
    page.apply_redactions()

doc.save("highlighted_file.pdf", garbage=3, deflate=True)
doc.close()

Configuration
OS ubuntu
Python 3.8
PyMuPDF 1.19.6

JorjMcKie · 2023-06-06T15:03:42Z

JorjMcKie
Jun 6, 2023
Maintainer

This is not a bug, so let me first transfer this to the "discussions" tab.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Getting text remove issue from pdf after search and redaction #2457

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Getting text remove issue from pdf after search and redaction #2457

Uh oh!

Uh oh!

ashifaliclientpoint Jun 6, 2023

Replies: 1 comment

Uh oh!

JorjMcKie Jun 6, 2023 Maintainer

ashifaliclientpoint
Jun 6, 2023

JorjMcKie
Jun 6, 2023
Maintainer