diff --git a/PDF Highlighter Script/README.md b/PDF Highlighter Script/README.md new file mode 100644 index 00000000..7bfe8f63 --- /dev/null +++ b/PDF Highlighter Script/README.md @@ -0,0 +1,28 @@ +# PDF Keyword Highlighter + +A command-line tool to highlight one or more keywords in a PDF file using PyMuPDF. Supports multiple keywords, optional case-sensitive search, and outputs per-page highlight statistics. + +## Requirements +- Python `>=3.12` +- PyMuPDF: `pip install pymupdf` + +## Usage +```bash +usage: Highlight keywords in PDF [-h] -i INPUT [-o OUTPUT] -k KEYS [KEYS ...] [-s] + +options: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Input PDF + -o OUTPUT, --output OUTPUT + Output PDF + -k KEYS [KEYS ...], --keys KEYS [KEYS ...] + Keyword(s) to highlight. Sentences are not supported + -s, --sensitive Case-sensitive search +``` +## Examples +```bash +python3 pdf_highlight.py -i input.pdf -k python code script -o highlighted.pdf + +python3 pdf_highlight.py -i input.pdf -k Python -s -o output.pdf # case-sensitive +``` \ No newline at end of file diff --git a/PDF Highlighter Script/pdf_highlight.py b/PDF Highlighter Script/pdf_highlight.py new file mode 100644 index 00000000..633d447d --- /dev/null +++ b/PDF Highlighter Script/pdf_highlight.py @@ -0,0 +1,95 @@ +import argparse +import pymupdf as fitz +import string + +def parse_args() -> argparse.Namespace: + """Parses command-line arguments using `argparse.ArgumentParser()`. + + Returns: + `argparse.Namespace`: Parsed arguments as attributes. + """ + parser = argparse.ArgumentParser("Highlight keywords in PDF") + + # add arguments to be accepted + parser.add_argument("-i", "--input", type=str, required=True, help="Input PDF") + parser.add_argument("-o", "--output", type=str, default="highlighted.pdf", help="Output PDF") + parser.add_argument("-k", "--keys", type=str, required=True, nargs="+" ,help='Keyword(s) to highlight. Sentences are not supported') + parser.add_argument("-s", "--sensitive", action="store_true", help='Case-sensitive search') + + return parser.parse_args() + +def highlight_pdf(input_file : str, output_file : str, keywords : list[str], case_sensitive=False) -> dict[str, int]: + """`Highlghts occurances of `keywords` in the PDF and saves a new file. + + Args: + input_file (str): Path of the input PDF. + output_file (str): Path for the output PDF(highlighted). + keywords (list[str]): List of keywords to highlight + case_sensitive (bool, optional): if True, matching is case-sensitive. Defaults to False. + + Returns: + dict[str, int]: Page numbers and highlight counts. + """ + try: + doc = fitz.open(input_file) + except Exception as e: + print(f"Error opening PDF: {e}") + return {} + + stats = {} + + if case_sensitive: + keyword_set = {key.strip() for key in keywords} + else: + keyword_set = {key.strip().lower() for key in keywords} + + for page in doc: + hits = [] + page_no = f"Page {page.number + 1}" + words = page.get_text("words") + + for word in words: + rect = word[:4] + match_word = word[4].strip(string.punctuation) + if not case_sensitive: + match_word = match_word.lower() + + if match_word in keyword_set: + hits.append(rect) + + if hits: + annotation = page.add_highlight_annot(hits) + stats[page_no] = len(hits) + + doc.save(output_file, garbage=4, deflate=True, clean=True) + doc.close() + + return stats + +def print_stats(stats : dict) -> None: + """Prints highlight statistics. + + Args: + stats (dict): Page numbers and highlight counts. + """ + if not stats: + print("\nNo matches found.\n") + return + total = sum(stats.values()) + + print("\n" + "-"*28) + print("HIGHLIGHT".center(28)) + print("-"*28) + + for page, count in stats.items(): + print(f"{page:18} | {count:3d}") + print("-"*28) + print(f"Total: {total} highlights\n") + +if __name__ == "__main__": + args = parse_args() + print(f"Keywords: {", ".join(args.keys)}") + print(f"Case-sensitive: {args.sensitive}") + + stats = highlight_pdf(args.input, args.output, args.keys, args.sensitive) + print_stats(stats) \ No newline at end of file diff --git a/README.md b/README.md index 5333ecbf..6efd9f9c 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ More information on contributing and the general code of conduct for discussion | Password Generator | [Password Generator](https://github.com/DhanushNehru/Python-Scripts/tree/main/Password%20Generator) | Generates a random password. | | Password Manager | [Password Manager](https://github.com/nem5345/Python-Scripts/tree/main/Password%20Manager) | Generate and interact with a password manager. | | Password Strength Checker | [Password Strength Checker](https://github.com/nem5345/Python-Scripts/tree/main/Password%20Strength%20Checker) | Evaluates how strong a given password is. | +| PDF Highlighter | [PDF Highlighter Script](https://github.com/SurfyPenguin/Python-Scripts/tree/main/PDF%20Highlighter%20Script) | A command-line tool to highlight one or more keywords in a PDF file using PyMuPDF. Supports multiple keywords, and optional case-sensitive search | PDF Merger | [PDF Merger](https://github.com/DhanushNehru/Python-Scripts/tree/main/PDF%20Merger) | Merges multiple PDF files into a single PDF, with options for output location and custom order. | | PDF to Audio | [PDF to Audio](https://github.com/DhanushNehru/Python-Scripts/tree/main/PDF%20to%20Audio) | Converts PDF to audio. | | PDF to Text | [PDF to text](https://github.com/DhanushNehru/Python-Scripts/tree/main/PDF%20to%20text) | Converts PDF to text. |