Skip to content
This repository was archived by the owner on Jul 7, 2020. It is now read-only.

panic on some PDFs + suspect memory leak #9

@mark-summerfield

Description

@mark-summerfield

I have the following Go program that uses this library:

package main

import (
	"fmt"
	"os"
	"strconv"
	"rsc.io/pdf"
)

// main dumps the text chunks of a PDF file given on the command line.
//
// Usage: pdfpage file.pdf [pnum]
//
// With exactly one extra argument after the file name, only that page is
// printed; an argument that does not parse as an integer falls back to
// page 1. In every other case all pages from 1 through NumPage() are
// printed, each followed by a blank line.
//
// Exit status is 1 when usage is shown and 2 when the file cannot be opened.
func main() {
	// Show usage when no file was given or help was requested explicitly.
	showUsage := len(os.Args) < 2
	if !showUsage {
		switch os.Args[1] {
		case "-h", "--help":
			showUsage = true
		}
	}
	if showUsage {
		fmt.Println("usage: pdfpage file.pdf [pnum]")
		os.Exit(1)
	}

	doc, openErr := pdf.Open(os.Args[1])
	if openErr != nil {
		fmt.Println(openErr)
		os.Exit(2)
	}

	// Anything other than exactly "file.pdf pnum" means: print every page.
	if len(os.Args) != 3 {
		for n := 1; n <= doc.NumPage(); n++ {
			fmt.Printf("PAGE %d\n", n)
			printPage(doc, n)
			fmt.Println("")
		}
		return
	}

	// Single-page mode: an unparsable page argument defaults to page 1.
	n, convErr := strconv.Atoi(os.Args[2])
	if convErr != nil {
		n = 1
	}
	fmt.Printf("PAGE %d\n", n)
	printPage(doc, n)
}

// printPage writes one line to standard output for every text chunk on page
// pnum of reader, giving the chunk's X, Y, W, quoted string S, Font and
// FontSize.
//
// If the page's underlying value is null, it reports the failure and exits
// the process with status 3.
func printPage(reader *pdf.Reader, pnum int) {
	const chunkFormat = "x=%06.2f y=%06.2f w=%06.2f %q %s %.1fpt\n"

	pg := reader.Page(pnum)
	if pg.V.IsNull() {
		fmt.Printf("failed to read page %d\n", pnum)
		os.Exit(3)
	}

	texts := pg.Content().Text
	for i := range texts {
		t := &texts[i]
		fmt.Printf(chunkFormat, t.X, t.Y, t.W, t.S, t.Font, t.FontSize)
	}
}

This builds and runs fine, and for many PDFs it gives the expected output (although it is rather slow).
However, I have a few PDFs that produce a panic:

PAGE 1
panic: malformed PDF: reading at offset 0: stream not present

goroutine 1 [running]:
rsc.io/pdf.(*buffer).errorf(0xc4200d3948, 0x507f70, 0x27, 0xc4200d36d0, 0x2, 0x2)
	/home/mark/app/go/src/rsc.io/pdf/lex.go:82 +0x74
rsc.io/pdf.(*buffer).reload(0xc4200d3948, 0x8)
	/home/mark/app/go/src/rsc.io/pdf/lex.go:95 +0x193
rsc.io/pdf.(*buffer).readByte(0xc4200d3948, 0x599da0)
	/home/mark/app/go/src/rsc.io/pdf/lex.go:71 +0x69
rsc.io/pdf.(*buffer).readToken(0xc4200d3948, 0xc42000aca0, 0x1000)
	/home/mark/app/go/src/rsc.io/pdf/lex.go:135 +0x4a
rsc.io/pdf.Interpret(0xc42006e060, 0x37, 0x4d78a0, 0xc42000ab60, 0xc4200d3b08)
	/home/mark/app/go/src/rsc.io/pdf/ps.go:64 +0x1c6
rsc.io/pdf.Page.Content(0xc42006e060, 0x37, 0x4db2e0, 0xc420014810, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
	/home/mark/app/go/src/rsc.io/pdf/page.go:613 +0x326
main.printPage(0xc42006e060, 0x1)
	/home/mark/app/go/src/pdfpage2/main.go:47 +0xa8
main.main()
	/home/mark/app/go/src/pdfpage2/main.go:35 +0x25d

I also have a 647-page PDF for which the program outputs the first 22 pages, then outputs PAGE 23, and then just sits there eating memory and using ~25% CPU. That particular page has some Japanese characters, but I don't know if they are Unicode text or paths.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions