Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions docx.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"io"
"os"
"strings"
"time"
)

Expand Down Expand Up @@ -61,6 +62,9 @@ func ConvertDocx(r io.Reader) (string, map[string]string, error) {

meta := make(map[string]string)
var textHeader, textBody, textFooter string
var textBodySb64 strings.Builder
var textFooterSb64 strings.Builder
var textHeaderSb64 strings.Builder
for _, override := range contentTypeDefinition.Overrides {
f := zipFiles[override.PartName]

Expand Down Expand Up @@ -92,22 +96,25 @@ func ConvertDocx(r io.Reader) (string, map[string]string, error) {
if err != nil {
return "", nil, err
}
textBody += body + "\n"
textBodySb64.WriteString(body + "\n")
case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml":
footer, err := parseDocxText(f)
if err != nil {
return "", nil, err
}
textFooter += footer + "\n"
textFooterSb64.WriteString(footer + "\n")
case override.ContentType == "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml":
header, err := parseDocxText(f)
if err != nil {
return "", nil, err
}
textHeader += header + "\n"
textHeaderSb64.WriteString(header + "\n")
}

}
textBody += textBodySb64.String()
textFooter += textFooterSb64.String()
textHeader += textHeaderSb64.String()
return textHeader + "\n" + textBody + "\n" + textFooter, meta, nil
}

Expand Down
16 changes: 10 additions & 6 deletions html.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,12 @@ func cleanHTML(r io.Reader, all bool) string {
junkSection := false

d := html.NewTokenizer(r)
var outputSb strings.Builder
for {
// token type
tokenType := d.Next()
if tokenType == html.ErrorToken {
output += outputSb.String()
return output
}
token := d.Token()
Expand All @@ -90,28 +92,29 @@ func cleanHTML(r io.Reader, all bool) string {
}

if !junkSection && mainSection {
output += "<" + token.Data + ">"
outputSb.WriteString("<" + token.Data + ">")
}

case html.TextToken: // text between start and end tag
if !junkSection && mainSection {
output += token.Data
outputSb.WriteString(token.Data)
}

case html.EndTagToken: // </tag>
if !junkSection && mainSection {
output += "</" + token.Data + ">"
outputSb.WriteString("</" + token.Data + ">")
}
if !acceptedHTMLTag(token.Data) {
junkSection = false
}

case html.SelfClosingTagToken: // <tag/>
if !junkSection && mainSection {
output += "<" + token.Data + " />" // TODO: Can probably keep attributes from the meta tags
outputSb.WriteString("<" + token.Data + " />") // TODO: Can probably keep attributes from the meta tags
}
}
}
output += outputSb.String()
}

// HTMLReadabilityOptions is a type which defines parameters that are passed to the justext package.
Expand Down Expand Up @@ -150,14 +153,15 @@ func HTMLReadability(r io.Reader) ([]byte, error) {

useClasses := strings.SplitN(HTMLReadabilityOptionsValues.ReadabilityUseClasses, ",", 10)

output := ""
var outputSb strings.Builder
for _, paragraph := range paragraphSet {
for _, class := range useClasses {
if paragraph.CfClass == class {
output += paragraph.Text + "\n"
outputSb.WriteString(paragraph.Text + "\n")
}
}
}
output := outputSb.String()

return []byte(output), nil
}
Expand Down
5 changes: 3 additions & 2 deletions pptx.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func ConvertPptx(r io.Reader) (string, map[string]string, error) {
}

meta := make(map[string]string)
var textBody string
var textBodySb strings.Builder
for _, override := range contentTypeDefinition.Overrides {
f := zipFiles[override.PartName]

Expand All @@ -59,8 +59,9 @@ func ConvertPptx(r io.Reader) (string, map[string]string, error) {
if err != nil {
return "", nil, fmt.Errorf("could not parse pptx: %v", err)
}
textBody += body + "\n"
textBodySb.WriteString(body + "\n")
}
}
textBody := textBodySb.String()
return strings.TrimSuffix(textBody, "\n"), meta, nil
}
4 changes: 3 additions & 1 deletion rtf.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@ func ConvertRTF(r io.Reader) (string, map[string]string, error) {

// Step through content looking for meta data and stripping out comments
meta := make(map[string]string)
var outputSb strings.Builder
for _, line := range strings.Split(string(tmpOutput), "\n") {
if parts := strings.SplitN(line, ":", 2); len(parts) > 1 {
meta[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
}
if !strings.HasPrefix(line, "### ") {
output += line + "\n"
outputSb.WriteString(line + "\n")
}
}
output += outputSb.String()

// Identify meta data
if tmp, ok := meta["AUTHOR"]; ok {
Expand Down
5 changes: 4 additions & 1 deletion xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/xml"
"fmt"
"io"
"strings"
)

// ConvertXML converts an XML file to text.
Expand Down Expand Up @@ -40,11 +41,13 @@ func XMLToText(r io.Reader, breaks []string, skip []string, strict bool) (string
case xml.CharData:
result += string(v)
case xml.StartElement:
var resultSb strings.Builder
for _, breakElement := range breaks {
if v.Name.Local == breakElement {
result += "\n"
resultSb.WriteString("\n")
}
}
result += resultSb.String()
for _, skipElement := range skip {
if v.Name.Local == skipElement {
depth := 1
Expand Down