Skip to content

Commit f5ba33b

Browse files
authored
7 add docx to pdoc conversion (#54)
* add pandoc converter from docx to snap * change names from pdoc to snap * resolve lazy loading everywhere * clean up
1 parent 8a889ac commit f5ba33b

File tree

17 files changed

+403
-245
lines changed

17 files changed

+403
-245
lines changed

custom/conf/app.ini

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ LOG_SQL = false
4040
# REPO_INDEXER_TYPE = elasticsearch
4141
# REPO_INDEXER_CONN_STR = http://elastic:qualitygrape10@localhost:9200
4242

43-
4443
[lfs]
4544
PATH = /Users/davidgray/git/bindersnap/data/lfs
4645

@@ -98,8 +97,8 @@ ENABLED = true
9897
[actions]
9998
ENABLED = true
10099

101-
[markup.polidoc]
100+
[markup.snap]
102101
ENABLED = true
103102
RENDER_COMMAND = cat
104-
FILE_EXTENSIONS = .pdoc,.polidoc,.policy
103+
FILE_EXTENSIONS = .pdoc,.snap
105104
IS_INPUT_FILE = true

custom/templates/repo/diff/box.tmpl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
{{$sniffedTypeHead := call $.GetSniffedTypeForBlob $blobHead}}
104104
{{$isImage:= or (call $.IsSniffedTypeAnImage $sniffedTypeBase) (call $.IsSniffedTypeAnImage $sniffedTypeHead)}}
105105
{{$isCsv := (call $.IsCsvFile $file)}}
106-
{{$isPdoc := (call $.IsPdocFile $file)}}
106+
{{$isSnap := (call $.IsSnapFile $file)}}
107107
{{$showFileViewToggle := or $isImage (and (not $file.IsIncomplete) $isCsv)}}
108108
{{$isExpandable := or (gt $file.Addition 0) (gt $file.Deletion 0) $file.IsBin}}
109109
{{$isReviewFile := and $.IsSigned $.PageIsPullFiles (not $.IsArchived) $.IsShowingAllCommits}}
@@ -164,7 +164,7 @@
164164
{{end}}
165165
<button class="btn diff-header-popup-btn tw-p-1">{{svg "octicon-kebab-horizontal" 18}}</button>
166166
<div class="tippy-target">
167-
{{if not (or $file.IsIncomplete $file.IsBin $file.IsSubmodule $isPdoc)}}
167+
{{if not (or $file.IsIncomplete $file.IsBin $file.IsSubmodule $isSnap)}}
168168
<button class="unescape-button item" data-unicode-content-selector="#diff-{{$file.NameHash}}">{{ctx.Locale.Tr "repo.unescape_control_characters"}}</button>
169169
<button class="escape-button tw-hidden item" data-unicode-content-selector="#diff-{{$file.NameHash}}">{{ctx.Locale.Tr "repo.escape_control_characters"}}</button>
170170
{{end}}
@@ -182,10 +182,10 @@
182182
</div>
183183
</h4>
184184
<div class="diff-file-body ui attached unstackable table segment" {{if and $file.IsViewed $.IsShowingAllCommits}}data-folded="true"{{end}}>
185-
<div id="diff-source-{{$file.NameHash}}" class="{{if not $isPdoc}}file-body file-code unicode-escaped{{end}} code-diff{{if $.IsSplitStyle}} code-diff-split{{else}} code-diff-unified{{end}}{{if $showFileViewToggle}} tw-hidden{{end}}">
186-
{{if $isPdoc}}
187-
<div id="diff-rendered-{{$file.NameHash}}" class="file-view markup pdiff {{if $.IsSplitStyle}}code-diff-split{{else}}code-diff-unified{{end}}">
188-
<div hx-get="{{$.Link}}/{{$file.Name}}" hx-trigger="load"></div>
185+
<div id="diff-source-{{$file.NameHash}}" class="{{if not $isSnap}}file-body file-code unicode-escaped{{end}} code-diff{{if $.IsSplitStyle}} code-diff-split{{else}} code-diff-unified{{end}}{{if $showFileViewToggle}} tw-hidden{{end}}">
186+
{{if $isSnap}}
187+
<div id="diff-rendered-{{$file.NameHash}}" class="file-view markup snapdiff {{if $.IsSplitStyle}}code-diff-split{{else}}code-diff-unified{{end}}">
188+
<div hx-get="{{$.Link}}?files={{$file.Name}}&fetch=true&file-only=true" hx-trigger="load"></div>
189189
</div>
190190
{{else if or $file.IsIncomplete $file.IsBin}}
191191
<div class="diff-file-body binary">
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{{range $j, $section := .FileDiff.Sections}}
2-
{{$sectionDiff := $section.GetComputedSectionDiffForPdoc ctx.Locale -}}
2+
{{$sectionDiff := $section.GetComputedSectionDiffForSnap ctx.Locale -}}
33
{{$sectionDiff}}
44
{{end}}

custom/templates/repo/editor/diff_preview.tmpl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{{if .File}}
2-
{{$isPdoc := (call $.IsPdocFile .File)}}
2+
{{$isSnap := (call $.IsSnapFile .File)}}
33
<div class="diff-file-box">
44
<div class="ui attached table segment">
5-
{{if $isPdoc}}
6-
<div class="markup pdiff file-view code-diff-unified unicode-escaped">
7-
{{template "repo/diff/pdoc_diff" dict "FileDiff" .File}}
5+
{{if $isSnap}}
6+
<div class="markup snapdiff file-view code-diff-unified unicode-escaped">
7+
{{template "repo/diff/snap_diff" dict "FileDiff" .File}}
88
</div>
99
{{else}}
1010
<div class="file-body file-code code-diff code-diff-unified unicode-escaped">

models/repo/upload.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"context"
99
"fmt"
1010
"io"
11-
"mime/multipart"
1211
"os"
1312
"path"
1413

@@ -62,7 +61,7 @@ func (upload *Upload) LocalPath() string {
6261
}
6362

6463
// NewUpload creates a new upload object.
65-
func NewUpload(ctx context.Context, name string, buf []byte, file multipart.File) (_ *Upload, err error) {
64+
func NewUpload(ctx context.Context, name string, buf []byte, file io.Reader) (_ *Upload, err error) {
6665
upload := &Upload{
6766
UUID: gouuid.New().String(),
6867
Name: name,

modules/pandoc/command.go

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
package pandoc
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"io"
8+
"os"
9+
"os/exec"
10+
"path/filepath"
11+
"runtime"
12+
"strings"
13+
"time"
14+
15+
"code.gitea.io/gitea/modules/log"
16+
"code.gitea.io/gitea/modules/pandoc/internal"
17+
"code.gitea.io/gitea/modules/process"
18+
"code.gitea.io/gitea/modules/util"
19+
)
20+
21+
// Command represents a command with its subcommands or arguments.
22+
type Command struct {
23+
// prog is the executable to run; NewCommand sets it to PandocExecutable.
prog string
24+
// args are the command-line arguments passed to prog.
args []string
25+
// parentContext is the context from which run derives the process context.
parentContext context.Context
26+
// globalArgsLength is the number of leading args that LogString collapses into "...global..."; NewCommand sets it to 0.
globalArgsLength int
27+
// brokenArgs collects values rejected by AddArgumentValues; when it is non-empty, run refuses to execute and returns ErrBrokenCommand.
brokenArgs []string
28+
}
29+
30+
// RunOpts represents parameters to run the command. If UseContextTimeout is specified, then Timeout is ignored.
31+
type RunOpts struct {
32+
// Env is the environment of the process; when nil, the current process environment (os.Environ) is used.
Env []string
33+
// Timeout is the maximum run time; a value <= 0 selects defaultCommandExecutionTimeout.
Timeout time.Duration
34+
// UseContextTimeout makes the command rely on the parent context only, so no timeout is added for this run.
UseContextTimeout bool
35+
36+
// Dir is the working dir for the pandoc command
37+
Dir string
38+
39+
// Stdout and Stderr are assigned to the process as-is.
Stdout, Stderr io.Writer
40+
41+
// Stdin is used for passing input to the command
42+
// The caller must make sure the Stdin writer is closed properly to finish the Run function.
43+
// Otherwise, the Run function may hang for long time or forever, especially when the Pandoc's context deadline is not the same as the caller's.
44+
// Some common mistakes:
45+
// * `defer stdinWriter.Close()` then call `cmd.Run()`: the Run() would never return if the command is killed by timeout
46+
// * `go { case <- parentContext.Done(): stdinWriter.Close() }` with `cmd.Run(DefaultTimeout)`: the command would have been killed by timeout but the Run doesn't return until stdinWriter.Close()
47+
// * `go { if stdoutReader.Read() err != nil: stdinWriter.Close() }` with `cmd.Run()`: the stdoutReader may never return error if the command is killed by timeout
48+
// In the future, ideally the pandoc module itself should have full control of the stdin, to avoid such problems and make it easier to refactor to a better architecture.
49+
Stdin io.Reader
50+
51+
// PipelineFunc, when set, is called after the process has started and before it is waited for.
// It receives the process context and its cancel function; a non-nil error cancels the process and is returned by Run.
PipelineFunc func(context.Context, context.CancelFunc) error
52+
}
53+
54+
// ErrBrokenCommand is returned by Run when the command holds argument values that AddArgumentValues rejected as unsafe.
var ErrBrokenCommand = errors.New("pandoc command is broken")
55+
56+
// Run runs the command with the RunOpts
57+
func (c *Command) Run(opts *RunOpts) error {
58+
return c.run(1, opts)
59+
}
60+
61+
func (c *Command) run(skip int, opts *RunOpts) error {
62+
if len(c.brokenArgs) != 0 {
63+
log.Error("pandoc command is broken: %s, broken args: %s", c.LogString(), strings.Join(c.brokenArgs, " "))
64+
return ErrBrokenCommand
65+
}
66+
if opts == nil {
67+
opts = &RunOpts{}
68+
}
69+
70+
// We must not change the provided options
71+
timeout := opts.Timeout
72+
if timeout <= 0 {
73+
timeout = defaultCommandExecutionTimeout
74+
}
75+
76+
var desc string
77+
callerInfo := util.CallerFuncName(1 /* util */ + 1 /* this */ + skip /* parent */)
78+
if pos := strings.LastIndex(callerInfo, "/"); pos >= 0 {
79+
callerInfo = callerInfo[pos+1:]
80+
}
81+
// these logs are for debugging purposes only, so no guarantee of correctness or stability
82+
desc = fmt.Sprintf("pandoc.Run(by:%s, repo:%s): %s", callerInfo, logArgSanitize(opts.Dir), c.LogString())
83+
log.Debug("pandoc.Command: %s", desc)
84+
85+
var ctx context.Context
86+
var cancel context.CancelFunc
87+
var finished context.CancelFunc
88+
89+
if opts.UseContextTimeout {
90+
ctx, cancel, finished = process.GetManager().AddContext(c.parentContext, desc)
91+
} else {
92+
ctx, cancel, finished = process.GetManager().AddContextTimeout(c.parentContext, timeout, desc)
93+
}
94+
defer finished()
95+
96+
startTime := time.Now()
97+
98+
cmd := exec.CommandContext(ctx, c.prog, c.args...)
99+
if opts.Env == nil {
100+
cmd.Env = os.Environ()
101+
} else {
102+
cmd.Env = opts.Env
103+
}
104+
105+
process.SetSysProcAttribute(cmd)
106+
cmd.Dir = opts.Dir
107+
cmd.Stdout = opts.Stdout
108+
cmd.Stderr = opts.Stderr
109+
cmd.Stdin = opts.Stdin
110+
if err := cmd.Start(); err != nil {
111+
return err
112+
}
113+
114+
if opts.PipelineFunc != nil {
115+
err := opts.PipelineFunc(ctx, cancel)
116+
if err != nil {
117+
cancel()
118+
_ = cmd.Wait()
119+
return err
120+
}
121+
}
122+
123+
err := cmd.Wait()
124+
elapsed := time.Since(startTime)
125+
if elapsed > time.Second {
126+
log.Debug("slow pandoc.Command.Run: %s (%s)", c, elapsed)
127+
}
128+
129+
// We need to check if the context is canceled by the program on Windows.
130+
// This is because Windows does not have signal checking when terminating the process.
131+
// It always returns exit code 1, unlike Linux, which has many exit codes for signals.
132+
if runtime.GOOS == "windows" &&
133+
err != nil &&
134+
err.Error() == "" &&
135+
cmd.ProcessState.ExitCode() == 1 &&
136+
ctx.Err() == context.Canceled {
137+
return ctx.Err()
138+
}
139+
140+
if err != nil && ctx.Err() != context.DeadlineExceeded {
141+
return err
142+
}
143+
144+
return ctx.Err()
145+
}
146+
147+
func (c *Command) LogString() string {
148+
// WARNING: this function is for debugging purposes only. It's much better than old code (which only joins args with space),
149+
// It's impossible to make a simple and 100% correct implementation of argument quoting for different platforms here.
150+
debugQuote := func(s string) string {
151+
if strings.ContainsAny(s, " `'\"\t\r\n") {
152+
return fmt.Sprintf("%q", s)
153+
}
154+
return s
155+
}
156+
a := make([]string, 0, len(c.args)+1)
157+
a = append(a, debugQuote(c.prog))
158+
if c.globalArgsLength > 0 {
159+
a = append(a, "...global...")
160+
}
161+
for i := c.globalArgsLength; i < len(c.args); i++ {
162+
a = append(a, debugQuote(logArgSanitize(c.args[i])))
163+
}
164+
return strings.Join(a, " ")
165+
}
166+
167+
func logArgSanitize(arg string) string {
168+
if strings.Contains(arg, "://") && strings.Contains(arg, "@") {
169+
return util.SanitizeCredentialURLs(arg)
170+
} else if filepath.IsAbs(arg) {
171+
base := filepath.Base(arg)
172+
dir := filepath.Dir(arg)
173+
return filepath.Join(filepath.Base(dir), base)
174+
}
175+
return arg
176+
}
177+
178+
// NewCommand creates and returns a new Pandoc Command based on given command and arguments.
179+
// Each argument should be safe to be trusted. User-provided arguments should be passed to AddArgumentValues instead.
180+
func NewCommand(ctx context.Context, args ...internal.CmdArg) *Command {
181+
// Make an explicit copy of globalCommandArgs, otherwise append might overwrite it
182+
cargs := make([]string, 0, len(args))
183+
for _, arg := range args {
184+
cargs = append(cargs, string(arg))
185+
}
186+
return &Command{
187+
prog: PandocExecutable,
188+
args: cargs,
189+
parentContext: ctx,
190+
globalArgsLength: 0,
191+
}
192+
}
193+
194+
// AddArguments adds new pandoc arguments (option/value) to the command. It only accepts string literals, or trusted CmdArg.
195+
// Type CmdArg is in the internal package, so it can not be used outside of this package directly,
196+
// it makes sure that user-provided arguments won't cause RCE risks.
197+
// User-provided arguments should be passed by other AddXxx functions
198+
func (c *Command) AddArguments(args ...internal.CmdArg) *Command {
199+
for _, arg := range args {
200+
c.args = append(c.args, string(arg))
201+
}
202+
return c
203+
}
204+
205+
// AddArgumentValues adds new dynamic argument values to the command.
206+
// The arguments may come from user input and can not be trusted, so no leading '-' is allowed to avoid passing options.
207+
func (c *Command) AddArgumentValues(args ...string) *Command {
208+
for _, arg := range args {
209+
if !isSafeArgumentValue(arg) {
210+
c.brokenArgs = append(c.brokenArgs, arg)
211+
}
212+
}
213+
if len(c.brokenArgs) != 0 {
214+
return c
215+
}
216+
c.args = append(c.args, args...)
217+
return c
218+
}
219+
220+
// isSafeArgumentValue reports whether s can be used as a plain value, i.e. it cannot be
// mistaken for an option. The empty string is safe; anything starting with '-' is not.
func isSafeArgumentValue(s string) bool {
	if len(s) == 0 {
		return true
	}
	return s[0] != '-'
}

modules/pandoc/internal/cmdarg.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Copyright 2023 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package internal
5+
6+
// CmdArg represents a command argument for the pandoc command, and it will be used for the pandoc command directly without any further processing.
7+
// In most cases, you should use the "AddXxx" functions to add arguments, but not use this type directly.
8+
// Casting a risky (user-provided) string to CmdArg would cause security issues if it's injected with a "--xxx" argument.
9+
type CmdArg string

modules/pandoc/pandoc.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package pandoc
2+
3+
import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"os/exec"
	"time"

	"code.gitea.io/gitea/modules/setting"
)
13+
14+
var (
15+
// PandocExecutable is the command name of pandoc; SetExecutablePath replaces it with the absolute path found by exec.LookPath.
16+
PandocExecutable = "pandoc"
17+
// defaultCommandExecutionTimeout is the timeout applied to a command when RunOpts.Timeout is not positive; InitSimple may override it from the settings.
18+
defaultCommandExecutionTimeout = 360 * time.Second
19+
)
20+
21+
// SetExecutablePath changes the path of pandoc executable and checks the file permission and version.
22+
func SetExecutablePath(path string) error {
23+
// If path is empty, we use the default value of PandocExecutable "pandoc" to search for the location of pandoc.
24+
if path != "" {
25+
PandocExecutable = path
26+
}
27+
absPath, err := exec.LookPath(PandocExecutable)
28+
if err != nil {
29+
return fmt.Errorf("pandoc not found: %w", err)
30+
}
31+
PandocExecutable = absPath
32+
return nil
33+
}
34+
35+
// InitSimple initializes pandoc module with a very simple step, no config changes, no global command arguments.
36+
// This method doesn't change anything to filesystem. At the moment, it is only used by some Bindersnap sub-commands.
37+
func InitSimple(ctx context.Context) error {
38+
if setting.Pandoc.HomePath == "" {
39+
return errors.New("unable to init Pandoc's HomeDir, incorrect initialization of the setting and pandoc modules")
40+
}
41+
42+
if setting.Pandoc.Timeout.Default > 0 {
43+
defaultCommandExecutionTimeout = time.Duration(setting.Pandoc.Timeout.Default) * time.Second
44+
}
45+
46+
if err := SetExecutablePath(setting.Pandoc.Path); err != nil {
47+
return err
48+
}
49+
50+
// TODO: check pandoc version
51+
return nil
52+
}
53+
54+
func ConvertDocxToSnap(ctx context.Context, in io.Reader, out io.Writer) error {
55+
var cmd *Command
56+
var stderr io.Writer
57+
cmd = NewCommand(ctx).AddArguments("-f", "docx", "-t", "html")
58+
if err := cmd.Run(&RunOpts{
59+
Stdout: out,
60+
Stderr: stderr,
61+
Stdin: in,
62+
}); err != nil {
63+
return fmt.Errorf("Run: %w - %s", err, stderr)
64+
}
65+
return nil
66+
}

0 commit comments

Comments
 (0)