-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractor.go
More file actions
73 lines (63 loc) · 2.02 KB
/
extractor.go
File metadata and controls
73 lines (63 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package main
import (
"os"
"path/filepath"
"regexp"
"strings"
)
// Patterns used to pull link targets out of markdown text.
var (
	// urlPattern matches an http:// or https:// URL and stops at the first
	// whitespace, ')', ']', '"', '\'', '<', '>' or backtick. That is enough
	// to isolate the target of [text](url), ![alt](url), <url> and bare URLs.
	urlPattern = regexp.MustCompile("https?://[^\\s\\)\\]\"'<>`]+")

	// fencedCodeBlock matches a fenced code block delimited by ``` or ~~~,
	// spanning lines, up to the nearest closing fence of the same kind.
	fencedCodeBlock = regexp.MustCompile("(?s)```" + `[\s\S]*?` + "```|~~~" + `[\s\S]*?~~~`)

	// inlineCode matches a non-empty `inline code` span that stays on one line.
	inlineCode = regexp.MustCompile("`[^`\n]+`")

	// invalidURLChars matches any single character from the set $ { } | \ ^,
	// which indicates shell-variable or template syntax rather than a real URL.
	invalidURLChars = regexp.MustCompile(`[${}|\\^]`)
)
// FindMarkdownFiles walks the tree rooted at dir and returns the path of
// every non-directory entry whose name ends in ".md" (case-sensitive).
//
// Each path is the one produced by the walk, i.e. dir joined with the entry's
// location below it, and paths appear in the walk's lexical order. A
// directory is never reported, even when its own name ends in ".md", but it
// is still descended into.
//
// The first error reported by the walk stops it; that error is returned
// together with the paths collected up to that point.
func FindMarkdownFiles(dir string) ([]string, error) {
	var files []string
	// WalkDir passes the directory entry straight to the callback, which
	// avoids the per-entry Lstat that filepath.Walk performs just to learn
	// the name and whether the entry is a directory.
	err := filepath.WalkDir(dir, func(path string, d os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if d.IsDir() || filepath.Ext(d.Name()) != ".md" {
			return nil
		}
		files = append(files, path)
		return nil
	})
	return files, err
}
// stripCode returns s with every fencedCodeBlock match, and then every
// inlineCode match, replaced by a single space, so URLs inside code examples
// are not checked.
//
// A space is substituted instead of the empty string so that the text on
// either side of a removed span is not joined together. Without it,
// "https://api.io/users/`id`/posts" would collapse into
// "https://api.io/users//posts", a URL that never appeared in the document.
func stripCode(s string) string {
	const gap = " "
	// Fenced blocks go first: they may themselves contain backtick spans.
	s = fencedCodeBlock.ReplaceAllString(s, gap)
	return inlineCode.ReplaceAllString(s, gap)
}
// ExtractURLs returns all unique http/https URLs found in a markdown file,
// excluding URLs inside code blocks or containing invalid host characters.
func ExtractURLs(path string) ([]string, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, err
}
content := stripCode(string(data))
raw := urlPattern.FindAllString(content, -1)
seen := make(map[string]bool)
var urls []string
for _, u := range raw {
// trim trailing punctuation that is not part of the URL
u = strings.TrimRight(u, ".,;:!?)")
// skip URLs with shell variables or invalid host characters
if invalidURLChars.MatchString(u) {
continue
}
if !seen[u] {
seen[u] = true
urls = append(urls, u)
}
}
return urls, nil
}