diff --git a/cmd/toMarkdown/main.go b/cmd/toMarkdown/main.go
new file mode 100644
index 0000000..79c3aa9
--- /dev/null
+++ b/cmd/toMarkdown/main.go
@@ -0,0 +1,184 @@
+package main
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	nethttp "net/http"
+
+	"github.com/mark3labs/mcp-go/mcp"
+	"github.com/mark3labs/mcp-go/server"
+	"github.com/owulveryck/rePocketable/internal/http"
+	"github.com/owulveryck/rePocketable/internal/markdown"
+	"github.com/owulveryck/rePocketable/internal/pocket"
+)
+
+// DB maps keys to pocket items. Neither main.go nor usage.go references it.
+var DB map[string]pocket.Item
+
+// headers collects repeatable "Name: value" command-line flags; it implements
+// flag.Value and has the same underlying type as net/http.Header.
+type headers map[string][]string
+
+// String renders every header as "name: [values]|", sorted by name so the
+// output is stable from one call to the next.
+func (h headers) String() string {
+	names := make([]string, 0, len(h))
+	for name := range h {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	var b strings.Builder
+	for _, name := range names {
+		fmt.Fprintf(&b, "%v: %v|", name, h[name])
+	}
+	return b.String()
+}
+
+// Set parses v as "Name: value" and appends value to the entries stored under
+// Name. Whitespace around both parts is dropped, so "Accept: text/html" does
+// not store a value that starts with a space. It returns an error when v has
+// no colon or an empty name.
+func (h headers) Set(v string) error {
+	name, value, found := strings.Cut(v, ":")
+	name = strings.TrimSpace(name)
+	if !found || name == "" {
+		return errors.New("bad header passed")
+	}
+	h[name] = append(h[name], strings.TrimSpace(value))
+	return nil
+}
+
+// main starts the MCP stdio server when the program is run without any
+// argument, and runs the command-line converter otherwise. "-h" is left to the
+// CLI path, where usage() handles it.
+func main() {
+	if len(os.Args) <= 1 {
+		startMCPServer()
+		return
+	}
+
+	runCLIMode()
+}
+
+// startMCPServer registers the ToMarkdown tool and serves it over stdio until
+// ServeStdio returns.
+func startMCPServer() {
+	s := server.NewMCPServer(
+		"ToMarkdown 📄",
+		"1.0.0",
+	)
+
+	tool := mcp.NewTool("ToMarkdown",
+		mcp.WithDescription("Converts a web page to markdown. Provide a URL and receive the content converted to markdown format. The tool handles downloading the content, extracting the main text, and formatting it as clean markdown."),
+		mcp.WithString("url",
+			mcp.Required(),
+			mcp.Description("The URL of the web page to convert to markdown"),
+		),
+	)
+
+	s.AddTool(tool, toMarkdownHandler)
+
+	// Standard output carries the protocol messages in stdio mode, so the
+	// failure is reported on standard error and through the exit status.
+	if err := server.ServeStdio(s); err != nil {
+		fmt.Fprintf(os.Stderr, "Server error: %v\n", err)
+		os.Exit(1)
+	}
+}
+
+// toMarkdownHandler handles a ToMarkdown tool call: it fills a document from
+// the "url" argument and returns its markdown rendering as a text result.
+func toMarkdownHandler(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) {
+	url, ok := request.Params.Arguments["url"].(string)
+	if !ok {
+		return nil, errors.New("url must be a string")
+	}
+	if strings.TrimSpace(url) == "" {
+		return nil, errors.New("url must not be empty")
+	}
+
+	downloader, err := http.NewDownloader(nethttp.Header{})
+	if err != nil {
+		return nil, fmt.Errorf("failed to initialize HTTP client: %w", err)
+	}
+
+	doc := markdown.NewDocument(pocket.Item{
+		ResolvedURL: url,
+		GivenURL:    url,
+	})
+	doc.Client = downloader.HTTPClient
+
+	if err := doc.Fill(ctx); err != nil {
+		return nil, fmt.Errorf("cannot fill document: %w", err)
+	}
+
+	// Render into memory: the result goes back to the MCP client, not to disk.
+	var buf bytes.Buffer
+	if err := doc.WriteTo(&buf); err != nil {
+		return nil, fmt.Errorf("cannot convert document to markdown: %w", err)
+	}
+
+	return mcp.NewToolResultText(buf.String()), nil
+}
+
+// runCLIMode parses the command-line flags and converts the page named by the
+// last positional argument into "<base name of the URL>.md" in the current
+// directory. It exits with a non-zero status on any failure.
+func runCLIMode() {
+	headersFlag := make(headers)
+	flag.Var(headersFlag, "H", "header")
+
+	// usage() also calls flag.Parse, so positional arguments are available below.
+	if usage() {
+		return
+	}
+
+	if flag.NArg() == 0 {
+		log.Fatal("missing URL argument")
+	}
+	target := flag.Arg(flag.NArg() - 1)
+
+	if err := convertToFile(target, nethttp.Header(headersFlag)); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// convertToFile builds a downloader from header, fills a document from target
+// and writes its markdown rendering to "<base name of target>.md".
+func convertToFile(target string, header nethttp.Header) error {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	downloader, err := http.NewDownloader(header)
+	if err != nil {
+		return err
+	}
+
+	doc := markdown.NewDocument(pocket.Item{
+		ResolvedURL: target,
+		GivenURL:    target,
+	})
+	doc.Client = downloader.HTTPClient
+
+	if err := doc.Fill(ctx); err != nil {
+		return fmt.Errorf("cannot fill document: %w", err)
+	}
+
+	outputFilename := fmt.Sprintf("%v.md", filepath.Base(target))
+	log.Println("writing output: ", outputFilename)
+
+	if err := doc.Write(outputFilename); err != nil {
+		return fmt.Errorf("cannot write document: %w", err)
+	}
+	return nil
+}
\ No newline at end of file
diff --git a/cmd/toMarkdown/usage.go b/cmd/toMarkdown/usage.go
new file mode 100644
index 0000000..a41c128
--- /dev/null
+++ b/cmd/toMarkdown/usage.go
@@ -0,0 +1,35 @@
+package main
+
+import (
+	"flag"
+	"os"
+
+	"github.com/owulveryck/rePocketable/internal/http"
+	"github.com/owulveryck/rePocketable/internal/pocket"
+)
+
+// usage registers the -h and -d flags, parses the command line, and reports
+// whether one of them was handled: -h calls Usage on an empty http.Downloader
+// and pocket.Pocket, -d calls Doc on both with os.Stdout. It returns false
+// when neither flag is set, so the caller can go on with its normal work.
+func usage() bool {
+	help := flag.Bool("h", false, "help")
+	doc := flag.Bool("d", false, "generate usage for documentation (MD)")
+	flag.Parse()
+
+	switch {
+	case *help:
+		d := &http.Downloader{}
+		d.Usage()
+		p := &pocket.Pocket{}
+		p.Usage()
+		return true
+	case *doc:
+		d := &http.Downloader{}
+		d.Doc(os.Stdout)
+		p := &pocket.Pocket{}
+		p.Doc(os.Stdout)
+		return true
+	}
+	return false
+}
\ No newline at end of file
diff --git a/go.mod b/go.mod index 08130a8..40646cb 100644 --- a/go.mod +++ b/go.mod @@ -1,24 +1,32 @@ module github.com/owulveryck/rePocketable -go 1.16 +go 1.23 + +toolchain go1.24.0 require ( github.com/bmaupin/go-epub v0.10.0 github.com/cixtor/readability v1.0.1-0.20210921191510-3f20b8dcf057 github.com/disintegration/imaging v1.6.2 github.com/dyatlov/go-opengraph v0.0.0-20210112100619-dae8665a5b09 - github.com/gabriel-vasile/mimetype v1.4.0 // indirect github.com/go-fonts/liberation v0.2.0 github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81 - github.com/gofrs/uuid v4.1.0+incompatible // indirect github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 - github.com/google/uuid v1.3.0 github.com/kelseyhightower/envconfig v1.4.0 + github.com/mark3labs/mcp-go v0.17.0 github.com/motemen/go-pocket v0.0.0-20201204003030-43b897100651 - github.com/onsi/gomega v1.4.3 // indirect github.com/vincent-petithory/dataurl v1.0.0 golang.org/x/image 
v0.0.0-20211028202545-6944b10bf410 golang.org/x/net v0.0.0-20211101193420-4a448f8816b3 +) + +require ( + github.com/fogleman/gg v1.3.0 // indirect + github.com/gabriel-vasile/mimetype v1.4.0 // indirect + github.com/gofrs/uuid v4.1.0+incompatible // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/onsi/gomega v1.4.3 // indirect + github.com/yosida95/uritemplate/v3 v3.0.2 // indirect golang.org/x/text v0.3.7 // indirect gopkg.in/yaml.v2 v2.2.2 // indirect ) diff --git a/go.sum b/go.sum index 5834de7..da82418 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/bmaupin/go-epub v0.9.0 h1:ewGWcGF2ECokLG4Fe7oLl/uWZeepFD345rs8CChHC74= -github.com/bmaupin/go-epub v0.9.0/go.mod h1:mBan+0WgVv5JbPNw1xfnfQoTRN9iPMKBshZwPOL0SY0= github.com/bmaupin/go-epub v0.10.0 h1:KuOrBGE72frtaDDb2j7NyryO1In3Wsdmtm8o0/K8mF0= github.com/bmaupin/go-epub v0.10.0/go.mod h1:mBan+0WgVv5JbPNw1xfnfQoTRN9iPMKBshZwPOL0SY0= github.com/boombuler/barcode v1.0.0/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= @@ -7,6 +5,8 @@ github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl github.com/cixtor/readability v1.0.1-0.20210921191510-3f20b8dcf057 h1:L+UQNGXkDFYi1ABlnU+LU2kE9cRedxMf+uQsHfTLbRI= github.com/cixtor/readability v1.0.1-0.20210921191510-3f20b8dcf057/go.mod h1:WDrZcthrR2RVDxfMu3q0q59UKhReo5mIZAM6w1+MgFo= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/disintegration/imaging v1.6.2 h1:w1LecBlG2Lnp8B3jk5zSuNqd7b4DXhcjwek1ei82L+c= github.com/disintegration/imaging v1.6.2/go.mod h1:44/5580QXChDfwIclfc/PCwrr44amcmDAg8hxG0Ewe4= github.com/dyatlov/go-opengraph v0.0.0-20210112100619-dae8665a5b09 h1:AQLr//nh20BzN3hIWj2+/Gt3FwSs8Nwo/nz4hMIcLPg= @@ -14,7 +14,6 @@ github.com/dyatlov/go-opengraph 
v0.0.0-20210112100619-dae8665a5b09/go.mod h1:nYi github.com/fogleman/gg v1.3.0 h1:/7zJX8F6AaYQc57WQCyN9cAIz+4bCJGO9B+dyW29am8= github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= -github.com/gabriel-vasile/mimetype v1.3.1 h1:qevA6c2MtE1RorlScnixeG0VA1H4xrXyhyX3oWBynNQ= github.com/gabriel-vasile/mimetype v1.3.1/go.mod h1:fA8fi6KUiG7MgQQ+mEWotXoEOvmxRtOJlERCzSmRvr8= github.com/gabriel-vasile/mimetype v1.4.0 h1:Cn9dkdYsMIu56tGho+fqzh7XmvY2YyGU0FnbhiOsEro= github.com/gabriel-vasile/mimetype v1.4.0/go.mod h1:fA8fi6KUiG7MgQQ+mEWotXoEOvmxRtOJlERCzSmRvr8= @@ -30,19 +29,19 @@ github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81 h1:6zl3BbBhdnMkpSj2 github.com/go-latex/latex v0.0.0-20210823091927-c0d11ff05a81/go.mod h1:SX0U8uGpxhq9o2S/CELCSUxEWWAuoCUcVCQWv7G2OCk= github.com/go-pdf/fpdf v0.5.0/go.mod h1:HzcnA+A23uwogo0tp9yU+l3V+KXhiESpt1PMayhOh5M= github.com/gofrs/uuid v3.1.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= -github.com/gofrs/uuid v4.0.0+incompatible h1:1SD/1F5pU8p29ybwgQSwpQk+mwdRrXCYuPhW6m+TnJw= -github.com/gofrs/uuid v4.0.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= github.com/gofrs/uuid v4.1.0+incompatible h1:sIa2eCvUTwgjbqXrPLfNwUf9S3i3mpH1O1atV+iL/Wk= github.com/gofrs/uuid v4.1.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= -github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 
h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/jung-kurt/gofpdf v1.0.0/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/kelseyhightower/envconfig v1.4.0 h1:Im6hONhd3pLkfDFsbRgu68RDNkGF1r3dvMUtDTo2cv8= github.com/kelseyhightower/envconfig v1.4.0/go.mod h1:cccZRl6mQpaq41TPp5QxidR+Sa3axMbJDNb//FQX6Gg= +github.com/mark3labs/mcp-go v0.17.0 h1:5Ps6T7qXr7De/2QTqs9h6BKeZ/qdeUeGrgM5lPzi930= +github.com/mark3labs/mcp-go v0.17.0/go.mod h1:KmJndYv7GIgcPVwEKJjNcbhVQ+hJGJhrCCB/9xITzpE= github.com/motemen/go-pocket v0.0.0-20201204003030-43b897100651 h1:4h2p7Aoo823bPzV+ctcn11FPqdv7WMLSIx1k0fjQnz0= github.com/motemen/go-pocket v0.0.0-20201204003030-43b897100651/go.mod h1:bg7ss2WtX3nP/McrX592dwx4hMYtH2PvP4a6VKGOBto= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -53,28 +52,29 @@ github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk github.com/phpdave11/gofpdi v1.0.13/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w= github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= -github.com/vincent-petithory/dataurl v0.0.0-20191104211930-d1553a71de50 h1:uxE3GYdXIOfhMv3unJKETJEhw78gvzuQqRX/rVirc2A= 
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/vincent-petithory/dataurl v0.0.0-20191104211930-d1553a71de50/go.mod h1:FHafX5vmDzyP+1CQATJn7WFKc9CvnvxyvZy6I1MrG/U= github.com/vincent-petithory/dataurl v1.0.0 h1:cXw+kPto8NLuJtlMsI152irrVw9fRDX8AbShPRpg2CI= github.com/vincent-petithory/dataurl v1.0.0/go.mod h1:FHafX5vmDzyP+1CQATJn7WFKc9CvnvxyvZy6I1MrG/U= +github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= +github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= golang.org/x/image v0.0.0-20190910094157-69e4b8554b2a/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20191009234506-e7c1f5e7dbb8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20200119044424-58c23975cae1/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/image v0.0.0-20210607152325-775e3b0c77b9/go.mod h1:023OzeP/+EPmXeapQh35lcL3II3LrY8Ic+EFFKVhULM= -golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d h1:RNPAfi2nHY7C2srAV8A49jpsYr0ADedCk1wq6fTMTvs= golang.org/x/image v0.0.0-20210628002857-a66eb6448b8d/go.mod h1:023OzeP/+EPmXeapQh35lcL3II3LrY8Ic+EFFKVhULM= golang.org/x/image v0.0.0-20211028202545-6944b10bf410 h1:hTftEOvwiOq2+O8k2D5/Q7COC7k5Qcrgc2TFURJYnvQ= golang.org/x/image v0.0.0-20211028202545-6944b10bf410/go.mod h1:023OzeP/+EPmXeapQh35lcL3II3LrY8Ic+EFFKVhULM= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net 
v0.0.0-20210903162142-ad29c8ab022f h1:w6wWR0H+nyVpbSAQbzVEIACVyr/h8l/BEkY6Sokc7Eg= -golang.org/x/net v0.0.0-20210903162142-ad29c8ab022f/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211101193420-4a448f8816b3 h1:VrJZAjbekhoRn7n5FBujY31gboH+iB3pdLxn3gE9FjU= golang.org/x/net v0.0.0-20211101193420-4a448f8816b3/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -83,7 +83,6 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= @@ -94,3 +93,5 @@ gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWD gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/internal/markdown/markdown.go b/internal/markdown/markdown.go
new file mode 100644
index 0000000..d3f5400
--- /dev/null
+++ b/internal/markdown/markdown.go
@@ -0,0 +1,403 @@
+// Package markdown turns the readable part of a web page into a markdown
+// document with a small metadata header.
+package markdown
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"strings"
+	"sync"
+
+	"github.com/cixtor/readability"
+	"github.com/dyatlov/go-opengraph/opengraph"
+	"github.com/owulveryck/rePocketable/internal/pocket"
+	"golang.org/x/net/html"
+)
+
+// Document represents a markdown document with content extracted from a URL
+type Document struct {
+	Title       string
+	Description string
+	Author      string
+	Content     string
+	URL         string
+	// Client performs the download in Fill; http.DefaultClient is used when nil.
+	Client *http.Client
+	// OG holds the OpenGraph metadata collected by Fill.
+	OG *opengraph.OpenGraph
+}
+
+// NewDocument creates a new markdown document from a pocket item
+func NewDocument(item pocket.Item) *Document {
+	return &Document{
+		URL: item.URL(),
+	}
+}
+
+// Fill downloads d.URL and populates the document: OpenGraph data goes to
+// d.OG and d.Description, the article title and byline to d.Title and
+// d.Author, and the article body, converted to markdown, to d.Content.
+//
+// It returns an error when the request fails, when the server answers with a
+// non-2xx status, or when the page cannot be parsed or converted.
+func (d *Document) Fill(ctx context.Context) error {
+	client := http.DefaultClient
+	if d.Client != nil {
+		client = d.Client
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, d.URL, nil)
+	if err != nil {
+		return fmt.Errorf("cannot create request: %w", err)
+	}
+
+	res, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("cannot fetch document: %w", err)
+	}
+	defer res.Body.Close()
+
+	// Do not convert an error page as if it were the requested article.
+	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
+		return fmt.Errorf("cannot fetch document: unexpected status %q", res.Status)
+	}
+
+	// Get OpenGraph data
+	og, content := getOpenGraph(res.Body)
+	d.OG = og
+
+	doc, err := html.Parse(content)
+	if err != nil {
+		return fmt.Errorf("cannot parse HTML: %w", err)
+	}
+
+	if err := preProcess(doc); err != nil {
+		return fmt.Errorf("cannot pre-process document: %w", err)
+	}
+
+	// Render the cleaned-up tree into memory and hand that to readability. No
+	// goroutine is involved, so a render error cannot be lost and nothing is
+	// left blocked on a pipe if the parser stops reading early.
+	var rendered bytes.Buffer
+	if err := html.Render(&rendered, doc); err != nil {
+		return fmt.Errorf("cannot render document: %w", err)
+	}
+
+	r := readability.New()
+	article, err := r.Parse(&rendered, d.URL)
+	if err != nil {
+		return fmt.Errorf("cannot parse document: %w", err)
+	}
+
+	// Set metadata
+	d.Title = article.Title
+	d.Description = d.OG.Description
+	d.Author = article.Byline
+
+	// Convert HTML to Markdown
+	markdown, err := ConvertHTMLToMarkdown(article.Node)
+	if err != nil {
+		return fmt.Errorf("cannot convert HTML to markdown: %w", err)
+	}
+
+	d.Content = markdown
+	return nil
+}
+
+// Write creates (or truncates) filename and writes the document to it. A
+// failure to close the file is reported, since it can mean the data never
+// reached the disk.
+func (d *Document) Write(filename string) error {
+	f, err := os.Create(filename)
+	if err != nil {
+		return fmt.Errorf("cannot create file: %w", err)
+	}
+
+	if err := d.WriteTo(f); err != nil {
+		f.Close() // best effort; the write error is the one worth reporting
+		return err
+	}
+	if err := f.Close(); err != nil {
+		return fmt.Errorf("cannot close file: %w", err)
+	}
+	return nil
+}
+
+// WriteTo writes the document to w: a "---" delimited metadata header (title,
+// optional author and description, source URL) followed by the title as a
+// level-one heading and the converted content. It returns the error reported
+// by w, if any.
+//
+// The signature differs from io.WriterTo, which also returns a byte count.
+func (d *Document) WriteTo(w io.Writer) error {
+	// Assemble everything in memory first, then hand it to w in one step so
+	// that a write failure is returned to the caller.
+	var out bytes.Buffer
+
+	// Write metadata as frontmatter
+	out.WriteString("---\n")
+	fmt.Fprintf(&out, "title: %s\n", d.Title)
+	if d.Author != "" {
+		fmt.Fprintf(&out, "author: %s\n", d.Author)
+	}
+	if d.Description != "" {
+		fmt.Fprintf(&out, "description: %s\n", d.Description)
+	}
+	fmt.Fprintf(&out, "source: %s\n", d.URL)
+	out.WriteString("---\n\n")
+
+	// Write the content
+	fmt.Fprintf(&out, "# %s\n\n", d.Title)
+	fmt.Fprintf(&out, "%s\n", d.Content)
+
+	if _, err := out.WriteTo(w); err != nil {
+		return fmt.Errorf("cannot write markdown: %w", err)
+	}
+	return nil
+}
+
+// ConvertHTMLToMarkdown converts an HTML node and its descendants to markdown
+// text. A nil node yields an empty string.
+func ConvertHTMLToMarkdown(n *html.Node) (string, error) {
+	var markdown strings.Builder
+
+	if err := traverseHTML(n, &markdown, 0); err != nil {
+		return "", err
+	}
+
+	return markdown.String(), nil
+}
+
+// traverseHTML appends the markdown rendering of n and its descendants to
+// markdown. Text nodes are written trimmed and followed by one space; known
+// elements get their markdown opening marker, then their children, then their
+// closing marker; img elements are skipped together with their subtree.
+// depth is the nesting level of n and is currently not used for the output.
+func traverseHTML(n *html.Node, markdown *strings.Builder, depth int) error {
+	if n == nil {
+		return nil
+	}
+
+	if n.Type == html.TextNode {
+		// The trimmed text never ends with a space, so a separator is always
+		// needed to keep adjacent inline fragments apart.
+		if text := strings.TrimSpace(n.Data); text != "" {
+			markdown.WriteString(text)
+			markdown.WriteString(" ")
+		}
+		return nil
+	}
+
+	if n.Type == html.ElementNode {
+		switch n.Data {
+		case "h1":
+			markdown.WriteString("\n# ")
+		case "h2":
+			markdown.WriteString("\n## ")
+		case "h3":
+			markdown.WriteString("\n### ")
+		case "h4":
+			markdown.WriteString("\n#### ")
+		case "h5":
+			markdown.WriteString("\n##### ")
+		case "h6":
+			markdown.WriteString("\n###### ")
+		case "p":
+			markdown.WriteString("\n\n")
+		case "br":
+			markdown.WriteString("\n")
+		case "strong", "b":
+			markdown.WriteString("**")
+		case "em", "i":
+			markdown.WriteString("*")
+		case "a":
+			// The link is closed, and its URL added, after the children.
+			markdown.WriteString("[")
+		case "ul", "ol":
+			markdown.WriteString("\n")
+		case "li":
+			markdown.WriteString("\n- ")
+		case "blockquote":
+			markdown.WriteString("\n> ")
+		case "code":
+			markdown.WriteString("`")
+		case "pre":
+			markdown.WriteString("\n```\n")
+		case "img":
+			// Images are not rendered.
+			return nil
+		}
+	}
+
+	// Process children
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		if err := traverseHTML(c, markdown, depth+1); err != nil {
+			return err
+		}
+	}
+
+	// Close tags that need closing
+	if n.Type == html.ElementNode {
+		switch n.Data {
+		case "strong", "b":
+			markdown.WriteString("**")
+		case "em", "i":
+			markdown.WriteString("*")
+		case "a":
+			markdown.WriteString("](")
+			// Find href attribute
+			for _, a := range n.Attr {
+				if a.Key == "href" {
+					markdown.WriteString(a.Val)
+					break
+				}
+			}
+			markdown.WriteString(")")
+		case "code":
+			markdown.WriteString("`")
+		case "pre":
+			markdown.WriteString("\n```\n")
+		}
+	}
+
+	return nil
+}
+
+// getOpenGraph extracts the OpenGraph metadata from r and returns it together
+// with a reader over everything that was read from r, so the caller can still
+// parse the complete document.
+func getOpenGraph(r io.Reader) (*opengraph.OpenGraph, io.Reader) {
+	var buf bytes.Buffer
+	og := opengraph.NewOpenGraph()
+	pr, pw := io.Pipe()
+
+	// we need to wait for everything to be done
+	wg := sync.WaitGroup{}
+	wg.Add(2)
+
+	// TeeReader gets the data from the r and also writes it to the PipeWriter
+	tr := io.TeeReader(r, pw)
+
+	go func() {
+		defer wg.Done()
+		defer pw.Close()
+
+		// get data from the TeeReader, which feeds the PipeReader through the PipeWriter
+		if err := og.ProcessHTML(tr); err != nil {
+			log.Println(err)
+		}
+		// ProcessHTML is not guaranteed to read r to the end. Drain what is
+		// left so the tee forwards the whole document to the buffer.
+		if _, err := io.Copy(io.Discard, tr); err != nil {
+			log.Println(err)
+		}
+	}()
+
+	go func() {
+		defer wg.Done()
+		// Copy everything the tee forwards. A failure is logged rather than
+		// fatal: this is library code and must not terminate the process.
+		if _, err := io.Copy(&buf, pr); err != nil {
+			log.Println(err)
+		}
+	}()
+
+	wg.Wait()
+	return og, &buf
+}
+
+// preProcess prepares the HTML for conversion. In every figure element that
+// holds more than one img, it keeps only the image identified by
+// processFigure and removes the others. Comment nodes and script elements are
+// not descended into.
+func preProcess(n *html.Node) error {
+	if n.Type == html.CommentNode || (n.Type == html.ElementNode && n.Data == "script") {
+		return nil
+	}
+
+	if n.Type == html.ElementNode && n.Data == "figure" {
+		f := &figure{}
+		// Pruning is best effort: a figure that cannot be analysed is left
+		// untouched instead of failing the whole conversion.
+		if err := f.processFigure(n); err == nil && len(f.images) > 1 {
+			// Clear all other images (medium, towarddatascience, ...)
+			for _, img := range f.images {
+				if img != f.validImage && img.Parent != nil {
+					img.Parent.RemoveChild(img)
+				}
+			}
+		}
+	}
+
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		if err := preProcess(c); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// figure records the img elements found inside a figure element.
+type figure struct {
+	// images lists every img element found in the figure.
+	images []*html.Node
+	// validImage is the img to keep: the one directly followed by a noscript
+	// element. It stays nil when the figure has no such pair.
+	validImage *html.Node
+}
+
+// processFigure walks n and its descendants, collecting img elements in
+// f.images. When a noscript element directly follows an img, the noscript text
+// is parsed as HTML, the attributes of the img found in it (if any) replace
+// those of the preceding img, and that preceding img becomes f.validImage.
+// It returns an error only when the noscript content cannot be parsed.
+func (f *figure) processFigure(n *html.Node) error {
+	if n.Type == html.ElementNode {
+		switch n.Data {
+		case "img":
+			f.images = append(f.images, n)
+		case "noscript":
+			prev := n.PrevSibling
+			if prev == nil || prev.Type != html.ElementNode || prev.Data != "img" {
+				break
+			}
+			// The img markup is stored as text in the first child, when there is one.
+			if n.FirstChild != nil {
+				doc, err := html.Parse(strings.NewReader(n.FirstChild.Data))
+				if err != nil {
+					return err
+				}
+				if img := getImgNode(doc); img != nil {
+					prev.Attr = img.Attr
+				}
+			}
+			f.validImage = prev
+		}
+	}
+
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		if err := f.processFigure(c); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// getImgNode returns the first img element found in a depth-first walk of
+// node, or nil when there is none.
+func getImgNode(node *html.Node) *html.Node {
+	if node.Type == html.ElementNode && node.Data == "img" {
+		return node
+	}
+	for child := node.FirstChild; child != nil; child = child.NextSibling {
+		if n := getImgNode(child); n != nil {
+			return n
+		}
+	}
+	return nil
+}
\ No newline at end of file