-
Notifications
You must be signed in to change notification settings - Fork 2
String only plugin support #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
853cd86
34113ed
f8dd83e
41a00df
6accc44
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,130 @@ | ||
| package html | ||
|
|
||
| import ( | ||
| "strings" | ||
|
|
||
| "github.com/antchfx/htmlquery" | ||
| "golang.org/x/net/html" | ||
| ) | ||
|
|
||
// Document is a tree of nodes that can be queried with XPath.
type Document interface {
	// Find returns the nodes selected by the XPath expression xpath.
	Find(xpath string) []Node
}

// Node is the read-only view of a single tree node exposed by this package.
type Node interface {
	// Identity and classification.
	TagName() string
	IsElement() bool
	Equal(Node) bool

	// Attribute lookup by key.
	GetAttribute(key string) string

	// Tree navigation.
	HasParent() bool
	GetParent() Node
	ChildNodes() []Node

	// Index is the node's position among its siblings.
	Index() int
}
|
|
||
| type HTMLDoc struct { | ||
| root *html.Node | ||
| } | ||
|
|
||
| func NewHTMLDoc(root *html.Node) *HTMLDoc { | ||
| return &HTMLDoc{root: root} | ||
| } | ||
|
|
||
| func (d *HTMLDoc) Find(xpath string) []Node { | ||
| var nodes []Node | ||
| elems := htmlquery.Find(d.root, xpath) | ||
|
|
||
| for _, elem := range elems { | ||
| node := NewHTMLNode(elem) | ||
| nodes = append(nodes, node) | ||
| } | ||
| return nodes | ||
| } | ||
|
|
||
| func (d *HTMLDoc) Root() *HTMLNode { | ||
| return NewHTMLNode(d.root) | ||
| } | ||
|
|
||
| type HTMLNode struct { | ||
| node *html.Node | ||
| } | ||
|
|
||
| func NewHTMLNode(node *html.Node) *HTMLNode { | ||
| return &HTMLNode{node: node} | ||
| } | ||
|
|
||
| func (n HTMLNode) TagName() string { | ||
| return n.node.Data | ||
| } | ||
|
Comment on lines
+56
to
+58
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nil-safety: fix potential panics on root recursion and nil receivers. Same issues as XML adapter. Use pointer receivers with guards and return nil parent explicitly to keep Apply this diff: -func (n HTMLNode) TagName() string {
- return n.node.Data
-}
+func (n *HTMLNode) TagName() string {
+ if n == nil || n.node == nil {
+ return ""
+ }
+ return n.node.Data
+}
-func (n HTMLNode) IsElement() bool {
- return n.node.Type == html.ElementNode
-}
+func (n *HTMLNode) IsElement() bool {
+ if n == nil || n.node == nil {
+ return false
+ }
+ return n.node.Type == html.ElementNode
+}
-func (n *HTMLNode) GetParent() Node {
- return NewHTMLNode(n.node.Parent)
-}
+func (n *HTMLNode) GetParent() Node {
+ if n == nil || n.node == nil || n.node.Parent == nil {
+ return nil
+ }
+ return NewHTMLNode(n.node.Parent)
+}
-func (n *HTMLNode) ChildNodes() []Node {
+func (n *HTMLNode) ChildNodes() []Node {
+ if n == nil || n.node == nil {
+ return nil
+ }
var nodes []Node
@@
-}
+}
-func (n *HTMLNode) Equal(n1 Node) bool {
+func (n *HTMLNode) Equal(n1 Node) bool {
+ if n == nil {
+ _, ok := n1.(*HTMLNode)
+ return !ok
+ }
xn1, ok := n1.(*HTMLNode)
if !ok {
return false
}
return n.node == xn1.node
}
-func (n *HTMLNode) Index() int {
- if n.node.Parent == nil {
+func (n *HTMLNode) Index() int {
+ if n == nil || n.node == nil || n.node.Parent == nil {
return 1
}Also applies to: 60-62, 77-79, 81-89, 91-98, 100-117 🤖 Prompt for AI Agents |
||
|
|
||
| func (n HTMLNode) IsElement() bool { | ||
| return n.node.Type == html.ElementNode | ||
| } | ||
|
|
||
| func (n *HTMLNode) HasParent() bool { | ||
| return n.node.Parent != nil | ||
| } | ||
|
|
||
| func (n *HTMLNode) GetAttribute(key string) string { | ||
| for _, attr := range n.node.Attr { | ||
| if attr.Key == key { | ||
| return attr.Val | ||
| } | ||
| } | ||
| return "" | ||
| } | ||
|
|
||
| func (n *HTMLNode) GetParent() Node { | ||
| return NewHTMLNode(n.node.Parent) | ||
| } | ||
|
|
||
| func (n *HTMLNode) ChildNodes() []Node { | ||
| var nodes []Node | ||
|
|
||
| for c := n.node.FirstChild; c != nil; c = c.NextSibling { | ||
| xn := NewHTMLNode(c) | ||
| nodes = append(nodes, xn) | ||
| } | ||
| return nodes | ||
| } | ||
|
|
||
| func (n *HTMLNode) Equal(n1 Node) bool { | ||
| xn1, ok := n1.(*HTMLNode) | ||
| if !ok { | ||
| return false | ||
| } | ||
|
|
||
| return n.node == xn1.node | ||
| } | ||
|
|
||
| func (n *HTMLNode) Index() int { | ||
| if n.node.Parent == nil { | ||
| return 1 | ||
| } | ||
|
|
||
| idx := 0 | ||
| parent := n.node.Parent | ||
| for c := parent.FirstChild; c != nil; c = c.NextSibling { | ||
| if c.Type == html.ElementNode && c.Data == n.node.Data { | ||
| idx += 1 | ||
| if c == n.node { | ||
| return idx | ||
| } | ||
|
|
||
| } | ||
| } | ||
| return 1 | ||
| } | ||
|
|
||
| func IsValidXPath(xpath, dom string) (bool, error) { | ||
| doc, err := htmlquery.Parse(strings.NewReader(dom)) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
|
|
||
| elem, err := htmlquery.Query(doc, xpath) | ||
| if err != nil { | ||
| return false, err | ||
| } | ||
| return elem != nil, nil | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,230 @@ | ||
| package html | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "strings" | ||
|
|
||
| "github.com/antchfx/htmlquery" | ||
| "github.com/vertexcover-io/locatr/pkg/internal/utils" | ||
| "github.com/vertexcover-io/locatr/pkg/types" | ||
| "golang.org/x/net/html" | ||
| ) | ||
|
|
||
| // nolint:unused | ||
| func PrintXmlTree(node *html.Node, depth int) { | ||
| if node == nil { | ||
| return | ||
| } | ||
| if node.Type == html.TextNode && strings.TrimSpace(node.Data) == "" { | ||
| return | ||
| } | ||
|
|
||
| fmt.Printf("%sNode: %s", strings.Repeat(" ", depth), node.Data) | ||
| if len(node.Attr) > 0 { | ||
| fmt.Print(" [Attributes: ") | ||
| for _, attr := range node.Attr { | ||
| fmt.Printf("%s=%q ", attr.Key, attr.Val) | ||
| } | ||
| fmt.Print("]") | ||
| } | ||
| fmt.Println() | ||
|
|
||
| for child := node.FirstChild; child != nil; child = child.NextSibling { | ||
| PrintXmlTree(child, depth+1) | ||
| } | ||
| } | ||
|
|
||
| func findFirstElementNode(node *html.Node) *html.Node { | ||
| // If the current node is an element node, return it immediately | ||
| if node.Type == html.ElementNode { | ||
| return node | ||
| } | ||
|
|
||
| // Recursively search through child nodes | ||
| for child := node.FirstChild; child != nil; child = child.NextSibling { | ||
| // Recursively call findFirstElementNode on each child | ||
| found := findFirstElementNode(child) | ||
| // If an element node is found, return it | ||
| if found != nil { | ||
| return found | ||
| } | ||
| } | ||
|
|
||
| // If no element node is found, return nil | ||
| return nil | ||
| } | ||
|
|
||
| // For HTML, unless we evaluate CSS as well, we can never be certain if the | ||
| // element is visible or not. However, we eliminate the base cases that | ||
| // is possible with html only. | ||
| func isElementVisible(element *html.Node) bool { | ||
| // 1. Skip non-element Nodes | ||
| if element.Type != html.ElementNode { | ||
| return false | ||
| } | ||
|
|
||
| // 2. Tags that never render visible content | ||
| switch strings.ToLower(element.Data) { | ||
| case "script", "style", "template", "noscript", "head", "meta", "link": | ||
| return false | ||
| } | ||
|
|
||
| // 3. Check if element has hidden attribute | ||
| if hasAttr(element, "hidden") { | ||
| return false | ||
| } | ||
|
|
||
| // 4. Check if aria hidden has been applied | ||
| if val, ok := attrVal(element, "aria-hidden"); ok && strings.EqualFold(val, "true") { | ||
| return false | ||
| } | ||
|
|
||
| // 5. Check if element is hidden with inline-styles | ||
| if style, ok := attrVal(element, "style"); ok { | ||
| s := strings.ToLower(style) | ||
| if strings.Contains(s, "display:none") || | ||
| strings.Contains(s, "visibility:hidden") || | ||
| strings.Contains(s, "opacity: 0") { | ||
| return false | ||
| } | ||
| } | ||
|
|
||
| return true | ||
| } | ||
|
|
||
| func hasAttr(element *html.Node, name string) bool { | ||
| _, ok := attrVal(element, name) | ||
| return ok | ||
| } | ||
|
|
||
| func attrVal(element *html.Node, name string) (string, bool) { | ||
| for _, a := range element.Attr { | ||
| if strings.EqualFold(a.Key, name) { | ||
| return a.Val, true | ||
| } | ||
| } | ||
| return "", false | ||
| } | ||
|
|
||
// escapeString returns str with HTML special characters replaced by their
// entity escapes, as done by html.EscapeString.
func escapeString(str string) string {
	escaped := html.EscapeString(str)
	return escaped
}
|
|
||
| func getVisibleText(element *html.Node) string { | ||
| txt := element.Data | ||
| return escapeString(strings.TrimSpace(txt)) | ||
| } | ||
|
Comment on lines
+113
to
+116
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Incorrect element “text” extraction and inconsistent document root for XPath generation.
Apply this diff: -func getVisibleText(element *html.Node) string {
- txt := element.Data
- return escapeString(strings.TrimSpace(txt))
-}
+func getVisibleText(element *html.Node) string {
+ if element == nil {
+ return ""
+ }
+ var b strings.Builder
+ var walk func(*html.Node)
+ walk = func(n *html.Node) {
+ if n.Type == html.TextNode {
+ t := strings.TrimSpace(n.Data)
+ if t != "" {
+ if b.Len() > 0 {
+ b.WriteByte(' ')
+ }
+ b.WriteString(t)
+ }
+ return
+ }
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ walk(c)
+ }
+ }
+ walk(element)
+ return escapeString(b.String())
+}
@@
- spec, err := createElementSpec(node, node)
+ spec, err := createElementSpec(node, root)Also applies to: 169-183, 193-199 |
||
|
|
||
| func isElementValid(element *html.Node) bool { | ||
| if element.Type == html.TextNode && strings.TrimSpace(element.Data) == "" { | ||
| return false | ||
| } | ||
| if element.Data == "hierarchy" { | ||
| return true | ||
| } | ||
| // this check is essential, in iOS, there are cases where the parent heirarchy is marked as | ||
| // not visible, despite having children as visible. In case of iOS, we can't trust on | ||
| // element visibility. | ||
| if element.FirstChild != nil { | ||
| return true | ||
| } | ||
| visible := isElementVisible(element) | ||
| return visible | ||
| } | ||
|
|
||
| func attrsToMap(attrs []html.Attribute) map[string]string { | ||
| attrMap := make(map[string]string) | ||
| for _, attr := range attrs { | ||
| attrMap[attr.Key] = escapeString(attr.Val) | ||
| } | ||
| return attrMap | ||
| } | ||
|
|
||
// PrintLocatrs writes locatrs to standard output as a single line of the
// form ['a', 'b', 'c'], followed by a newline.
// nolint:unused
func PrintLocatrs(locatrs []string) {
	quoted := make([]string, len(locatrs))
	for i, l := range locatrs {
		quoted[i] = "'" + l + "'"
	}
	fmt.Println("[" + strings.Join(quoted, ", ") + "]")
}
|
|
||
| func createElementSpec(element *html.Node, root *html.Node) (*types.ElementSpec, error) { | ||
| if !isElementValid(element) { | ||
| return nil, fmt.Errorf("not a valid element") | ||
| } | ||
|
|
||
| text := getVisibleText(element) | ||
| doc := NewHTMLDoc(root) | ||
| node := NewHTMLNode(element) | ||
| xpath := GetOptimalXPath(doc, node) | ||
| uniqueId := utils.GenerateUniqueId(xpath) | ||
|
|
||
| children := []types.ElementSpec{} | ||
| for child := element.FirstChild; child != nil; child = child.NextSibling { | ||
| c, err := createElementSpec(child, root) | ||
| if err == nil && c != nil { | ||
| children = append(children, *c) | ||
| } | ||
| } | ||
| return &types.ElementSpec{ | ||
| TagName: element.Data, | ||
| Id: uniqueId, | ||
| Attributes: attrsToMap(element.Attr), | ||
| Text: text, | ||
| Children: children, | ||
| }, nil | ||
| } | ||
|
|
||
| func MinifySource(source string) (*types.ElementSpec, error) { | ||
| if source == "" { | ||
| return nil, fmt.Errorf("source is empty") | ||
| } | ||
| root, err := htmlquery.Parse(strings.NewReader(source)) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| node := findFirstElementNode(root) | ||
| spec, err := createElementSpec(node, node) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| return spec, nil | ||
| } | ||
|
|
||
| func CreateLocatorMap(source string) (map[string][]string, error) { | ||
| if source == "" { | ||
| return nil, fmt.Errorf("source is empty") | ||
| } | ||
| root, err := htmlquery.Parse(strings.NewReader(source)) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| elementMap := make(map[string][]string) | ||
|
|
||
| var processElement func(*html.Node) | ||
|
|
||
| doc := NewHTMLDoc(root) | ||
| processElement = func(elem *html.Node) { | ||
| node := NewHTMLNode(elem) | ||
| xpath := GetOptimalXPath(doc, node) | ||
| if xpath != "" { | ||
| uniqueId := utils.GenerateUniqueId(xpath) | ||
| elementMap[uniqueId] = []string{xpath} | ||
| } | ||
|
|
||
| for child := elem.FirstChild; child != nil; child = child.NextSibling { | ||
| if isElementValid(child) { | ||
| processElement(child) | ||
| } | ||
| } | ||
| } | ||
| processElement(findFirstElementNode(root)) | ||
| return elementMap, nil | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Verification agent
🧩 Analysis chain
Make htmlquery a direct dependency (not indirect).
`github.com/antchfx/htmlquery` is imported directly in the codebase, so it should not be marked `// indirect`. This can cause drift and unexpected tidy rewrites. Apply this diff:
Run to confirm direct imports and tidy the module:
🏁 Script executed:
Length of output: 68
🏁 Script executed:
Length of output: 1776
Make github.com/antchfx/htmlquery a direct dependency (remove // indirect from go.mod)
github.com/antchfx/htmlquery is imported directly in pkg/internal/html/adapter.go and pkg/internal/html/minifier.go — remove the `// indirect` marker in go.mod and run `go mod tidy`. (go.mod: line ~23)
📝 Committable suggestion
🤖 Prompt for AI Agents