Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
)

require (
github.com/antchfx/htmlquery v1.3.4 // indirect
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Make htmlquery a direct dependency (not indirect).

github.com/antchfx/htmlquery is imported directly in the codebase, so it should not be marked // indirect. This can cause drift and unexpected tidy rewrites.

Apply this diff:

-	github.com/antchfx/htmlquery v1.3.4 // indirect
+	github.com/antchfx/htmlquery v1.3.4

Run to confirm direct imports and tidy the module:


🏁 Script executed:

#!/bin/bash
# Verify direct imports exist
rg -nP --type=go '^\s*import\s*\(\s*(?s).*?github\.com/antchfx/htmlquery' -C2 || rg -nP --type=go '^\s*import\s*"github\.com/antchfx/htmlquery"'

# Show current go.mod line
rg -n '^(\s*)github\.com/antchfx/htmlquery\b' go.mod -n

# Suggest tidying (to be run locally)
echo "Run: go mod tidy"

Length of output: 68


🏁 Script executed:

#!/bin/bash
# Search for direct imports of htmlquery package
rg -n "github.com/antchfx/htmlquery" --type=go

# Also search for any usage of htmlquery functions/types
rg -n "htmlquery\." --type=go -A1 -B1

# Check if htmlquery is used in any import statements (both single line and multi-line)
rg -nP --type=go 'import.*"github\.com/antchfx/htmlquery"' || rg -nP --type=go -U '(?s)import\s*\([^)]*github\.com/antchfx/htmlquery[^)]*\)'

Length of output: 1776


Make github.com/antchfx/htmlquery a direct dependency (remove // indirect from go.mod)

github.com/antchfx/htmlquery is imported directly in pkg/internal/html/adapter.go and pkg/internal/html/minifier.go — remove the // indirect marker in go.mod and run go mod tidy. (go.mod: line ~23)

-	github.com/antchfx/htmlquery v1.3.4 // indirect
+	github.com/antchfx/htmlquery v1.3.4
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
github.com/antchfx/htmlquery v1.3.4 // indirect
github.com/antchfx/htmlquery v1.3.4
🤖 Prompt for AI Agents
In go.mod around line 23, the dependency github.com/antchfx/htmlquery is marked
as indirect though it is imported directly in pkg/internal/html/adapter.go and
pkg/internal/html/minifier.go; remove the "// indirect" marker from that require
line (making it a direct dependency) and then run `go mod tidy` to update module
metadata and ensure go.sum is correct.

github.com/antchfx/xpath v1.3.3 // indirect
github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
github.com/aws/smithy-go v1.20.3 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ github.com/BurntSushi/xgbutil v0.0.0-20160919175755-f7c97cef3b4e/go.mod h1:uw9h2
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ=
github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM=
github.com/antchfx/xmlquery v1.4.4 h1:mxMEkdYP3pjKSftxss4nUHfjBhnMk4imGoR96FRY2dg=
github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fusrx9b12fc=
github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=
Expand Down
130 changes: 130 additions & 0 deletions pkg/internal/html/adapter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package html

import (
"strings"

"github.com/antchfx/htmlquery"
"golang.org/x/net/html"
)

// Document is a tree that can be queried with XPath expressions.
// HTMLDoc is the implementation defined in this file.
type Document interface {
// Find returns the nodes selected by the given XPath expression.
Find(xpath string) []Node
}

// Node is the tree-node abstraction used alongside Document.
// HTMLNode is the implementation defined in this file; the per-method notes
// below describe what that implementation does.
type Node interface {
// TagName returns the node's name (HTMLNode returns the underlying node's Data field).
TagName() string
// IsElement reports whether the node is an element node.
IsElement() bool
// HasParent reports whether the node has a parent.
HasParent() bool
// GetAttribute returns the value of the attribute named key; HTMLNode returns "" when it is absent.
GetAttribute(key string) string
// GetParent returns the node's parent.
GetParent() Node
// ChildNodes returns the node's direct children in sibling order.
ChildNodes() []Node
// Index returns the node's position; HTMLNode counts from 1 among same-named element siblings.
Index() int
// Equal reports whether the argument refers to the same underlying node.
Equal(Node) bool
}

// HTMLDoc wraps a golang.org/x/net/html node tree so it can be queried
// through the Document interface.
type HTMLDoc struct {
// root is the node that Find evaluates XPath expressions against.
root *html.Node
}

// NewHTMLDoc returns an HTMLDoc whose queries are evaluated against root.
// The node is stored as given; it is not copied or validated.
func NewHTMLDoc(root *html.Node) *HTMLDoc {
	doc := new(HTMLDoc)
	doc.root = root
	return doc
}

// Find evaluates xpath against the document root and wraps each match in an
// HTMLNode. The returned slice is nil when nothing matches.
//
// NOTE(review): htmlquery.Find is documented to panic on an invalid
// expression — confirm callers only pass well-formed XPath.
func (d *HTMLDoc) Find(xpath string) []Node {
	var result []Node
	for _, match := range htmlquery.Find(d.root, xpath) {
		result = append(result, NewHTMLNode(match))
	}
	return result
}

// Root returns the document's root node wrapped in an HTMLNode.
func (d *HTMLDoc) Root() *HTMLNode {
	root := d.root
	return NewHTMLNode(root)
}

// HTMLNode wraps a single golang.org/x/net/html node and exposes it through
// the Node interface.
type HTMLNode struct {
// node is the wrapped node; GetParent stores nil here when wrapping the parent of a parentless node.
node *html.Node
}

// NewHTMLNode wraps node in an HTMLNode. The pointer is stored as given, so a
// nil node yields a wrapper around nil rather than a nil *HTMLNode.
func NewHTMLNode(node *html.Node) *HTMLNode {
	wrapped := new(HTMLNode)
	wrapped.node = node
	return wrapped
}

// TagName returns the Data field of the wrapped node, which is the tag name
// when the node is an element.
//
// It uses a pointer receiver like the other HTMLNode methods and returns ""
// for a nil receiver or a wrapper around a nil node. GetParent produces such
// a wrapper for a parentless node, so this keeps upward walks from panicking.
func (n *HTMLNode) TagName() string {
	if n == nil || n.node == nil {
		return ""
	}
	return n.node.Data
}
Comment on lines +56 to +58
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Nil-safety: fix potential panics on root recursion and nil receivers.

Same issues as XML adapter. Use pointer receivers with guards and return nil parent explicitly to keep GetOptimalXPath recursion safe.

Apply this diff:

-func (n HTMLNode) TagName() string {
-	return n.node.Data
-}
+func (n *HTMLNode) TagName() string {
+	if n == nil || n.node == nil {
+		return ""
+	}
+	return n.node.Data
+}

-func (n HTMLNode) IsElement() bool {
-	return n.node.Type == html.ElementNode
-}
+func (n *HTMLNode) IsElement() bool {
+	if n == nil || n.node == nil {
+		return false
+	}
+	return n.node.Type == html.ElementNode
+}

-func (n *HTMLNode) GetParent() Node {
-	return NewHTMLNode(n.node.Parent)
-}
+func (n *HTMLNode) GetParent() Node {
+	if n == nil || n.node == nil || n.node.Parent == nil {
+		return nil
+	}
+	return NewHTMLNode(n.node.Parent)
+}

-func (n *HTMLNode) ChildNodes() []Node {
+func (n *HTMLNode) ChildNodes() []Node {
+	if n == nil || n.node == nil {
+		return nil
+	}
 	var nodes []Node
@@
-}
+}

-func (n *HTMLNode) Equal(n1 Node) bool {
+func (n *HTMLNode) Equal(n1 Node) bool {
+	if n == nil {
+		_, ok := n1.(*HTMLNode)
+		return !ok
+	}
 	xn1, ok := n1.(*HTMLNode)
 	if !ok {
 		return false
 	}
 	return n.node == xn1.node
 }

-func (n *HTMLNode) Index() int {
-	if n.node.Parent == nil {
+func (n *HTMLNode) Index() int {
+	if n == nil || n.node == nil || n.node.Parent == nil {
 		return 1
 	}

Also applies to: 60-62, 77-79, 81-89, 91-98, 100-117

🤖 Prompt for AI Agents
In pkg/internal/html/adapter.go around lines 56-58 (and similarly for 60-62,
77-79, 81-89, 91-98, 100-117), change the HTMLNode methods to use pointer
receivers and add nil-safety guards: if the receiver is nil or the underlying
n.node is nil return safe zero values (for TagName and similar string-returning
methods return empty string; for Parent/ParentNode-returning methods return nil
explicitly) so recursive GetOptimalXPath calls don't panic; ensure every method
checks receiver and n.node before accessing fields and adjust signatures to func
(n *HTMLNode) ... accordingly.


// IsElement reports whether the wrapped node is an element node.
//
// It uses a pointer receiver like the other HTMLNode methods and returns
// false for a nil receiver or a wrapper around a nil node (which GetParent
// produces for a parentless node) instead of panicking.
func (n *HTMLNode) IsElement() bool {
	if n == nil || n.node == nil {
		return false
	}
	return n.node.Type == html.ElementNode
}

// HasParent reports whether the wrapped node has a parent node.
func (n *HTMLNode) HasParent() bool {
	parent := n.node.Parent
	return parent != nil
}

// GetAttribute returns the value of the first attribute whose key equals key
// exactly (case-sensitive), or "" when the node has no such attribute.
func (n *HTMLNode) GetAttribute(key string) string {
	attrs := n.node.Attr
	for i := range attrs {
		if attrs[i].Key != key {
			continue
		}
		return attrs[i].Val
	}
	return ""
}

// GetParent wraps the node's parent in an HTMLNode.
//
// The result is never a nil interface: for a parentless node it is a wrapper
// around a nil node, so callers should check HasParent before using it.
func (n *HTMLNode) GetParent() Node {
	parent := n.node.Parent
	return NewHTMLNode(parent)
}

// ChildNodes returns the node's direct children, of every node type, in
// sibling order. The slice is nil when the node has no children.
func (n *HTMLNode) ChildNodes() []Node {
	var children []Node
	child := n.node.FirstChild
	for child != nil {
		children = append(children, NewHTMLNode(child))
		child = child.NextSibling
	}
	return children
}

// Equal reports whether n1 is an *HTMLNode wrapping the very same underlying
// node (pointer identity). Any other Node implementation compares unequal.
func (n *HTMLNode) Equal(n1 Node) bool {
	if other, ok := n1.(*HTMLNode); ok {
		return n.node == other.node
	}
	return false
}

// Index returns the 1-based position of the node among its parent's element
// children that share the node's Data (tag name).
//
// It returns 1 when the node has no parent, and also when the node is never
// reached by the count (for example because it is not an element node).
func (n *HTMLNode) Index() int {
	self := n.node
	if self.Parent == nil {
		return 1
	}

	position := 0
	for sib := self.Parent.FirstChild; sib != nil; sib = sib.NextSibling {
		// Only same-named element siblings take part in the numbering.
		if sib.Type != html.ElementNode || sib.Data != self.Data {
			continue
		}
		position++
		if sib == self {
			return position
		}
	}
	return 1
}

// IsValidXPath parses dom as HTML and reports whether xpath selects at least
// one node in it.
//
// A parse failure or an error from evaluating the expression is returned
// unchanged together with false.
func IsValidXPath(xpath, dom string) (bool, error) {
	reader := strings.NewReader(dom)
	root, parseErr := htmlquery.Parse(reader)
	if parseErr != nil {
		return false, parseErr
	}

	match, queryErr := htmlquery.Query(root, xpath)
	if queryErr != nil {
		return false, queryErr
	}

	found := match != nil
	return found, nil
}
230 changes: 230 additions & 0 deletions pkg/internal/html/minifier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
package html

import (
"fmt"
"strings"

"github.com/antchfx/htmlquery"
"github.com/vertexcover-io/locatr/pkg/internal/utils"
"github.com/vertexcover-io/locatr/pkg/types"
"golang.org/x/net/html"
)

// PrintXmlTree writes an indented dump of node and its descendants to
// standard output, one line per node, indenting by one space per depth level.
// Nil nodes and whitespace-only text nodes are skipped.
//
// nolint:unused
func PrintXmlTree(node *html.Node, depth int) {
	if node == nil || (node.Type == html.TextNode && strings.TrimSpace(node.Data) == "") {
		return
	}

	indent := strings.Repeat(" ", depth)
	fmt.Printf("%sNode: %s", indent, node.Data)

	if len(node.Attr) > 0 {
		fmt.Print(" [Attributes: ")
		for i := range node.Attr {
			fmt.Printf("%s=%q ", node.Attr[i].Key, node.Attr[i].Val)
		}
		fmt.Print("]")
	}
	fmt.Println()

	child := node.FirstChild
	for child != nil {
		PrintXmlTree(child, depth+1)
		child = child.NextSibling
	}
}

// findFirstElementNode returns the first element node found in a depth-first,
// document-order walk that starts at node itself. It returns nil when the
// subtree contains no element node. node must not be nil.
func findFirstElementNode(node *html.Node) *html.Node {
	if node.Type == html.ElementNode {
		return node
	}

	child := node.FirstChild
	for child != nil {
		// The first hit in any child's subtree wins.
		if hit := findFirstElementNode(child); hit != nil {
			return hit
		}
		child = child.NextSibling
	}
	return nil
}

// isElementVisible reports whether element could be visible, judged from the
// markup alone.
//
// Without evaluating CSS we can never be certain whether an element is
// visible, so this only rules out the cases that HTML by itself makes
// decidable: non-element nodes, tags that never render, the hidden and
// aria-hidden attributes, and hiding inline styles.
func isElementVisible(element *html.Node) bool {
	// 1. Skip non-element nodes.
	if element.Type != html.ElementNode {
		return false
	}

	// 2. Tags that never render visible content.
	switch strings.ToLower(element.Data) {
	case "script", "style", "template", "noscript", "head", "meta", "link":
		return false
	}

	// 3. The hidden attribute is present.
	if hasAttr(element, "hidden") {
		return false
	}

	// 4. aria-hidden="true" has been applied.
	if val, ok := attrVal(element, "aria-hidden"); ok && strings.EqualFold(val, "true") {
		return false
	}

	// 5. The element is hidden through its inline style.
	if style, ok := attrVal(element, "style"); ok && inlineStyleHides(style) {
		return false
	}

	return true
}

// inlineStyleHides reports whether an inline style attribute value contains
// display:none, visibility:hidden or a zero opacity.
//
// The value is split into declarations and each property/value is trimmed, so
// the check does not depend on the spacing around ':' and does not mistake
// values such as "opacity: 0.5" for "opacity: 0".
func inlineStyleHides(style string) bool {
	for _, decl := range strings.Split(strings.ToLower(style), ";") {
		colon := strings.IndexByte(decl, ':')
		if colon < 0 {
			continue
		}
		prop := strings.TrimSpace(decl[:colon])
		value := strings.TrimSpace(decl[colon+1:])
		// A trailing !important does not change what the value means.
		value = strings.TrimSpace(strings.TrimSuffix(value, "!important"))

		switch prop {
		case "display":
			if value == "none" {
				return true
			}
		case "visibility":
			if value == "hidden" {
				return true
			}
		case "opacity":
			if isZeroOpacity(value) {
				return true
			}
		}
	}
	return false
}

// isZeroOpacity reports whether a CSS opacity value denotes full
// transparency, e.g. "0", "0.0", ".0" or "0%".
func isZeroOpacity(value string) bool {
	value = strings.TrimSpace(strings.TrimSuffix(value, "%"))
	if !strings.Contains(value, "0") {
		return false
	}
	// Stripping the zeros from both ends of a zero literal leaves nothing
	// ("0", "00") or just the decimal point ("0.0", ".0").
	rest := strings.Trim(value, "0")
	return rest == "" || rest == "."
}

// hasAttr reports whether element carries an attribute called name. The
// lookup is delegated to attrVal, so the name is matched case-insensitively
// and the attribute's value is ignored.
func hasAttr(element *html.Node, name string) bool {
	_, present := attrVal(element, name)
	return present
}

// attrVal looks up the first attribute of element whose key equals name,
// ignoring case. It returns the attribute's value and true when found, and
// "" and false otherwise.
func attrVal(element *html.Node, name string) (string, bool) {
	attrs := element.Attr
	for i := range attrs {
		if !strings.EqualFold(attrs[i].Key, name) {
			continue
		}
		return attrs[i].Val, true
	}
	return "", false
}

// escapeString returns str with HTML special characters replaced by their
// entity escapes, as done by html.EscapeString.
func escapeString(str string) string {
	escaped := html.EscapeString(str)
	return escaped
}

// getVisibleText returns the HTML-escaped text that belongs directly to
// element.
//
// For a text node this is its own trimmed content. For any other node it is
// the trimmed content of its direct text-node children joined by single
// spaces. Text of nested elements is deliberately left out: createElementSpec
// emits a spec per child element, so including descendants here would repeat
// the same text on every ancestor. A nil element yields "".
//
// Previously this returned element.Data, which for an element node is the
// tag name rather than any text.
func getVisibleText(element *html.Node) string {
	if element == nil {
		return ""
	}
	if element.Type == html.TextNode {
		return escapeString(strings.TrimSpace(element.Data))
	}

	var parts []string
	for c := element.FirstChild; c != nil; c = c.NextSibling {
		if c.Type != html.TextNode {
			continue
		}
		if t := strings.TrimSpace(c.Data); t != "" {
			parts = append(parts, t)
		}
	}
	return escapeString(strings.Join(parts, " "))
}
Comment on lines +113 to +116
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

⚠️ Potential issue

Incorrect element “text” extraction and inconsistent document root for XPath generation.

  • getVisibleText returns the tag name, not the element’s textual content.
  • MinifySource builds XPaths with the first element as the Document root, while CreateLocatorMap uses the parsed root. This inconsistency can yield different XPaths/IDs for the same element.

Apply this diff:

-func getVisibleText(element *html.Node) string {
-	txt := element.Data
-	return escapeString(strings.TrimSpace(txt))
-}
+func getVisibleText(element *html.Node) string {
+	if element == nil {
+		return ""
+	}
+	var b strings.Builder
+	var walk func(*html.Node)
+	walk = func(n *html.Node) {
+		if n.Type == html.TextNode {
+			t := strings.TrimSpace(n.Data)
+			if t != "" {
+				if b.Len() > 0 {
+					b.WriteByte(' ')
+				}
+				b.WriteString(t)
+			}
+			return
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			walk(c)
+		}
+	}
+	walk(element)
+	return escapeString(b.String())
+}
@@
-	spec, err := createElementSpec(node, node)
+	spec, err := createElementSpec(node, root)

Also applies to: 169-183, 193-199


// isElementValid decides whether element should be kept when building specs
// and locator maps.
//
// Whitespace-only text nodes are rejected; a node whose Data is "hierarchy"
// and any node that has children are accepted; every remaining node is
// accepted only if isElementVisible says so.
func isElementValid(element *html.Node) bool {
	switch {
	case element.Type == html.TextNode && strings.TrimSpace(element.Data) == "":
		return false
	case element.Data == "hierarchy":
		return true
	case element.FirstChild != nil:
		// This check is essential: in iOS there are cases where the parent
		// hierarchy is marked as not visible despite having visible children,
		// so element visibility cannot be trusted for nodes with children.
		return true
	default:
		return isElementVisible(element)
	}
}

// attrsToMap converts an attribute list into a key → value map with every
// value HTML-escaped. When a key occurs more than once the last occurrence
// wins. The result is never nil.
func attrsToMap(attrs []html.Attribute) map[string]string {
	out := make(map[string]string, len(attrs))
	for i := range attrs {
		out[attrs[i].Key] = escapeString(attrs[i].Val)
	}
	return out
}

// PrintLocatrs writes locatrs to standard output as a bracketed list of
// single-quoted items separated by ", " and followed by a newline, e.g.
// ['a', 'b'].
//
// nolint:unused
func PrintLocatrs(locatrs []string) {
	fmt.Printf("[")
	last := len(locatrs) - 1
	for i, l := range locatrs {
		if i < last {
			fmt.Printf("'%s', ", l)
		} else {
			// The final item carries no trailing separator.
			fmt.Printf("'%s'", l)
		}
	}
	fmt.Println("]")
}

// createElementSpec builds the ElementSpec for element and, recursively, for
// each of its children that is itself valid.
//
// root is the node the XPath document is built on; the element's Id is
// derived from the XPath that GetOptimalXPath returns for it. An error is
// returned when element fails isElementValid. Children that fail are simply
// left out of Children, which is always non-nil.
func createElementSpec(element *html.Node, root *html.Node) (*types.ElementSpec, error) {
	if !isElementValid(element) {
		return nil, fmt.Errorf("not a valid element")
	}

	text := getVisibleText(element)
	xpath := GetOptimalXPath(NewHTMLDoc(root), NewHTMLNode(element))
	id := utils.GenerateUniqueId(xpath)

	children := []types.ElementSpec{}
	child := element.FirstChild
	for child != nil {
		// An invalid child is not an error for the parent; it is skipped.
		childSpec, err := createElementSpec(child, root)
		if err == nil && childSpec != nil {
			children = append(children, *childSpec)
		}
		child = child.NextSibling
	}

	spec := &types.ElementSpec{
		TagName:    element.Data,
		Id:         id,
		Attributes: attrsToMap(element.Attr),
		Text:       text,
		Children:   children,
	}
	return spec, nil
}

// MinifySource parses source as HTML and returns the ElementSpec tree rooted
// at the first element node of the parsed document.
//
// XPaths (and therefore element Ids) are generated against the parsed
// document root, the same root CreateLocatorMap uses, so that both functions
// derive the same XPath and Id for the same element.
//
// It returns an error when source is empty, cannot be parsed, contains no
// element node, or when its first element is not a valid element.
func MinifySource(source string) (*types.ElementSpec, error) {
	if source == "" {
		return nil, fmt.Errorf("source is empty")
	}
	root, err := htmlquery.Parse(strings.NewReader(source))
	if err != nil {
		return nil, err
	}
	node := findFirstElementNode(root)
	if node == nil {
		// Guard the dereferences in createElementSpec.
		return nil, fmt.Errorf("source contains no element node")
	}
	return createElementSpec(node, root)
}

// CreateLocatorMap parses source as HTML and returns a map from each
// element's unique id to a one-element slice holding the XPath that
// GetOptimalXPath produced for it.
//
// The walk starts at the first element node of the parsed document and only
// descends into children accepted by isElementValid. Nodes for which no XPath
// is produced (empty string) are not recorded. An error is returned when
// source is empty or cannot be parsed.
func CreateLocatorMap(source string) (map[string][]string, error) {
	if source == "" {
		return nil, fmt.Errorf("source is empty")
	}
	root, err := htmlquery.Parse(strings.NewReader(source))
	if err != nil {
		return nil, err
	}

	locators := make(map[string][]string)
	doc := NewHTMLDoc(root)

	var visit func(*html.Node)
	visit = func(current *html.Node) {
		if xpath := GetOptimalXPath(doc, NewHTMLNode(current)); xpath != "" {
			locators[utils.GenerateUniqueId(xpath)] = []string{xpath}
		}

		for child := current.FirstChild; child != nil; child = child.NextSibling {
			if !isElementValid(child) {
				continue
			}
			visit(child)
		}
	}

	visit(findFirstElementNode(root))
	return locators, nil
}
Loading