vertexcover-io · Hungerarray · Sep 15, 2025 · Sep 15, 2025 · Sep 15, 2025 · Sep 25, 2025
diff --git a/go.mod b/go.mod
@@ -20,6 +20,7 @@ require (
 )
 
 require (
+	github.com/antchfx/htmlquery v1.3.4 // indirect
-	github.com/antchfx/htmlquery v1.3.4 // indirect
+	github.com/antchfx/htmlquery v1.3.4
-	github.com/antchfx/htmlquery v1.3.4 // indirect
+	github.com/antchfx/htmlquery v1.3.4
 	github.com/antchfx/xpath v1.3.3 // indirect
 	github.com/aws/aws-sdk-go-v2 v1.30.3 // indirect
 	github.com/aws/smithy-go v1.20.3 // indirect

diff --git a/go.sum b/go.sum
@@ -10,6 +10,8 @@ github.com/BurntSushi/xgbutil v0.0.0-20160919175755-f7c97cef3b4e/go.mod h1:uw9h2
 github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
+github.com/antchfx/htmlquery v1.3.4 h1:Isd0srPkni2iNTWCwVj/72t7uCphFeor5Q8nCzj1jdQ=
+github.com/antchfx/htmlquery v1.3.4/go.mod h1:K9os0BwIEmLAvTqaNSua8tXLWRWZpocZIH73OzWQbwM=
 github.com/antchfx/xmlquery v1.4.4 h1:mxMEkdYP3pjKSftxss4nUHfjBhnMk4imGoR96FRY2dg=
 github.com/antchfx/xmlquery v1.4.4/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fusrx9b12fc=
 github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=

diff --git a/pkg/internal/html/adapter.go b/pkg/internal/html/adapter.go
@@ -0,0 +1,130 @@
+package html
+
+import (
+	"strings"
+
+	"github.com/antchfx/htmlquery"
+	"golang.org/x/net/html"
+)
+
+// Document is a tree that can be searched with XPath expressions.
+type Document interface {
+	// Find returns the nodes selected by the XPath expression xpath.
+	Find(xpath string) []Node
+}
+
+// Node is the view of a single tree node exposed by this package. The
+// per-method notes describe the HTMLNode implementation in this file.
+type Node interface {
+	// TagName returns the node's name (the underlying node's Data field).
+	TagName() string
+	// IsElement reports whether the node is an element node.
+	IsElement() bool
+	// HasParent reports whether the node has a parent.
+	HasParent() bool
+	// GetAttribute returns the value of the attribute named key, or "" when
+	// the attribute is absent.
+	GetAttribute(key string) string
+	// GetParent returns the parent node.
+	GetParent() Node
+	// ChildNodes returns the direct children in document order.
+	ChildNodes() []Node
+	// Index returns the 1-based position of the node among the sibling
+	// elements that share its tag name.
+	Index() int
+	// Equal reports whether the argument refers to the same underlying node.
+	Equal(Node) bool
+}
+
+// HTMLDoc adapts a parsed golang.org/x/net/html tree to the Document
+// interface.
+type HTMLDoc struct {
+	// root is the node every XPath query is evaluated against.
+	root *html.Node
+}
+
+// NewHTMLDoc returns an HTMLDoc whose queries are evaluated against root.
+func NewHTMLDoc(root *html.Node) *HTMLDoc {
+	doc := new(HTMLDoc)
+	doc.root = root
+	return doc
+}
+
+func (d *HTMLDoc) Find(xpath string) []Node {
+	var nodes []Node
+	elems := htmlquery.Find(d.root, xpath)
+
+	for _, elem := range elems {
+		node := NewHTMLNode(elem)
+		nodes = append(nodes, node)
+	}
+	return nodes
+}
+
+// Root returns the document's root node wrapped as an HTMLNode.
+func (d *HTMLDoc) Root() *HTMLNode {
+	rootNode := d.root
+	return NewHTMLNode(rootNode)
+}
+
+// HTMLNode wraps a single *html.Node and implements the Node interface.
+type HTMLNode struct {
+	// node is the wrapped parser node; the accessor methods dereference it.
+	node *html.Node
+}
+
+// NewHTMLNode wraps node in an HTMLNode. The pointer is stored as given, so
+// a nil node produces a wrapper around nil.
+func NewHTMLNode(node *html.Node) *HTMLNode {
+	wrapped := new(HTMLNode)
+	wrapped.node = node
+	return wrapped
+}
+
+// TagName returns the wrapped node's Data field, which is the tag name for
+// element nodes.
+func (n HTMLNode) TagName() string {
+	tag := n.node.Data
+	return tag
+}
+
+// IsElement reports whether the wrapped node is an element node.
+func (n HTMLNode) IsElement() bool {
+	switch n.node.Type {
+	case html.ElementNode:
+		return true
+	default:
+		return false
+	}
+}
+
+// HasParent reports whether the wrapped node has a parent.
+func (n *HTMLNode) HasParent() bool {
+	parent := n.node.Parent
+	return parent != nil
+}
+
+// GetAttribute returns the value of the first attribute whose key equals key
+// exactly, or "" when the node has no such attribute.
+func (n *HTMLNode) GetAttribute(key string) string {
+	attrs := n.node.Attr
+	for i := range attrs {
+		if attrs[i].Key != key {
+			continue
+		}
+		return attrs[i].Val
+	}
+	return ""
+}
+
+// GetParent returns the parent of the wrapped node as a Node.
+//
+// The result is never a nil interface: when the node has no parent the
+// returned wrapper holds a nil pointer, and its accessor methods dereference
+// that pointer. Check HasParent before using the result.
+func (n *HTMLNode) GetParent() Node {
+	parent := n.node.Parent
+	return NewHTMLNode(parent)
+}
+
+// ChildNodes returns every direct child of the wrapped node, of any node
+// type, in document order. The result is nil when there are no children.
+func (n *HTMLNode) ChildNodes() []Node {
+	var children []Node
+
+	child := n.node.FirstChild
+	for child != nil {
+		children = append(children, NewHTMLNode(child))
+		child = child.NextSibling
+	}
+	return children
+}
+
+// Equal reports whether n1 is an *HTMLNode wrapping the very same *html.Node
+// as n. Any other Node implementation compares unequal.
+func (n *HTMLNode) Equal(n1 Node) bool {
+	if other, ok := n1.(*HTMLNode); ok {
+		return other.node == n.node
+	}
+	return false
+}
+
+// Index returns the 1-based position of the node among the element siblings
+// that share its tag name. It returns 1 when the node has no parent or is
+// not found among those siblings (for example, when it is not an element).
+func (n *HTMLNode) Index() int {
+	self := n.node
+	if self.Parent == nil {
+		return 1
+	}
+
+	position := 0
+	for sib := self.Parent.FirstChild; sib != nil; sib = sib.NextSibling {
+		// Only same-named element siblings take part in the numbering.
+		if sib.Type != html.ElementNode || sib.Data != self.Data {
+			continue
+		}
+		position++
+		if sib == self {
+			return position
+		}
+	}
+	return 1
+}
+
+// IsValidXPath parses dom as HTML and reports whether xpath selects at least
+// one node in it. A parse failure or a query failure is returned as the
+// error, with false as the result.
+func IsValidXPath(xpath, dom string) (bool, error) {
+	doc, parseErr := htmlquery.Parse(strings.NewReader(dom))
+	if parseErr != nil {
+		return false, parseErr
+	}
+
+	match, queryErr := htmlquery.Query(doc, xpath)
+	if queryErr != nil {
+		return false, queryErr
+	}
+
+	found := match != nil
+	return found, nil
+}
diff --git a/pkg/internal/html/minifier.go b/pkg/internal/html/minifier.go
@@ -0,0 +1,230 @@
+package html
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/antchfx/htmlquery"
+	"github.com/vertexcover-io/locatr/pkg/internal/utils"
+	"github.com/vertexcover-io/locatr/pkg/types"
+	"golang.org/x/net/html"
+)
+
+// PrintXmlTree writes an indented dump of node and its descendants to
+// standard output, two spaces per depth level. Nil nodes and whitespace-only
+// text nodes are skipped.
+// nolint:unused
+func PrintXmlTree(node *html.Node, depth int) {
+	if node == nil {
+		return
+	}
+	blankText := node.Type == html.TextNode && strings.TrimSpace(node.Data) == ""
+	if blankText {
+		return
+	}
+
+	indent := strings.Repeat("  ", depth)
+	fmt.Printf("%sNode: %s", indent, node.Data)
+	if len(node.Attr) > 0 {
+		fmt.Print(" [Attributes: ")
+		for i := range node.Attr {
+			fmt.Printf("%s=%q ", node.Attr[i].Key, node.Attr[i].Val)
+		}
+		fmt.Print("]")
+	}
+	fmt.Println()
+
+	nextDepth := depth + 1
+	for c := node.FirstChild; c != nil; c = c.NextSibling {
+		PrintXmlTree(c, nextDepth)
+	}
+}
+
+// findFirstElementNode returns the first element node found in a depth-first,
+// document-order walk starting at node (node itself included), or nil when
+// the subtree contains no element node. node must not be nil.
+func findFirstElementNode(node *html.Node) *html.Node {
+	if node.Type == html.ElementNode {
+		return node
+	}
+
+	for c := node.FirstChild; c != nil; c = c.NextSibling {
+		if hit := findFirstElementNode(c); hit != nil {
+			return hit
+		}
+	}
+	return nil
+}
+
+// isElementVisible reports whether element can be visible judging from the
+// markup alone.
+//
+// Without evaluating CSS we can never be certain that an HTML element is
+// visible, so this only rules out the cases that can be decided from the
+// element itself: non-element nodes, tags that never render content, the
+// hidden attribute, aria-hidden="true", and inline styles that hide the
+// element.
+func isElementVisible(element *html.Node) bool {
+	// 1. Skip non-element nodes.
+	if element.Type != html.ElementNode {
+		return false
+	}
+
+	// 2. Tags that never render visible content.
+	switch strings.ToLower(element.Data) {
+	case "script", "style", "template", "noscript", "head", "meta", "link":
+		return false
+	}
+
+	// 3. The hidden attribute.
+	if hasAttr(element, "hidden") {
+		return false
+	}
+
+	// 4. aria-hidden="true".
+	if val, ok := attrVal(element, "aria-hidden"); ok && strings.EqualFold(val, "true") {
+		return false
+	}
+
+	// 5. Inline styles that hide the element.
+	if style, ok := attrVal(element, "style"); ok && inlineStyleHides(style) {
+		return false
+	}
+
+	return true
+}
+
+// inlineStyleHides reports whether the inline style text contains a
+// declaration that hides the element: display none, visibility hidden, or an
+// opacity of zero. Declarations are parsed one by one so that whitespace
+// around ':' and a trailing "!important" do not matter, and so that values
+// such as "opacity: 0.5" are not mistaken for zero.
+func inlineStyleHides(style string) bool {
+	for _, decl := range strings.Split(strings.ToLower(style), ";") {
+		parts := strings.SplitN(decl, ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+		prop := strings.TrimSpace(parts[0])
+		value := parts[1]
+		// "!important" only affects precedence, not the value itself.
+		if i := strings.Index(value, "!"); i >= 0 {
+			value = value[:i]
+		}
+		value = strings.TrimSpace(value)
+
+		switch prop {
+		case "display":
+			if value == "none" {
+				return true
+			}
+		case "visibility":
+			if value == "hidden" {
+				return true
+			}
+		case "opacity":
+			// Accept every spelling of zero ("0", "0.0", ".0", "0%"):
+			// the number must contain a zero digit and nothing but
+			// zeros and dots.
+			num := strings.TrimSuffix(value, "%")
+			if strings.Contains(num, "0") && strings.Trim(num, "0.") == "" {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// hasAttr reports whether element carries an attribute called name, matched
+// case-insensitively, regardless of its value.
+func hasAttr(element *html.Node, name string) bool {
+	if _, present := attrVal(element, name); present {
+		return true
+	}
+	return false
+}
+
+// attrVal looks up the first attribute of element whose key matches name
+// case-insensitively. It returns the attribute's value and true, or "" and
+// false when there is no such attribute.
+func attrVal(element *html.Node, name string) (string, bool) {
+	attrs := element.Attr
+	for i := range attrs {
+		if !strings.EqualFold(attrs[i].Key, name) {
+			continue
+		}
+		return attrs[i].Val, true
+	}
+	return "", false
+}
+
+// escapeString returns str with HTML special characters escaped, as done by
+// html.EscapeString.
+func escapeString(str string) string {
+	escaped := html.EscapeString(str)
+	return escaped
+}
+
+func getVisibleText(element *html.Node) string {
+	txt := element.Data
+	return escapeString(strings.TrimSpace(txt))
+}
+
+// isElementValid reports whether element should be kept when the tree is
+// minified. Whitespace-only text nodes are dropped; a node named "hierarchy"
+// and any node that has children are always kept; everything else is kept
+// only when isElementVisible accepts it.
+func isElementValid(element *html.Node) bool {
+	switch {
+	case element.Type == html.TextNode && strings.TrimSpace(element.Data) == "":
+		return false
+	case element.Data == "hierarchy":
+		return true
+	case element.FirstChild != nil:
+		// This check is essential: on iOS there are cases where a parent in
+		// the hierarchy is marked as not visible although its children are
+		// visible, so element visibility cannot be trusted for nodes that
+		// have children.
+		return true
+	}
+	return isElementVisible(element)
+}
+
+// attrsToMap converts attrs into a map from attribute key to HTML-escaped
+// value. When a key occurs more than once the last occurrence wins. The
+// returned map is never nil.
+func attrsToMap(attrs []html.Attribute) map[string]string {
+	result := map[string]string{}
+	for i := range attrs {
+		key, val := attrs[i].Key, attrs[i].Val
+		result[key] = escapeString(val)
+	}
+	return result
+}
+
+// PrintLocatrs writes locatrs to standard output as a single line of the form
+// ['a', 'b', 'c'], followed by a newline.
+// nolint:unused
+func PrintLocatrs(locatrs []string) {
+	quoted := make([]string, len(locatrs))
+	for i := range locatrs {
+		quoted[i] = "'" + locatrs[i] + "'"
+	}
+	fmt.Println("[" + strings.Join(quoted, ", ") + "]")
+}
+
+// createElementSpec builds the ElementSpec for element and, recursively, for
+// each of its children. root is the node XPath lookups are evaluated against
+// while computing the element's locator; the element id is derived from that
+// locator.
+//
+// It returns an error when isElementValid rejects element. Children that are
+// rejected are left out of the result rather than failing the whole call.
+func createElementSpec(element *html.Node, root *html.Node) (*types.ElementSpec, error) {
+	if valid := isElementValid(element); !valid {
+		return nil, fmt.Errorf("not a valid element")
+	}
+
+	visibleText := getVisibleText(element)
+	locator := GetOptimalXPath(NewHTMLDoc(root), NewHTMLNode(element))
+	id := utils.GenerateUniqueId(locator)
+
+	// Start from an empty, non-nil slice so that an element without valid
+	// children still carries an empty list.
+	kids := []types.ElementSpec{}
+	for c := element.FirstChild; c != nil; c = c.NextSibling {
+		childSpec, err := createElementSpec(c, root)
+		if err != nil || childSpec == nil {
+			continue
+		}
+		kids = append(kids, *childSpec)
+	}
+
+	spec := &types.ElementSpec{
+		TagName:    element.Data,
+		Id:         id,
+		Attributes: attrsToMap(element.Attr),
+		Text:       visibleText,
+		Children:   kids,
+	}
+	return spec, nil
+}
+
+// MinifySource parses source as HTML and returns the ElementSpec tree rooted
+// at the first element node of the parsed document.
+//
+// It returns an error when source is empty, when parsing fails, when the
+// parsed document contains no element node, or when the first element is
+// rejected by createElementSpec.
+func MinifySource(source string) (*types.ElementSpec, error) {
+	if source == "" {
+		return nil, fmt.Errorf("source is empty")
+	}
+	root, err := htmlquery.Parse(strings.NewReader(source))
+	if err != nil {
+		return nil, err
+	}
+
+	node := findFirstElementNode(root)
+	if node == nil {
+		return nil, fmt.Errorf("no element node found in source")
+	}
+
+	// Locators must be computed against the parsed document, exactly as
+	// CreateLocatorMap does: element ids are derived from the locator, so
+	// using a different query root here could yield ids that are missing
+	// from the locator map.
+	return createElementSpec(node, root)
+}
+
+// CreateLocatorMap parses source as HTML and returns a map from element id to
+// the XPath locators of that element (a single locator per id).
+//
+// The walk starts at the first element node of the parsed document and only
+// descends into children accepted by isElementValid. Elements for which no
+// locator is produced are left out. It returns an error when source is empty
+// or cannot be parsed.
+func CreateLocatorMap(source string) (map[string][]string, error) {
+	if source == "" {
+		return nil, fmt.Errorf("source is empty")
+	}
+	root, err := htmlquery.Parse(strings.NewReader(source))
+	if err != nil {
+		return nil, err
+	}
+
+	doc := NewHTMLDoc(root)
+	locators := make(map[string][]string)
+
+	var walk func(*html.Node)
+	walk = func(elem *html.Node) {
+		if xpath := GetOptimalXPath(doc, NewHTMLNode(elem)); xpath != "" {
+			locators[utils.GenerateUniqueId(xpath)] = []string{xpath}
+		}
+
+		for c := elem.FirstChild; c != nil; c = c.NextSibling {
+			if !isElementValid(c) {
+				continue
+			}
+			walk(c)
+		}
+	}
+	walk(findFirstElementNode(root))
+
+	return locators, nil
+}