Skip to content

Commit c4ecbc4

Browse files
authored
Merge pull request #17 from Salmondx/master
FindStrict and FindAllStrict
2 parents f5e1492 + 24b113c commit c4ecbc4

File tree

3 files changed

+137
-10
lines changed

3 files changed

+137
-10
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ func Cookie(string, string) // Takes key, value pair to set as cookies to be sen
1818
func HTMLParse(string) Root // Takes the HTML string as an argument, returns a pointer to the DOM constructed
1919
func Find([]string) Root // Element tag,(attribute key-value pair) as argument, pointer to first occurrence returned
2020
func FindAll([]string) []Root // Same as Find(), but pointers to all occurrences returned
21+
func FindStrict([]string) Root // Element tag,(attribute key-value pair) as argument, pointer to the first occurrence whose attribute value matches exactly is returned
22+
func FindAllStrict([]string) []Root // Same as FindStrict(), but pointers to all occurrences returned
2123
func FindNextSibling() Root // Pointer to the next sibling of the Element in the DOM returned
2224
func FindNextElementSibling() Root // Pointer to the next element sibling of the Element in the DOM returned
2325
func FindPrevSibling() Root // Pointer to the previous sibling of the Element in the DOM returned

soup.go

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ var debug = false
2626
// Headers contains all HTTP headers to send
2727
var Headers = make(map[string]string)
2828

29-
3029
// Cookies contains all HTTP cookies to send
3130
var Cookies = make(map[string]string)
3231

@@ -62,7 +61,7 @@ func GetWithClient(url string, client *http.Client) (string, error) {
6261
// Set cookies
6362
for cName, cValue := range Cookies {
6463
req.AddCookie(&http.Cookie{
65-
Name: cName,
64+
Name: cName,
6665
Value: cValue,
6766
})
6867
}
@@ -118,7 +117,7 @@ func HTMLParse(s string) Root {
118117
// with or without attribute key and value specified,
119118
// and returns a struct with a pointer to it
120119
func (r Root) Find(args ...string) Root {
121-
temp, ok := findOnce(r.Pointer, args, false)
120+
temp, ok := findOnce(r.Pointer, args, false, false)
122121
if ok == false {
123122
if debug {
124123
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
@@ -133,14 +132,44 @@ func (r Root) Find(args ...string) Root {
133132
// and returns an array of structs, each having
134133
// the respective pointers
135134
func (r Root) FindAll(args ...string) []Root {
136-
temp := findAllofem(r.Pointer, args)
135+
// strict=false: an attribute matches when any whitespace-separated token of its value equals args[2].
temp := findAllofem(r.Pointer, args, false)
137136
if len(temp) == 0 {
138137
if debug {
139138
// With the package-level debug flag set, a miss panics instead of returning.
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
140139
}
141140
// Otherwise a miss yields an empty, non-nil slice.
return []Root{}
142141
}
143-
pointers := make([]Root, 0, 10)
142+
// Capacity equals the number of matches, so the appends below never grow the slice.
pointers := make([]Root, 0, len(temp))
143+
for i := 0; i < len(temp); i++ {
144+
// Each Root carries the matched node, that node's Data, and a nil error.
pointers = append(pointers, Root{temp[i], temp[i].Data, nil})
145+
}
146+
return pointers
147+
}
148+
149+
// FindStrict returns the first element with tag name args[0] whose attribute
150+
// args[1] has a value exactly equal to args[2] (whole-string comparison).
// On a miss it panics if the package-level debug flag is set; otherwise it
// returns a Root holding a nil node and a non-nil error.
// NOTE(review): behavior when only a tag name is passed depends on findOnce's
// no-attribute branch — confirm against findOnce.
151+
func (r Root) FindStrict(args ...string) Root {
152+
// uni=false, strict=true (see findOnce's parameter list).
temp, ok := findOnce(r.Pointer, args, false, true)
153+
if ok == false {
154+
if debug {
155+
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
156+
}
157+
return Root{nil, "", errors.New("element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")}
158+
}
159+
return Root{temp, temp.Data, nil}
160+
}
161+
162+
// FindAllStrict finds all occurrences of the given tag name
163+
// whose given attribute has exactly the provided value (whole-string match)
164+
func (r Root) FindAllStrict(args ...string) []Root {
165+
temp := findAllofem(r.Pointer, args, true)
166+
if len(temp) == 0 {
167+
if debug {
168+
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
169+
}
170+
return []Root{}
171+
}
172+
pointers := make([]Root, 0, len(temp))
144173
for i := 0; i < len(temp); i++ {
145174
pointers = append(pointers, Root{temp[i], temp[i].Data, nil})
146175
}
@@ -253,12 +282,16 @@ checkNode:
253282
}
254283

255284
// Using depth first search to find the first occurrence and return
256-
func findOnce(n *html.Node, args []string, uni bool) (*html.Node, bool) {
285+
func findOnce(n *html.Node, args []string, uni bool, strict bool) (*html.Node, bool) {
257286
if uni == true {
258287
if n.Type == html.ElementNode && n.Data == args[0] {
259288
if len(args) > 1 && len(args) < 4 {
260289
for i := 0; i < len(n.Attr); i++ {
261-
if n.Attr[i].Key == args[1] && n.Attr[i].Val == args[2] {
290+
attr := n.Attr[i]
291+
searchAttrName := args[1]
292+
searchAttrVal := args[2]
293+
if (strict && attributeAndValueEquals(attr, searchAttrName, searchAttrVal)) ||
294+
(!strict && attributeContainsValue(attr, searchAttrName, searchAttrVal)) {
262295
return n, true
263296
}
264297
}
@@ -269,7 +302,7 @@ func findOnce(n *html.Node, args []string, uni bool) (*html.Node, bool) {
269302
}
270303
uni = true
271304
for c := n.FirstChild; c != nil; c = c.NextSibling {
272-
p, q := findOnce(c, args, true)
305+
p, q := findOnce(c, args, true, strict)
273306
if q != false {
274307
return p, q
275308
}
@@ -278,15 +311,19 @@ func findOnce(n *html.Node, args []string, uni bool) (*html.Node, bool) {
278311
}
279312

280313
// Using depth first search to find all occurrences and return
281-
func findAllofem(n *html.Node, args []string) []*html.Node {
314+
func findAllofem(n *html.Node, args []string, strict bool) []*html.Node {
282315
var nodeLinks = make([]*html.Node, 0, 10)
283316
var f func(*html.Node, []string, bool)
284317
f = func(n *html.Node, args []string, uni bool) {
285318
if uni == true {
286319
if n.Data == args[0] {
287320
if len(args) > 1 && len(args) < 4 {
288321
for i := 0; i < len(n.Attr); i++ {
289-
if n.Attr[i].Key == args[1] && n.Attr[i].Val == args[2] {
322+
attr := n.Attr[i]
323+
searchAttrName := args[1]
324+
searchAttrVal := args[2]
325+
if (strict && attributeAndValueEquals(attr, searchAttrName, searchAttrVal)) ||
326+
(!strict && attributeContainsValue(attr, searchAttrName, searchAttrVal)) {
290327
nodeLinks = append(nodeLinks, n)
291328
}
292329
}
@@ -304,6 +341,25 @@ func findAllofem(n *html.Node, args []string) []*html.Node {
304341
return nodeLinks
305342
}
306343

344+
// attributeAndValueEquals reports whether attr's key equals attribute and its
345+
// entire value string equals value (case-sensitive, no whitespace splitting).
346+
func attributeAndValueEquals(attr html.Attribute, attribute, value string) bool {
347+
return attr.Key == attribute && attr.Val == value
348+
}
349+
350+
// attributeContainsValue reports whether attr's key equals attribute and at
351+
// least one whitespace-separated token of attr.Val equals value.
352+
func attributeContainsValue(attr html.Attribute, attribute, value string) bool {
353+
if attr.Key == attribute {
354+
// strings.Fields splits on runs of white space, so a multi-valued attribute
// such as class="first second" is compared one name at a time. A value that
// itself contains white space can therefore never match here.
for _, attrVal := range strings.Fields(attr.Val) {
355+
if attrVal == value {
356+
return true
357+
}
358+
}
359+
}
360+
return false
361+
}
362+
307363
// Returns a key pair value (like a dictionary) for each attribute
308364
func getKeyValue(attributes []html.Attribute) map[string]string {
309365
var keyvalues = make(map[string]string)

soup_test.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,26 @@ const testHTML = `
4444
</html>
4545
`
4646

47+
// multipleClassesHTML is a fixture whose divs carry zero, one, or several
// space-separated class names, in varying order and nesting depth, for
// exercising token-based versus exact attribute matching.
const multipleClassesHTML = `
48+
<html>
49+
<head>
50+
<title>Sample Application</title>
51+
</head>
52+
<body>
53+
<div class="first second">Multiple classes</div>
54+
<div class="first">Single class</div>
55+
<div class="second first third">Multiple classes inorder</div>
56+
<div>
57+
<div class="first">Inner single class</div>
58+
<div class="first second">Inner multiple classes</div>
59+
<div class="second first">Inner multiple classes inorder</div>
60+
</div>
61+
</body>
62+
</html>
63+
`
64+
4765
var doc = HTMLParse(testHTML)
66+
// multipleClasses is the parsed form of multipleClassesHTML.
var multipleClasses = HTMLParse(multipleClassesHTML)
4867

4968
func TestFind(t *testing.T) {
5069
// Find() and Attrs()
@@ -98,3 +117,53 @@ func TestFindAll(t *testing.T) {
98117
}
99118
}
100119
}
120+
121+
// TestFindAllBySingleClass checks that non-strict FindAll matches a single
// class name inside multi-valued class attributes: six divs in
// multipleClassesHTML list "first", and exactly one lists "third".
func TestFindAllBySingleClass(t *testing.T) {
122+
actual := multipleClasses.FindAll("div", "class", "first")
123+
if len(actual) != 6 {
124+
t.Errorf("Expected 6 elements to be returned. Actual: %d", len(actual))
125+
}
126+
actual = multipleClasses.FindAll("div", "class", "third")
127+
if len(actual) != 1 {
128+
t.Errorf("Expected 1 element to be returned. Actual: %d", len(actual))
129+
}
130+
}
131+
132+
// TestFindBySingleClass checks that non-strict Find returns the first div, in
// document order, whose class attribute lists the requested name among its
// space-separated values.
func TestFindBySingleClass(t *testing.T) {
133+
actual := multipleClasses.Find("div", "class", "first")
134+
if actual.Text() != "Multiple classes" {
135+
t.Errorf("Wrong element returned: %s", actual.Text())
136+
}
137+
actual = multipleClasses.Find("div", "class", "third")
138+
if actual.Text() != "Multiple classes inorder" {
139+
t.Errorf("Wrong element returned: %s", actual.Text())
140+
}
141+
}
142+
143+
// TestFindAllStrict checks that FindAllStrict compares the whole class
// attribute string: the same names in a different order do not match, and
// only divs whose class value is exactly the given string are returned.
func TestFindAllStrict(t *testing.T) {
144+
actual := multipleClasses.FindAllStrict("div", "class", "first second")
145+
if len(actual) != 2 {
146+
t.Errorf("Expected 2 elements to be returned. Actual: %d", len(actual))
147+
}
148+
// No div in the fixture has its classes in this order.
actual = multipleClasses.FindAllStrict("div", "class", "first third second")
149+
if len(actual) != 0 {
150+
t.Errorf("0 elements should be returned")
151+
}
152+
153+
actual = multipleClasses.FindAllStrict("div", "class", "second first third")
154+
if len(actual) != 1 {
155+
t.Errorf("Single item should be returned")
156+
}
157+
}
158+
159+
// TestFindStrict checks that FindStrict skips divs that merely list the class
// among others and returns the first div whose class value is exactly the
// given string, and that a miss is reported through the returned Root's Error.
func TestFindStrict(t *testing.T) {
160+
actual := multipleClasses.FindStrict("div", "class", "first")
161+
if actual.Text() != "Single class" {
162+
t.Errorf("Wrong element returned: %s", actual.Text())
163+
}
164+
165+
// "third" only ever appears alongside other classes in the fixture.
actual = multipleClasses.FindStrict("div", "class", "third")
166+
if actual.Error == nil {
167+
t.Errorf("Element with class \"third\" should not be returned in strict mode")
168+
}
169+
}

0 commit comments

Comments
 (0)