Skip to content

Commit ec1b538

Browse files
committed
Fix getElementsByClassName to support multiple space-separated class names when called on a document (parser), a tag (AdvancedTag), or a TagCollection.
Includes tests.
1 parent 0036f64 commit ec1b538

File tree

3 files changed

+142
-5
lines changed

3 files changed

+142
-5
lines changed

AdvancedHTMLParser/Parser.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,13 +379,20 @@ def getElementsByClassName(self, className, root='root'):
379379
'''
380380
getElementsByClassName - Searches and returns all elements containing a given class name.
381381
382-
@param className <str> - A one-word class name
382+
@param className <str> - One or more space-separated class names
383+
383384
@param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root' [default], the root of the parsed tree will be used.
384385
'''
385386
(root, isFromRoot) = self._handleRootArg(root)
386387

387388
elements = []
388389

390+
# Generate list of all classnames to match
391+
classNames = [x.strip() for x in className.strip().split(' ') if x.strip()]
392+
393+
# Run through entire tree on the first
394+
className = classNames.pop(0)
395+
389396
if isFromRoot is True and className in root.classNames:
390397
elements.append(root)
391398

@@ -397,6 +404,11 @@ def getElementsByClassName(self, className, root='root'):
397404

398405
elements += getElementsByClassName(className, child)
399406

407+
408+
# Check if we need to match against any other names
409+
if len(classNames) > 0:
410+
elements = [ em for em in elements for matchClassName in classNames if matchClassName in em.classList ]
411+
400412
return TagCollection(elements)
401413

402414
def getElementsByAttr(self, attrName, attrValue, root='root'):
@@ -1276,8 +1288,11 @@ def getElementsByClassName(self, className, root='root', useIndex=True):
12761288
'''
12771289
getElementsByClassName - Searches and returns all elements containing a given class name.
12781290
1279-
@param className <str> - A one-word class name
1291+
1292+
@param className <str> - One or more space-separated class names
1293+
12801294
@param root <AdvancedTag/'root'> - Search starting at a specific node, if provided. if string 'root', the root of the parsed tree will be used.
1295+
12811296
@param useIndex <bool> If useIndex is True and class names are indexed [see constructor] only the index will be used. Otherwise a full search is performed.
12821297
'''
12831298
(root, isFromRoot) = self._handleRootArg(root)

AdvancedHTMLParser/Tags.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
def getElementsByClassName(self, className):
    '''
        getElementsByClassName - Search children of this tag (recursively) for tags containing the given class name(s)

        @param className <str> - One or more whitespace-separated class names. When several
          names are given, an element must carry ALL of them to match (order does not matter).
          An empty or whitespace-only value matches nothing.

        @return - TagCollection of matching elements, each listed once
    '''
    elements = []

    # split() with no argument splits on any run of whitespace and never yields
    #  empty tokens, so leading/trailing/double spaces are harmless.
    classNames = className.split()
    if not classNames:
        # Nothing to match against; avoid indexing into an empty list.
        return TagCollection(elements)

    for child in self.children:
        # Require every requested name on the same element (AND). Testing all
        #  names in one pass keeps each element from being appended more than once.
        if all(child.hasClass(name) is True for name in classNames):
            elements.append(child)

        # Descendants are filtered by the same full list of names in the recursive call.
        elements += child.getElementsByClassName(className)

    return TagCollection(elements)
19041916

19051917
def getElementsWithAttrValues(self, attrName, attrValues):
@@ -2391,14 +2403,30 @@ def getElementsByClassName(self, className):
23912403
'''
23922404
getElementsByClassName - Get elements within this collection containing a specific class name
23932405
2394-
@param className - A single class name
2406+
@param className <str> - One or more space-separated class names
23952407
23962408
@return - TagCollection of unique elements within this collection tagged with a specific class name
23972409
'''
23982410
ret = TagCollection()
23992411
if len(self) == 0:
24002412
return ret
2401-
_cmpFunc = lambda tag : tag.hasClass(className)
2413+
2414+
# Check for multiple class names
2415+
classNames = className.split(' ')
2416+
if len(classNames) <= 1:
2417+
# Simple - 1 class name
2418+
_cmpFunc = lambda tag : tag.hasClass(className)
2419+
else:
2420+
# Multiple class names
2421+
def _cmpFunc(tag):
2422+
tagClassList = tag.classList
2423+
2424+
for cName in classNames:
2425+
if cName not in tagClassList:
2426+
return False
2427+
2428+
return True
2429+
24022430
for tag in self:
24032431
TagCollection._subset(ret, _cmpFunc, tag)
24042432

tests/AdvancedHTMLParserTests/test_Attributes.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,5 +1225,99 @@ def test_getElementsByClassName(self):
12251225
assert len(classDash2Ems) == 1 , 'Expected to find 1 element with class name = "class-dash2" but got %d. %s' %( len(classDash2Ems), repr(classDash2Ems))
12261226

12271227

1228+
def test_getElementsByClassNameMultipleClasses(self):
    '''
        test_getElementsByClassNameMultipleClasses -

            Test that the "getElementsByClassName" method supports a space-separated list of
              class names and matches only elements carrying ALL of them (AND), in either
              order, when called on a document (parser), a tag, or a tag collection.

            Both inclusion and exclusion are asserted, so an implementation that
              over-matches (e.g. matches on ANY of the names) is caught as well.
    '''

    htmlStr = '''<html><head></head><body>

<div id="em1" class="outer blue one" name="outerDiv" >

<div id="em2" class="inner blue one" >One</div>
<div id="em3" class="inner blue two" >Two</div>
<div id="em4" class="inner green one" >G One</div>
</div>
</body>
</html>
'''

    document = AdvancedHTMLParser()
    document.parseStr(htmlStr)

    # Test a regular query, one element
    oneEms = document.getElementsByClassName('one')
    oneIds = [ em.id for em in oneEms ]

    assert 'em1' in oneIds , 'Expected to find id="em1" (contains class "one") but did not. Matched: ' + repr(oneIds)
    assert 'em2' in oneIds , 'Expected to find id="em2" (contains class "one") but did not. Matched: ' + repr(oneIds)
    assert 'em3' not in oneIds , 'Did not expect id="em3" (lacks class "one") but it matched. Matched: ' + repr(oneIds)


    # Test multiple, sequential
    blueOneEms = document.getElementsByClassName('blue one')
    blueOneIds = [ em.id for em in blueOneEms ]

    assert 'em1' in blueOneIds , 'Expected to find id="em1" via "blue one" (contains classes "blue" and "one") but did not. Matched: ' + repr(blueOneIds)
    assert 'em2' in blueOneIds , 'Expected to find id="em2" via "blue one" (contains classes "blue" and "one") but did not. Matched: ' + repr(blueOneIds)
    assert 'em3' not in blueOneIds , 'Did not expect id="em3" via "blue one" (lacks class "one") but it matched. Matched: ' + repr(blueOneIds)
    assert 'em4' not in blueOneIds , 'Did not expect id="em4" via "blue one" (lacks class "blue") but it matched. Matched: ' + repr(blueOneIds)


    # Test from a tag
    blueOneEms = document.body.getElementsByClassName('blue one')
    blueOneIds = [ em.id for em in blueOneEms ]
    assert 'em1' in blueOneIds , 'Expected to find id="em1" via "blue one" (contains classes "blue" and "one") from a tag but did not. Matched: ' + repr(blueOneIds)
    assert 'em2' in blueOneIds , 'Expected to find id="em2" via "blue one" (contains classes "blue" and "one") from a tag but did not. Matched: ' + repr(blueOneIds)
    assert 'em3' not in blueOneIds , 'Did not expect id="em3" via "blue one" (lacks class "one") from a tag but it matched. Matched: ' + repr(blueOneIds)
    assert 'em4' not in blueOneIds , 'Did not expect id="em4" via "blue one" (lacks class "blue") from a tag but it matched. Matched: ' + repr(blueOneIds)


    # Test from a tag collection
    divs = document.getElementsByTagName('div')
    blueOneEms = divs.getElementsByClassName('blue one')
    blueOneIds = [ em.id for em in blueOneEms ]
    assert 'em1' in blueOneIds , 'Expected to find id="em1" via "blue one" (contains classes "blue" and "one") through tag collection but did not. Matched: ' + repr(blueOneIds)
    assert 'em2' in blueOneIds , 'Expected to find id="em2" via "blue one" (contains classes "blue" and "one") through tag collection but did not. Matched: ' + repr(blueOneIds)
    assert 'em3' not in blueOneIds , 'Did not expect id="em3" via "blue one" (lacks class "one") through tag collection but it matched. Matched: ' + repr(blueOneIds)
    assert 'em4' not in blueOneIds , 'Did not expect id="em4" via "blue one" (lacks class "blue") through tag collection but it matched. Matched: ' + repr(blueOneIds)



    # Try reverse order
    oneBlueEms = document.getElementsByClassName('one blue')
    oneBlueIds = [ em.id for em in oneBlueEms ]


    assert 'em1' in oneBlueIds, 'Expected to find id="em1" via "one blue" (contains classes "blue" and "one") but did not. Matched: ' + repr(oneBlueIds)
    assert 'em2' in oneBlueIds, 'Expected to find id="em2" via "one blue" (contains classes "blue" and "one") but did not. Matched: ' + repr(oneBlueIds)
    assert 'em3' not in oneBlueIds, 'Did not expect id="em3" via "one blue" (lacks class "one") but it matched. Matched: ' + repr(oneBlueIds)
    assert 'em4' not in oneBlueIds, 'Did not expect id="em4" via "one blue" (lacks class "blue") but it matched. Matched: ' + repr(oneBlueIds)


    # Try reverse order from a tag
    oneBlueEms = document.body.getElementsByClassName('one blue')
    oneBlueIds = [ em.id for em in oneBlueEms ]

    assert 'em1' in oneBlueIds, 'Expected to find id="em1" via "one blue" (contains classes "blue" and "one") from a tag but did not. Matched: ' + repr(oneBlueIds)
    assert 'em2' in oneBlueIds, 'Expected to find id="em2" via "one blue" (contains classes "blue" and "one") from a tag but did not. Matched: ' + repr(oneBlueIds)
    assert 'em3' not in oneBlueIds, 'Did not expect id="em3" via "one blue" (lacks class "one") from a tag but it matched. Matched: ' + repr(oneBlueIds)
    assert 'em4' not in oneBlueIds, 'Did not expect id="em4" via "one blue" (lacks class "blue") from a tag but it matched. Matched: ' + repr(oneBlueIds)


    # Try reverse order from a tagcollection
    divs = document.getElementsByTagName('div')
    oneBlueEms = divs.getElementsByClassName('one blue')
    oneBlueIds = [ em.id for em in oneBlueEms ]

    assert 'em1' in oneBlueIds, 'Expected to find id="em1" via "one blue" (contains classes "blue" and "one") from a tagcollection but did not. Matched: ' + repr(oneBlueIds)
    assert 'em2' in oneBlueIds, 'Expected to find id="em2" via "one blue" (contains classes "blue" and "one") from a tagcollection but did not. Matched: ' + repr(oneBlueIds)
    assert 'em3' not in oneBlueIds, 'Did not expect id="em3" via "one blue" (lacks class "one") from a tagcollection but it matched. Matched: ' + repr(oneBlueIds)
    assert 'em4' not in oneBlueIds, 'Did not expect id="em4" via "one blue" (lacks class "blue") from a tagcollection but it matched. Matched: ' + repr(oneBlueIds)



    # Try others
    innerBlueEms = document.getElementsByClassName('inner blue')
    innerBlueIds = [ em.id for em in innerBlueEms ]

    assert 'em2' in innerBlueIds , 'Expected to find id="em2" via "inner blue" ( contains classes "blue" and "inner") but did not. Matched: ' + repr(innerBlueIds)
    assert 'em3' in innerBlueIds , 'Expected to find id="em3" via "inner blue" ( contains classes "blue" and "inner") but did not. Matched: ' + repr(innerBlueIds)
    assert 'em1' not in innerBlueIds , 'Did not expect id="em1" via "inner blue" (lacks class "inner") but it matched. Matched: ' + repr(innerBlueIds)
    assert 'em4' not in innerBlueIds , 'Did not expect id="em4" via "inner blue" (lacks class "blue") but it matched. Matched: ' + repr(innerBlueIds)

    # Sanity check: lookup by name still finds exactly the one outer div
    outerDivNames = document.getElementsByName('outerDiv')

    assert len(outerDivNames) == 1 , 'Expected to find div name="outerDiv" but did not.'
1320+
1321+
12281322
if __name__ == '__main__':
    # When executed directly, hand this file (plus any extra command-line
    #  arguments) to the GoodTests.py runner and exit with its return code.

    # Wrap each extra argument in double quotes, escaping embedded double quotes.
    quotedArgs = ['"%s"' %(arg.replace('"', '\\"'), ) for arg in sys.argv[1:]]
    runnerCmd = 'GoodTests.py -n1 "%s" %s' %(sys.argv[0], ' '.join(quotedArgs) )

    runnerProc = subprocess.Popen(runnerCmd, shell=True)
    sys.exit(runnerProc.wait())

0 commit comments

Comments
 (0)