Skip to content

Commit 1fcdea0

Browse files
committed
Implement an intelligent cache for the xpath expression string -> compiled operations (we cache the entire XPathExpression object). Add a test to ensure we are getting savings from this, and to measure how much.
Not in this patch, but additional measurements show that in a 75% hit rate scenario, we only spend 45% as much time, or 2.25 times as fast vs no cache, on the compiling portion.
1 parent e2195b7 commit 1fcdea0

File tree

3 files changed

+242
-2
lines changed

3 files changed

+242
-2
lines changed

AdvancedHTMLParser/xpath/_cache.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
'''
2+
Copyright (c) 2019 Timothy Savannah under terms of LGPLv3. All Rights Reserved.
3+
4+
See LICENSE (https://gnu.org/licenses/lgpl-3.0.txt) for more information.
5+
6+
See: https://github.com/kata198/AdvancedHTMLParser for full information
7+
8+
9+
==INTERNAL==
10+
11+
xpath._cache.py - Internal module for caching recent XPath expression parsings
12+
'''
13+
# vim: set ts=4 sw=4 st=4 expandtab :
14+
15+
import threading
16+
17+
from hashlib import sha1
18+
19+
from ..compat import ensureStringEncoded
20+
21+
__all__ = ('XPathExpressionCache', 'XPathExpressionCacheType', )
22+
23+
# MAX_CACHED_EXPRESSIONS - The maximum number of cached expressions before we perform a clean-up of the cache
24+
MAX_CACHED_EXPRESSIONS = 10
25+
26+
# CLEAR_AT_ONE_TIME - Controls how far the cache is trimmed once it exceeds #MAX_CACHED_EXPRESSIONS entries: the least-recently-used entries are evicted until only ( MAX_CACHED_EXPRESSIONS - CLEAR_AT_ONE_TIME ) remain
27+
CLEAR_AT_ONE_TIME = 3
28+
29+
class XPathExpressionCacheType(object):
    '''
        XPathExpressionCacheType - The type of the XPath Expression Cache.

            This is meant to be used as a singleton, the instance being "XPathExpressionCache"

            Compiled expression objects are stored in a dict keyed by a hash of the expression string,
              and a list of those same keys is kept ordered from least-recently-used (front) to
              most-recently-used (end), so the coldest entries can be evicted when the cache overflows.

            Both containers are only read or modified while holding #cacheLock.
    '''

    def __init__(self):
        '''
            __init__ - Create this object
        '''
        # key -> cached XPathExpression object
        self.cachedCompiledExpressions = {}
        # Keys ordered by recency of use: index 0 is the coldest, the last index is the hottest
        self.recentCachedExpressionStrs = []

        # Guards both of the containers above
        self.cacheLock = threading.Lock()


    @staticmethod
    def getKeyForExpressionStr(expressionStr):
        '''
            getKeyForExpressionStr - Get a unique hash "key" for a given expression str,

                as will be used to cache the compiled expression.

                @param expressionStr <str/unicode/bytes> - The XPath expression str

                @return <str> - The key (hex digest of the sha1 of the encoded expression str)
        '''
        # NOTE(review): ensureStringEncoded is presumed to return a bytes-like value suitable for sha1 -- confirm in ..compat
        expressionStr = ensureStringEncoded(expressionStr)

        return sha1(expressionStr).hexdigest()


    def _markKeyAsHot(self, key):
        '''
            _markKeyAsHot - Move a key to the end (hot side) of the recently-used list.

                The caller MUST already hold #cacheLock.

                @param key <str> - A key as returned by #getKeyForExpressionStr
        '''
        recentKeys = self.recentCachedExpressionStrs

        # Ensure we remove all references, if multiple got in somehow
        while True:
            try:
                recentKeys.remove(key)
            except ValueError:
                break

        # Add single reference to end (hot side) of list
        recentKeys.append(key)


    def getCachedExpression(self, expressionStr):
        '''
            getCachedExpression - Try to get a cached XPathExpression object for a given expression str.

                A hit also marks the entry as the most-recently-used.

                @param expressionStr <str> - The XPath expression str

                @return <XPathExpression/None> - The XPathExpression object, if one was cached, otherwise None
        '''
        key = self.getKeyForExpressionStr(expressionStr)

        # "with" guarantees the lock is released on every exit path, including exceptions
        with self.cacheLock:
            xpathExpressionObj = self.cachedCompiledExpressions.get(key, None)

            if xpathExpressionObj is None:
                return None

            # We got a match, mark it as hot
            self._markKeyAsHot(key)

        # And return the expression obj
        return xpathExpressionObj


    def applyCachedExpressionIfAvailable(self, expressionStr, xpathExpressionObj):
        '''
            applyCachedExpressionIfAvailable - Check if a cached compiled expression object is available, based on the xpath expression string,

                and if it is, update the expression object's members with the cached version.

                @param expressionStr <str> - The XPath expression str

                @param xpathExpressionObj <xpath.expression.XPathExpression> - The expression object

                @return <bool> - True if did apply from cache, False if no match (expression needs to be compiled)
        '''
        cachedExpression = self.getCachedExpression(expressionStr)
        if cachedExpression is None:
            return False

        xpathExpressionObj._copyOperationsFromXPathExpressionObj(cachedExpression)
        return True


    def setCachedExpression(self, expressionStr, xpathExpressionObj):
        '''
            setCachedExpression - Sets the expression object to be cached under a given string

                The entry is marked as most-recently-used. If this pushes the number of cached entries
                  above MAX_CACHED_EXPRESSIONS, the least-recently-used entries are evicted until only
                  ( MAX_CACHED_EXPRESSIONS - CLEAR_AT_ONE_TIME ) remain (never fewer than zero).

                @param expressionStr <str> - The XPath expression str

                @param xpathExpressionObj <XPathExpression> - The XPathExpression object
        '''
        key = self.getKeyForExpressionStr(expressionStr)

        with self.cacheLock:
            self.cachedCompiledExpressions[key] = xpathExpressionObj
            self._markKeyAsHot(key)

            numCachedExpressionStrs = len(self.recentCachedExpressionStrs)
            if numCachedExpressionStrs > MAX_CACHED_EXPRESSIONS:

                # Clamp at zero so a CLEAR_AT_ONE_TIME >= MAX_CACHED_EXPRESSIONS configuration
                #  simply empties the cache instead of leaving stale keys behind
                numRemainingAfterClear = max(MAX_CACHED_EXPRESSIONS - CLEAR_AT_ONE_TIME, 0)

                # Everything before this index is cold and gets evicted; everything from it onward is retained.
                #  Slicing both sides at the same index keeps the key list and the dict in agreement
                #  (a "[-0:]" slice would have retained the entire list).
                firstRetainedIdx = numCachedExpressionStrs - numRemainingAfterClear

                # Gather and remove overflow
                for keyToRemove in self.recentCachedExpressionStrs[ : firstRetainedIdx ]:
                    self.cachedCompiledExpressions.pop(keyToRemove, None)

                # Retain references to remaining
                self.recentCachedExpressionStrs = self.recentCachedExpressionStrs[ firstRetainedIdx : ]
166+
167+
# XPathExpressionCache - The singleton instance of the XPath Expression Cache. Use this instead of creating a new XPathExpressionCacheType()
168+
XPathExpressionCache = XPathExpressionCacheType()
169+
170+
171+
# vim: set ts=4 sw=4 st=4 expandtab :

AdvancedHTMLParser/xpath/expression.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@
1010
'''
1111
# vim: set ts=4 sw=4 st=4 expandtab :
1212

13+
import copy
14+
1315
from ..Tags import TagCollection, AdvancedTag
1416

1517
from ._debug import getXPathDebug
1618
from .exceptions import XPathParseError
1719
from .operation import XPathOperation
1820
from .parsing import parseXPathStrIntoOperations
19-
21+
from ._cache import XPathExpressionCache
2022

2123
__all__ = ('XPathExpression', )
2224

@@ -35,7 +37,28 @@ def __init__(self, xpathStr):
3537
'''
3638

3739
self.xpathStr = xpathStr
38-
self.orderedOperations = parseXPathStrIntoOperations(self.xpathStr)
40+
41+
# Check if we've recently compiled this string, and copy the compiled operations, if so.
42+
wasCached = XPathExpressionCache.applyCachedExpressionIfAvailable( xpathStr, self )
43+
44+
if wasCached is False:
45+
# No cached entity found, compile this string
46+
self.orderedOperations = parseXPathStrIntoOperations(self.xpathStr)
47+
48+
# Save compiled expression in the expression cache
49+
XPathExpressionCache.setCachedExpression( xpathStr, self )
50+
51+
52+
def _copyOperationsFromXPathExpressionObj(self, otherXPathExpressionObj):
    '''
        _copyOperationsFromXPathExpressionObj - Replace the operations on this object with a copy of those on another XPathExpression object.

            Whatever is currently assigned to this object's "orderedOperations" is discarded.

            The replacement is made with copy.copy, i.e. it is a shallow copy of the other object's "orderedOperations".


            @param otherXPathExpressionObj <XPathExpression> - The XPathExpression object whose operations will be copied
    '''
    sourceOperations = otherXPathExpressionObj.orderedOperations
    self.orderedOperations = copy.copy(sourceOperations)
3962

4063

4164
def evaluate(self, pathRoot):

tests/AdvancedHTMLParserTests/test_XPath.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Test some xpath!
44
'''
55

6+
import time
67
import subprocess
78
import sys
89

@@ -446,6 +447,51 @@ def test_xpathContains(self):
446447
assert item3Found is True , 'Expected to find div id="item3" but did not!'
447448

448449

450+
def test_xpathCache(self):
    '''
        test_xpathCache - Test that the cache is working

            Runs the same workload of XPath queries twice: once with the expression cache active,
              and once with cache lookups forced to always miss. Asserts the cached run took less time.
    '''
    from AdvancedHTMLParser.xpath._cache import XPathExpressionCache

    parser = self.parser

    def runWorkload():
        '''
            runWorkload - Run 200 rounds of queries and return the elapsed wall-clock seconds.

                Each round has three repeating xpath strings, plus one which is unique per round.
        '''
        startTime = time.time()

        for i in range(200):
            # Generate three based off a repeating xpath string
            parser.getElementsByXPathExpression('//span[ @name = "itemName" and contains( text(), "Pudding" ) ]')
            parser.getElementsByXPathExpression('''//*[ @id = "it" || "em" || "3" ]''')
            parser.getElementsByXPathExpression('//*[ (@name = "itemName") and contains( text(), "Pudding" ) ]')
            # and one random junk that will never hit on cache
            parser.getElementsByXPathExpression('''//*[ @id = "it" || "em" || "3" || "%d" || "%d" ]''' %(i, (i+1)*3) )

        return time.time() - startTime

    timeWithCache = runWorkload()

    # Temporarily disable caching by nuking getCachedExpression to always return "miss"
    #  (assigned on the instance, so the replacement is called without "self")
    oldGetCachedExpression = XPathExpressionCache.getCachedExpression
    XPathExpressionCache.getCachedExpression = lambda expressionStr : None

    try:
        timeWithoutCache = runWorkload()
    finally:
        # Restore caching, even if the workload raised, so the cache is not left disabled
        XPathExpressionCache.getCachedExpression = oldGetCachedExpression

    timeWithoutCache = round(timeWithoutCache, 7)
    timeWithCache = round(timeWithCache, 7)
    print ( "No Cache: %.7f" %( timeWithoutCache, ))
    print ( "W/ Cache: %.7f" %( timeWithCache, ))

    assert timeWithCache < timeWithoutCache , 'Expected compiling XPath strings to be faster when caching the compiled result, but was not.\nTime with cache : %.7f\nTime without cache: %.7f' %( timeWithCache, timeWithoutCache)
449495

450496
if __name__ == '__main__':
451497
sys.exit(subprocess.Popen('GoodTests.py -n1 "%s" %s' %(sys.argv[0], ' '.join(['"%s"' %(arg.replace('"', '\\"'), ) for arg in sys.argv[1:]]) ), shell=True).wait())

0 commit comments

Comments
 (0)