Skip to content

Commit 6bd3731

Browse files
0xDEC0DEnisimond
andauthored
Refactor the YumFinder (#38)
* Refactor the YumFinder Implement our own XML fetch/parse routines for consuming repomd data, and create a simple lookup dict from the results rather than trying to shoehorn in use of the `repomd.findall` library call. This yields a 30%-50% speedup in the functional tests. * Drop yum.lookup_in_repomd function Stepping through the refactored workflow showed that the YumFinder was deep-copying a dict into a subprocess, looking up a value, and returning the result. Surely, there must be a better way. Co-Authored-By: Nicolas Simonds <nisimond@cisco.com>
1 parent cfec165 commit 6bd3731

File tree

2 files changed

+77
-101
lines changed

2 files changed

+77
-101
lines changed

soufi/finders/yum.py

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,18 @@
22
# All rights reserved.
33

44
import abc
5-
import lzma
5+
import gzip
6+
import pathlib
67
import pickle # nosec
78
import sys
89
import textwrap
910
import warnings
1011
from multiprocessing import Process, Queue
1112
from types import SimpleNamespace
1213

14+
import defusedxml.lxml
1315
import repomd
16+
import requests
1417
from dogpile.cache.backends.null import NullBackend
1518

1619
from soufi import exceptions, finder
@@ -116,12 +119,12 @@ def _walk_source_repos(self, name, version=None):
116119
version = self.version
117120
locations = set()
118121
for repo_url in self.generate_source_repos():
119-
baseurl, repo_xml = self._cache.get_or_create(
122+
baseurl, repo = self._cache.get_or_create(
120123
f"repo-{repo_url}",
121124
do_task,
122125
creator_args=([get_repomd, repo_url], {}),
123126
)
124-
for package in do_task(lookup_in_repomd, baseurl, repo_xml, name):
127+
for package in repo.get(name, []):
125128
# If the package version in the repomd is our version,
126129
# it's easy. Note that we want to match epoch-full and
127130
# epoch-less version formats.
@@ -144,12 +147,12 @@ def _walk_source_repos(self, name, version=None):
144147
def _walk_binary_repos(self, name):
145148
packages = set()
146149
for repo_url in self.generate_binary_repos():
147-
baseurl, repo_xml = self._cache.get_or_create(
150+
_, repo = self._cache.get_or_create(
148151
f"repo-{repo_url}",
149152
do_task,
150153
creator_args=([get_repomd, repo_url], {}),
151154
)
152-
for package in do_task(lookup_in_repomd, baseurl, repo_xml, name):
155+
for package in repo.get(name, []):
153156
# If we have a binary package matching our version, but
154157
# with a different name than the corresponding source
155158
# package, return the NVR fields
@@ -263,18 +266,50 @@ def do_task(target, *args):
263266
return response
264267

265268

266-
# NOTE(nic): To allow for serializing repomd objects, We will instead
267-
# re-serialize the object into its original XML, then re-create and use a
268-
# new Repo object from the cached XML on every cache hit. This makes cache
269-
# hits relatively expensive, but they're still orders of magnitude faster
270-
# than the alternative. The XML is LZMA-compressed here to keep cache
271-
# storage utilization low, and in the case of Redis caching, to reduce the
272-
# amount of cache traffic sent over the wire.
269+
# NOTE(nic): stolen almost verbatim from repomd.load, except this one:
270+
# - has timeouts
271+
# - uses requests instead of urllib to do the heavy lifting
272+
# - uses `lxml.parse` and file objects instead of `lxml.fromstring`
273+
# - returns a plain dict instead of a Repo object
274+
def load_repomd(url):
275+
timeout = YumFinder.timeout
276+
baseurl = requests.utils.parse_url(url)
277+
path = pathlib.PurePosixPath(baseurl.path)
278+
279+
# first we must get the repomd.xml file
280+
repomd_path = path / 'repodata' / 'repomd.xml'
281+
repomd_url = baseurl._replace(path=str(repomd_path))
282+
283+
with requests.get(repomd_url, stream=True, timeout=timeout) as r:
284+
r.raw.decode_content = True
285+
repomd_xml = defusedxml.lxml.parse(r.raw)
286+
287+
# determine the location of *primary.xml.gz from the repomd.xml
288+
primary_element = repomd_xml.find(
289+
'repo:data[@type="primary"]/repo:location', namespaces=repomd._ns
290+
)
291+
primary_path = path / primary_element.get('href')
292+
primary_url = baseurl._replace(path=str(primary_path))
293+
294+
# download and consume *-primary.xml into a dict object for fast lookups.
295+
# We will use repomd.Package objects rather than reimplement them,
296+
# but only as an intermediate step (see `serialize_package`, below)
297+
repo = {}
298+
with requests.get(primary_url, stream=True, timeout=timeout) as r:
299+
r.raw.decode_content = True
300+
with gzip.GzipFile(fileobj=r.raw) as uncompressed:
301+
for element in defusedxml.lxml.parse(uncompressed).getroot():
302+
package = repomd.Package(element)
303+
repo.setdefault(package.name, [])
304+
repo[package.name].append(serialize_package(package))
305+
return repo
306+
307+
273308
def get_repomd(queue, url):
274309
if not url.endswith('/'):
275310
url += '/'
276311
try:
277-
repo = repomd.load(url)
312+
repo = load_repomd(url)
278313
except Exception as e:
279314
# Try and send exceptions back to the caller, just like anything
280315
# else. It's up to the receiver to inspect and re-raise. If the
@@ -289,17 +324,18 @@ def get_repomd(queue, url):
289324

290325
queue.put((e,), timeout=YumFinder.timeout)
291326
return
292-
payload = repomd.defusedxml.lxml.tostring(repo._metadata)
293-
queue.put((str(repo.baseurl), lzma.compress(payload)))
327+
queue.put((url, repo))
294328

295329

296330
# NOTE(nic): repomd.Package object properties do XPath lookups into the
297331
# ElementTree and other similar tricks on the backend, which makes them
298332
# unsuitable for the IPC-based workflow we're trying to use, so we'll
299-
# convert them into simple objects with identical names so that they can be
300-
# easily pickled and passed around. The upside is that we only need to
301-
# carry over what we need to do package lookups. The downside is that we only
302-
# get what we've carried over.
333+
# convert them into simple objects with identical field names so that they
334+
# can be easily pickled and passed around. The upside is that we only need
335+
# to carry over what we need to do package lookups, reducing the space
336+
# requirements by at least two orders of magnitude. The downside is that we
337+
# only get what we've carried over, but in practice that's the opposite of
338+
# a problem.
303339
def serialize_package(package):
304340
return SimpleNamespace(
305341
name=str(package.name),
@@ -309,12 +345,3 @@ def serialize_package(package):
309345
location=str(package.location),
310346
sourcerpm=str(package.sourcerpm),
311347
)
312-
313-
314-
def lookup_in_repomd(queue, baseurl, repomd_xml, name):
315-
if None in (baseurl, repomd_xml):
316-
queue.put([], timeout=YumFinder.timeout)
317-
return
318-
payload = lzma.decompress(repomd_xml)
319-
repo = repomd.Repo(baseurl, repomd.defusedxml.lxml.fromstring(payload))
320-
queue.put([serialize_package(p) for p in repo.findall(name)])

soufi/tests/finders/test_yum_finder.py

Lines changed: 22 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
# Copyright (c) 2021 Cisco Systems, Inc. and its affiliates
22
# All rights reserved.
33
import io
4-
import lzma
54
import string
6-
import urllib
75
from itertools import repeat
86
from unittest import mock
97

@@ -204,7 +202,7 @@ def test__walk_source_repos(self):
204202
finder = self.make_finder(source_repos=src, binary_repos=bin)
205203
package = self.FakePackage(vr=finder.version)
206204
do_task = self.patch(yum, 'do_task')
207-
do_task.side_effect = ([baseurl, None], [package])
205+
do_task.return_value = (baseurl, {finder.name: [package]})
208206
self.patch(finder, 'test_url').return_value = False
209207
url = finder._walk_source_repos(finder.name)
210208
self.assertEqual(baseurl + package.location, url)
@@ -216,7 +214,7 @@ def test__walk_source_repos_different_version_hit(self):
216214
finder = self.make_finder(source_repos=src, binary_repos=bin)
217215
package = self.FakePackage()
218216
do_task = self.patch(yum, 'do_task')
219-
do_task.side_effect = ([baseurl, None], [package])
217+
do_task.return_value = (baseurl, {finder.name: [package]})
220218
self.patch(finder, 'test_url').return_value = True
221219
url = finder._walk_source_repos(finder.name)
222220
self.assertEqual(baseurl + package.location, url)
@@ -228,7 +226,7 @@ def test__walk_source_repos_different_version_miss(self):
228226
finder = self.make_finder(source_repos=src, binary_repos=bin)
229227
package = self.FakePackage()
230228
do_task = self.patch(yum, 'do_task')
231-
do_task.side_effect = ([baseurl, None], [package])
229+
do_task.return_value = (baseurl, {finder.name: [package]})
232230
self.patch(finder, 'test_url').return_value = False
233231
url = finder._walk_source_repos(finder.name)
234232
self.assertIsNone(url)
@@ -243,7 +241,7 @@ def test__walk_binary_repos(self):
243241
srcrpm = self.make_package(n=n, v=v, r=r)
244242
package = self.FakePackage(vr=finder.version, sourcerpm=srcrpm)
245243
do_task = self.patch(yum, 'do_task')
246-
do_task.side_effect = ([None, None], [package])
244+
do_task.return_value = (None, {finder.name: [package]})
247245
name, version = finder._walk_binary_repos(finder.name)
248246
self.expectThat(name, Equals(n))
249247
self.expectThat(version, Equals(f"{v}-{r}"))
@@ -254,7 +252,7 @@ def test__walk_binary_repos_no_sourcerpm(self):
254252
finder = self.make_finder(source_repos=src, binary_repos=bin)
255253
package = self.FakePackage(vr=finder.version, sourcerpm='')
256254
do_task = self.patch(yum, 'do_task')
257-
do_task.side_effect = ([None, None], [package])
255+
do_task.return_value = (None, {finder.name: [package]})
258256
name, version = finder._walk_binary_repos(finder.name)
259257
self.assertIsNone(name)
260258
self.assertIsNone(version)
@@ -266,7 +264,7 @@ def test__walk_binary_repos_different_name_multiple_versions(self):
266264
package1 = self.FakePackage()
267265
package2 = self.FakePackage()
268266
do_task = self.patch(yum, 'do_task')
269-
do_task.side_effect = ([None, None], [package1, package2])
267+
do_task.return_value = (None, {finder.name: [package1, package2]})
270268
name, version = finder._walk_binary_repos(finder.name)
271269
self.assertIsNone(name)
272270
self.assertIsNone(version)
@@ -281,7 +279,7 @@ def test__walk_binary_repos_different_name_different_version(self):
281279
srcrpm = self.make_package(n=n, v=v, r=r)
282280
package = self.FakePackage(sourcerpm=srcrpm)
283281
do_task = self.patch(yum, 'do_task')
284-
do_task.side_effect = ([None, None], [package])
282+
do_task.return_value = (None, {finder.name: [package]})
285283
name, version = finder._walk_binary_repos(finder.name)
286284
self.expectThat(name, Equals(n))
287285
self.expectThat(version, Equals(f"{v}-{r}"))
@@ -355,97 +353,48 @@ def setUp(self):
355353
def test_get_repomd(self):
356354
# Mock up a successful repomd fetch
357355
url = self.factory.make_url()
358-
repo = self.patch(repomd, 'load')
359-
lxml = self.patch(repomd.defusedxml.lxml, 'tostring')
360-
lxml.return_value = b"<xml>Test</xml>"
361-
compress = self.patch(lzma, 'compress')
356+
self.patch(requests, 'get')
357+
lxml = self.patch(repomd.defusedxml.lxml, 'parse')
358+
package = mock.MagicMock()
359+
lxml.return_value.getroot.return_value = [package]
362360

363-
# Ensure that get_repomd re-serializes the object XML and returns a
364-
# compressed payload
365361
yum.get_repomd(self.queue, url)
366-
compress.assert_called_once_with(lxml.return_value)
362+
# Mocking out all the various and sundry Package properties would be
363+
# tedious, and we don't really care about anything past the package
364+
# name anyhow
367365
self.queue.put.assert_called_once_with(
368-
(str(repo.return_value.baseurl), compress.return_value)
366+
(url + "/", {package.findtext.return_value: [mock.ANY]})
369367
)
370368

371369
def test_get_repomd_http_error(self):
372370
# Mock up a failure to fetch the repomd
373371
url = self.factory.make_url()
374-
load = self.patch(repomd, 'load')
375-
load.side_effect = urllib.error.HTTPError(None, None, None, None, None)
376-
lxml = self.patch(repomd.defusedxml.lxml, 'tostring')
377-
compress = self.patch(lzma, 'compress')
372+
load = self.patch(requests, 'get')
373+
load.side_effect = requests.exceptions.HTTPError()
374+
lxml = self.patch(repomd.defusedxml.lxml, 'parse')
378375

379376
# Ensure that get_repomd won't fill the cache with garbage
380377
yum.get_repomd(self.queue, url)
381378
lxml.assert_not_called()
382-
compress.assert_not_called()
383379
self.queue.put.assert_called_once_with((load.side_effect,), timeout=30)
384380

385381
def test_get_repomd_unserializable_http_error(self):
386-
# Ibid, but initializing the HTTPError with a live file pointer will
382+
# Ibid, but initializing the exception with a live file pointer will
387383
# make it refuse to serialize
388384
url = self.factory.make_url()
389385
fp = io.BufferedReader(io.StringIO())
390-
load = self.patch(repomd, 'load')
391-
load.side_effect = urllib.error.HTTPError(None, None, None, None, fp)
392-
lxml = self.patch(repomd.defusedxml.lxml, 'tostring')
393-
compress = self.patch(lzma, 'compress')
386+
load = self.patch(requests, 'get')
387+
load.side_effect = requests.exceptions.RequestException(fp)
388+
lxml = self.patch(repomd.defusedxml.lxml, 'parse')
394389

395390
# Ensure that we get a re-raised plain Exception
396391
yum.get_repomd(self.queue, url)
397392
lxml.assert_not_called()
398-
compress.assert_not_called()
399393
self.queue.put.assert_called_once_with((mock.ANY,), timeout=30)
400394
self.assertIn(
401395
're-raising as plain Exception', str(self.queue.put.call_args)
402396
)
403397

404-
def test_lookup_in_repomd(self):
405-
# Mock up a successful package lookup
406-
baseurl = self.factory.make_url()
407-
repomd_xml = self.factory.make_string()
408-
name = self.factory.make_string()
409-
decompress = self.patch(lzma, 'decompress')
410-
lxml = self.patch(repomd.defusedxml.lxml, 'fromstring')
411-
repo = self.patch(repomd, 'Repo')
412-
package = mock.MagicMock()
413-
repo.return_value.findall.return_value = [package]
414-
415-
# lookup_in_repomd should decompress/rehydrate a repomd object
416-
yum.lookup_in_repomd(self.queue, baseurl, repomd_xml, name)
417-
decompress.assert_called_once_with(repomd_xml)
418-
lxml.assert_called_once_with(decompress.return_value)
419-
repo.assert_called_once_with(baseurl, lxml.return_value)
420-
# The response should be converted into a "simplified package"
421-
self.queue.put.assert_called_once_with(
422-
[yum.serialize_package(package)]
423-
)
424-
425-
def test_lookup_in_repomd_no_baseurl(self):
426-
# Ensure that calling lookup_in_repomd without valid args does no work
427-
repomd_xml = self.factory.make_string()
428-
name = self.factory.make_string()
429-
decompress = self.patch(lzma, 'decompress')
430-
repo = self.patch(repomd, 'Repo')
431-
432-
yum.lookup_in_repomd(self.queue, None, repomd_xml, name)
433-
decompress.assert_not_called()
434-
repo.assert_not_called()
435-
self.queue.put.assert_called_once_with([], timeout=30)
436-
437-
def test_lookup_in_repomd_no_repoxml(self):
438-
# Ibid.
439-
baseurl = self.factory.make_url()
440-
name = self.factory.make_string()
441-
decompress = self.patch(lzma, 'decompress')
442-
repo = self.patch(repomd, 'Repo')
443-
444-
yum.lookup_in_repomd(self.queue, baseurl, None, name)
445-
decompress.assert_not_called()
446-
repo.assert_not_called()
447-
self.queue.put.assert_called_once_with([], timeout=30)
448-
449398
def test_do_task(self):
450399
# Mock up a process that does not exit upon return
451400
data = self.factory.make_string('response')

0 commit comments

Comments
 (0)