Commit fdf5e3b

Add links database reading and API
The $repo.links.tar.gz databases provided in the repositories are now parsed by archweb, so that the linked sonames can be exposed via a simple API and listed in the package details overview.
1 parent 7a10972 commit fdf5e3b

16 files changed, +493 −124 lines

README.md

Lines changed: 2 additions & 0 deletions

@@ -125,6 +125,8 @@ Archweb provides multiple management commands for importing various sorts of data
 * mirrorresolv - Poll every active mirror URL and determine whether it has IPv4 and/or IPv6 addresses.
 * populate_signoffs - retrieves the latest commit message of a signoff-eligible package.
 * update_planet - Import all feeds for users who have a valid website and website_rss in their user profile.
+* read_links - Reads a repo.links.tar.gz file and updates the Soname model.
+* read_links_inotify - Watches a templated path for updates of *.links.tar.gz files and updates the Soname model with them.
 
 # Updating iPXE image

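Both commands can also be driven from Python via Django's management API. A minimal sketch, assuming a configured archweb environment; the links database path here is illustrative, not taken from this commit:

from django.core.management import call_command

# one-shot import of a single links database into the Soname model
call_command('read_links', '/srv/ftp/core/os/x86_64/core.links.tar.gz')

# long-running watcher using the default path template
# '/srv/ftp/%(repo)s/os/%(arch)s/' (blocks in the notifier loop)
call_command('read_links_inotify')
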
main/management/commands/archweb_inotify.py

Lines changed: 139 additions & 0 deletions

@@ -0,0 +1,139 @@
import logging
import multiprocessing
import os
import pyinotify
import sys
import threading
import time

from django.db.utils import OperationalError


logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


class Database(object):
    '''An object representing a pacman database on the filesystem. It stores
    various bits of metadata and state representing the file path, when we
    last updated, how long our delay is before performing the update, whether
    we are updating now, etc.'''

    def __init__(self, arch, path, callback_func, delay=60.0, nice=3, retry_limit=5):
        self.arch = arch
        self.path = path
        self.delay = delay
        self.nice = nice
        self.retry_limit = retry_limit
        self.mtime = None
        self.last_import = None
        self.update_thread = None
        self.updating = False
        self.run_again = False
        self.lock = threading.Lock()
        self.callback_func = callback_func

    def _start_update_countdown(self):
        self.update_thread = threading.Timer(self.delay, self.update)
        logger.info('Starting %.1f second countdown to update %s',
                    self.delay, self.path)
        self.update_thread.start()

    def queue_for_update(self, mtime):
        logger.debug('Queueing database %s...', self.path)
        with self.lock:
            self.mtime = mtime
            if self.updating:
                # store the fact that we will need to run it again
                self.run_again = True
                return
            if self.update_thread:
                self.update_thread.cancel()
                self.update_thread = None
            self._start_update_countdown()

    def update(self):
        logger.debug('Updating database %s...', self.path)
        with self.lock:
            self.last_import = time.time()
            self.updating = True

        try:
            # invoke the import callback. we do this in a separate process
            # for memory conservation purposes; these processes grow rather
            # large so it is best to free up the memory ASAP.
            # A retry mechanism exists for when reporead_inotify runs on a
            # different machine.
            def run():
                retry = True
                retry_count = 0
                if self.nice != 0:
                    os.nice(self.nice)
                while retry and retry_count < self.retry_limit:
                    try:
                        self.callback_func(self.arch, self.path, {})
                        retry = False
                    except OperationalError as exc:
                        retry_count += 1
                        logger.error('Unable to update database \'%s\', retrying=%d',
                                     self.path, retry_count, exc_info=exc)
                        time.sleep(5)

                if retry_count == self.retry_limit:
                    logger.error('Unable to update database, exceeded maximum retries')

            process = multiprocessing.Process(target=run)
            process.start()
            process.join()
        finally:
            logger.debug('Done updating database %s.', self.path)
            with self.lock:
                self.update_thread = None
                self.updating = False
                if self.run_again:
                    self.run_again = False
                    self._start_update_countdown()


class EventHandler(pyinotify.ProcessEvent):
    '''Our main event handler which listens for database change events.
    Because we are watching the whole directory, we filter down and only look
    at those events dealing with the database files we care about.'''

    def my_init(self, filename_suffix, callback_func, **kwargs):
        self.databases = {}
        self.arch_lookup = {}

        self.filename_suffix = filename_suffix
        self.callback_func = callback_func

        # we really want a single path to arch mapping, so massage the data
        arch_paths = kwargs['arch_paths']
        for arch, paths in arch_paths.items():
            self.arch_lookup.update((path.rstrip('/'), arch) for path in paths)

    def process_default(self, event):
        '''Primary event processing function which kicks off update timer
        threads if a watched database file was updated.'''
        name = event.name
        if not name:
            return
        # screen to only the files we care about, skipping temp files
        if name.endswith(self.filename_suffix) and not name.startswith('.'):
            path = event.pathname
            stat = os.stat(path)
            database = self.databases.get(path, None)
            if database is None:
                arch = self.arch_lookup.get(event.path, None)
                if arch is None:
                    logger.warning(
                        'Could not determine arch for %s, skipping update',
                        path)
                    return
                database = Database(arch, path, self.callback_func)
                self.databases[path] = database
            database.queue_for_update(stat.st_mtime)


# vim: set ts=4 sw=4 et:
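Database implements a per-file debounce: each inotify event resets a threading.Timer, and the callback fires in a forked worker process only after the path has stayed quiet for `delay` seconds; events arriving during an import set run_again so a fresh countdown starts afterwards. A minimal standalone sketch of that behavior on Linux, assuming archweb's dependencies are importable (the module path is inferred from the relative import in read_links_inotify below; the callback and file path are stand-ins):

import time

from main.management.commands.archweb_inotify import Database


def fake_import(arch, path, options):
    # stand-in for wrapper_read_links(); runs in a child process
    print('would import %s (%s)' % (path, arch))


if __name__ == '__main__':
    db = Database('x86_64', '/tmp/core.links.tar.gz', fake_import, delay=2.0)
    # two events in quick succession: the second cancels the pending timer
    # and restarts the countdown, so fake_import runs exactly once
    db.queue_for_update(time.time())
    db.queue_for_update(time.time())
    time.sleep(3)
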
main/management/commands/read_links_inotify.py

Lines changed: 101 additions & 0 deletions

@@ -0,0 +1,101 @@
import logging
import pyinotify
import sys
import threading

from django.core.management.base import BaseCommand, CommandError
from django.db import connection, transaction

from main.models import Arch, Repo
from .readlinks import read_links
from .archweb_inotify import EventHandler

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


def wrapper_read_links(arch, filepath, obj):
    read_links(filepath)


class Command(BaseCommand):
    help = "Watch links files and run an update when necessary."
    args = "[path_template]"

    def handle(self, path_template=None, **options):
        v = int(options.get('verbosity', 0))
        if v == 0:
            logger.level = logging.ERROR
        elif v == 1:
            logger.level = logging.INFO
        elif v >= 2:
            logger.level = logging.DEBUG

        if not path_template:
            path_template = '/srv/ftp/%(repo)s/os/%(arch)s/'
        self.path_template = path_template

        notifier = self.setup_notifier()
        # this thread is done using the database; all future access is done
        # in the spawned read_links() processes, so close the otherwise
        # completely idle connection.
        connection.close()

        logger.info('Entering notifier loop')
        notifier.loop()

        logger.info('Cancelling remaining threads...')
        for thread in threading.enumerate():
            if hasattr(thread, 'cancel'):
                thread.cancel()

    @transaction.atomic
    def setup_notifier(self):
        '''Set up and configure the inotify machinery and logic.
        This takes the provided or default path_template and builds a list of
        directories we need to watch for database updates. It then validates
        and passes these on to the various pyinotify pieces as necessary and
        finally builds and returns a notifier object.'''
        with transaction.atomic():
            arches = Arch.objects.filter(agnostic=False)
            repos = Repo.objects.all()

        arch_path_map = {arch: None for arch in arches}
        all_paths = set()
        total_paths = 0
        for arch in arches:
            combos = ({'repo': repo.name.lower(), 'arch': arch.name}
                      for repo in repos)
            # take a python format string and generate all unique combinations
            # of directories from it; using set() ensures we filter it down
            paths = {self.path_template % values for values in combos}
            total_paths += len(paths)
            all_paths |= paths
            arch_path_map[arch] = paths

        logger.info('Watching %d total paths', total_paths)
        logger.debug(all_paths)

        # sanity check- basically ensure every path we created from the
        # template mapped to only one architecture
        if total_paths != len(all_paths):
            raise CommandError('path template did not uniquely '
                               'determine architecture for each file')

        # A proper atomic replacement of the database as done by rsync is type
        # IN_MOVED_TO. repo-add/remove will finish with an IN_CLOSE_WRITE.
        mask = pyinotify.IN_CLOSE_WRITE | pyinotify.IN_MOVED_TO

        manager = pyinotify.WatchManager()
        for name in all_paths:
            manager.add_watch(name, mask)

        handler = EventHandler(arch_paths=arch_path_map,
                               filename_suffix='.links.tar.gz',
                               callback_func=wrapper_read_links)
        return pyinotify.Notifier(manager, handler)


# vim: set ts=4 sw=4 et:
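setup_notifier() expands the path template into one watch directory per (repo, arch) pair and aborts if the template does not keep architectures apart. The expansion and its sanity check can be reproduced standalone; the repo and arch names below are illustrative, not read from the database:

path_template = '/srv/ftp/%(repo)s/os/%(arch)s/'
arches = ['x86_64']
repos = ['core', 'extra', 'community']

all_paths = set()
total_paths = 0
for arch in arches:
    combos = ({'repo': repo, 'arch': arch} for repo in repos)
    paths = {path_template % values for values in combos}
    total_paths += len(paths)
    all_paths |= paths

# a template that ignored %(arch)s would map several arches onto the same
# directory; total_paths would then exceed len(all_paths) and the command
# would raise the CommandError above
assert total_paths == len(all_paths)
print(sorted(all_paths))
# ['/srv/ftp/community/os/x86_64/', '/srv/ftp/core/os/x86_64/',
#  '/srv/ftp/extra/os/x86_64/']
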
main/management/commands/read_links.py

Lines changed: 98 additions & 0 deletions

@@ -0,0 +1,98 @@
import logging
import os
import re
import sys
import tarfile

from django.core.management.base import BaseCommand, CommandError

from main.models import Repo, Package, Soname


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


class Command(BaseCommand):
    help = "Import links db (soname mapping)."
    missing_args_message = 'missing links db'

    def add_arguments(self, parser):
        parser.add_argument('args', nargs='*', help='<arch> <filename>')

    def handle(self, filename=None, **options):
        if not filename:
            raise CommandError('Links database file is required.')

        filename = os.path.normpath(filename)
        if not os.path.exists(filename) or not os.path.isfile(filename):
            raise CommandError('Specified links database file does not exist.')

        v = int(options.get('verbosity', 0))
        if v == 0:
            logger.level = logging.ERROR
        elif v == 1:
            logger.level = logging.INFO
        elif v >= 2:
            logger.level = logging.DEBUG

        return read_linksdb(filename)


def get_pkginfo(pkgnamever):
    pkgname, pkgver, pkgrel = pkgnamever.rsplit('-', 2)
    epoch = '0'
    if ':' in pkgver:
        epoch, pkgver = pkgver.split(':')

    return pkgname, epoch, pkgver, pkgrel
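get_pkginfo() relies on rsplit with maxsplit=2: only the last two dash-separated fields are the version and release, so hyphens inside the package name survive, and an optional epoch prefix is split off the version. Worked examples (package strings are illustrative; the import path mirrors the file above):

from main.management.commands.read_links import get_pkginfo

print(get_pkginfo('gcc-libs-8.2.1+20180831-1'))
# -> ('gcc-libs', '0', '8.2.1+20180831', '1')

print(get_pkginfo('vlc-1:3.0.4-6'))
# -> ('vlc', '1', '3.0.4', '6')   (explicit epoch split off the version)
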
def read_linksdb(repopath):
    logger.info("Starting linksdb parsing")
    if not os.path.exists(repopath):
        logger.error("Could not read file %s", repopath)

    logger.info("Reading repo tarfile %s", repopath)
    filename = os.path.split(repopath)[1]
    m = re.match(r"^(.*)\.links\.tar(\..*)?$", filename)
    if m:
        reponame = m.group(1)
    else:
        logger.error("File does not have the proper extension")
        raise Exception("File does not have the proper extension")

    repository = Repo.objects.get(name__iexact=reponame)
    sonames = []

    with tarfile.open(repopath, 'r') as repodb:
        logger.debug("Starting soname parsing")

        for tarinfo in repodb.getmembers():
            if tarinfo.isreg():
                pkgnamever = os.path.dirname(tarinfo.name)
                pkgnamever = pkgnamever.replace('./', '')
                pkgname, epoch, pkgver, pkgrel = get_pkginfo(pkgnamever)

                dbpkg = Package.objects.filter(pkgname=pkgname, pkgver=pkgver,
                                               pkgrel=pkgrel, epoch=epoch,
                                               repo=repository).first()

                if not dbpkg:
                    logger.info("Package name '%s' not found in repo database", pkgname)
                    continue

                files_data = repodb.extractfile(tarinfo)
                old_sonames = Soname.objects.filter(pkg=dbpkg)
                for soname in files_data:
                    soname = soname.strip().decode()
                    # New soname which we do not track yet for this package
                    if not old_sonames.filter(name=soname):
                        sonames.append(Soname(pkg=dbpkg, name=soname))

    if sonames:
        Soname.objects.bulk_create(sonames)
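read_linksdb() derives the repo from the filename, then treats every regular file in the tarball as one package entry: the directory part of the member name is the pkgname-[epoch:]pkgver-pkgrel string, and the file body holds one soname per line. A hedged fixture builder for local testing; the member name 'links' and the bash version are inferred from that layout, not taken from this commit:

import io
import tarfile

# build a minimal core.links.tar.gz in the layout read_linksdb() parses:
# one regular file per package directory, one soname per line
payload = b'libc.so.6\nlibncursesw.so.6\nlibreadline.so.8\n'
with tarfile.open('core.links.tar.gz', 'w:gz') as tar:
    info = tarfile.TarInfo('bash-5.0.003-1/links')  # name/version are examples
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))

# read_linksdb('core.links.tar.gz') would then resolve the 'core' repo from
# the filename and look up bash 5.0.003-1 in that repo before importing.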
