Skip to content

Commit 3ba6ba9

Browse files
authored
Merge pull request #120 from philipmat/112_specify_files_directly
Allows specifying dump files individually
2 parents cd0929a + 654a7de commit 3ba6ba9

File tree

4 files changed

+72
-23
lines changed

4 files changed

+72
-23
lines changed

README.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,23 +86,41 @@ $ sha256sum -c discogs_*_CHECKSUM.txt
8686

8787
Run `run.py` to convert the dump files to csv.
8888

89+
There are two run modes:
90+
91+
1. You can point it to a directory where the discogs dump files are
92+
and use one or multiple `--export` options to indicate which files to process:
93+
8994
```sh
9095
# ensure the virtual environment is active
9196
(.discogsenv) $ python3 run.py \
9297
--bz2 \ # compresses resulting csv files
9398
--apicounts \ # provides more accurate progress counts
9499
--export artist --export label --export master --export release \
100+
--output csv-dir \ # folder where to output the csv files
95101
dump-dir # folder where the data dumps are
96-
csv-dir # folder where to output the csv files
102+
```
103+
104+
2. You can specify the individual files instead:
105+
106+
```sh
107+
# ensure the virtual environment is active
108+
(.discogsenv) $ python3 run.py \
109+
--bz2 \ # compresses resulting csv files
110+
--apicounts \ # provides more accurate progress counts
111+
--output csv-dir \ # folder where to output the csv files
112+
path/to/discogs_20200806_artists.xml.gz path/to/discogs_20200806_labels.xml.gz
97113
```
98114

99115
`run.py` takes the following arguments:
100116

101117
- `--export`: the types of dump files to export: "artist", "label", "master", "release".
102118
It matches the names of the dump files, e.g. "discogs_20200806_*artist*s.xml.gz"
119+
Not needed if the individual files are specified.
103120
- `--bz2`: Compresses output csv files using bz2 compression library.
104121
- `--limit=<lines>`: Limits export to some number of entities
105122
- `--apicounts`: Makes progress report more accurate by getting total amounts from Discogs API.
123+
- `--output`: the folder where to store the csv files; defaults to the current directory
106124

107125
The exporter provides progress information in real time:
108126

discogsxml2db/exporter.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,19 @@ def _write_rows(writer, entity, name):
3737
class EntityCsvExporter(object):
3838
"""Read a Discogs dump XML file and exports SQL table records as CSV.
3939
"""
40-
def __init__(self, entity, in_dir, out_dir,
40+
def __init__(self, entity, in_file_or_dir, out_dir,
4141
limit=None, bz2=True,
4242
dry_run=False, debug=False, max_hint=None, verbose=False):
4343
self.entity = entity
4444
self.parser = _parsers[entity]()
4545
self.max_hint = max_hint
4646
self.verbose = verbose
4747

48-
lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity)
49-
self.pattern = os.path.join(in_dir, lookup)
48+
if os.path.isfile(in_file_or_dir):
49+
self.pattern = in_file_or_dir
50+
else:
51+
lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity)
52+
self.pattern = os.path.join(in_file_or_dir, lookup)
5053

5154
# where and how the exporter will write to
5255
self.out_dir = out_dir
@@ -287,8 +290,7 @@ def write_track_artists(self, writer, release):
287290

288291

289292
def main(arguments):
290-
in_base = arguments['INPUT']
291-
out_base = arguments['OUTPUT'] or '.'
293+
out_base = arguments['--output'] or '.'
292294
limit = int(arguments['--limit']) if arguments['--limit'] else None
293295
bz2_on = arguments['--bz2']
294296
debug = arguments['--debug']
@@ -312,14 +314,38 @@ def main(arguments):
312314
except Exception:
313315
pass
314316

315-
for entity in arguments['--export']:
316-
expected_count = rough_counts['{}s'.format(entity)]
317-
exporter = _exporters[entity](
318-
in_base,
319-
out_base,
320-
limit=limit,
321-
bz2=bz2_on,
322-
debug=debug,
323-
max_hint=min(expected_count, limit or expected_count),
324-
dry_run=dry_run)
325-
exporter.export()
317+
if arguments["INPUT_DIR"] and os.path.isdir(arguments["INPUT_DIR"]):
318+
# use --export to select the entities
319+
in_base = arguments['INPUT_DIR']
320+
for entity in arguments['--export']:
321+
expected_count = rough_counts['{}s'.format(entity)]
322+
exporter = _exporters[entity](
323+
in_base,
324+
out_base,
325+
limit=limit,
326+
bz2=bz2_on,
327+
debug=debug,
328+
max_hint=min(expected_count, limit or expected_count),
329+
dry_run=dry_run)
330+
exporter.export()
331+
elif arguments["<INPUT_FILE>"] or os.path.isfile(arguments["INPUT_DIR"]):
332+
files = []
333+
if arguments["<INPUT_FILE>"]:
334+
files = arguments["<INPUT_FILE>"]
335+
else:
336+
files = [arguments["INPUT_DIR"]]
337+
for in_file in files:
338+
for entity in _exporters:
339+
# discogs files are named discogs_{date}_{entity}s.xml
340+
if f"_{entity}" in in_file:
341+
expected_count = rough_counts['{}s'.format(entity)]
342+
exporter = _exporters[entity](
343+
in_file,
344+
out_base,
345+
limit=limit,
346+
bz2=bz2_on,
347+
debug=debug,
348+
max_hint=min(expected_count, limit or expected_count),
349+
dry_run=dry_run)
350+
exporter.export()
351+
break

run.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*-
33
"""Usage:
4-
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] INPUT [OUTPUT] [--export=<entity>]...
4+
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] [--output=<dir>] <INPUT_FILE> <INPUT_FILE>...
5+
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] [--output=<dir>] INPUT_DIR [--export=<entity>]...
56
67
Options:
78
--bz2 Compress output files using bz2 compression library.
8-
--limit=<lines> Limit export to some number of entities
9-
--export=<entity> Limit export to some entities (repeatable)
9+
--limit=<lines> Limit export to some number of entities (all otherwise)
10+
--export=<entity> Limit export to some entities (repeatable).
11+
Entity is one of: artist, label, master, release.
1012
--debug Turn on debugging prints
1113
--apicounts Check entities counts with Discogs API
12-
--dry-run Do not write
14+
--dry-run Do not write csv files.
15+
--output=<dir> Where to write the csv files. Defaults to current dir.
1316
1417
"""
1518
import sys
@@ -20,4 +23,6 @@
2023

2124
if __name__ == '__main__':
2225
arguments = docopt(__doc__, version='Discogs-to-SQL exporter')
26+
if arguments["--debug"]:
27+
print(arguments)
2328
sys.exit(main(arguments))

tests/test_extract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ def _check_counts(self, entity, tmp_path):
7676
# - export=label
7777

7878
arguments = {
79-
"INPUT": self._samples_folder,
80-
"OUTPUT": tmp_path,
79+
"INPUT_DIR": self._samples_folder,
80+
"--output": tmp_path,
8181
"--export": [entity],
8282
"--limit": None,
8383
"--bz2": False,

0 commit comments

Comments
 (0)