Skip to content

Commit cd0929a

Browse files
authored
Merge pull request #114 from philipmat/dotnet_parser
.NET Core based parser - 2x speedup
2 parents c765fb4 + 2632f89 commit cd0929a

32 files changed

+2092
-7
lines changed

.github/workflows/dotnet.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# This workflow restores, builds, tests and publishes the .NET Core parser that lives in alternatives/dotnet.
# For more information see GitHub's "Building and testing .NET" guide.

name: DotNet Build

on: [ push, pull_request ]
#  push:
#    branches: [ develop ]
#  pull_request:
#    branches: [ develop ]

jobs:
  build:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Applies to `run:` steps only; `uses:` steps (checkout, upload-artifact) still work from the workspace root.
        working-directory: ./alternatives/dotnet

    steps:
    - uses: actions/checkout@v2
    - name: Setup .NET Core
      uses: actions/setup-dotnet@v1
      with:
        dotnet-version: 3.1.x
    - name: Install dependencies
      run: |
        pwd
        dotnet restore
    - name: Build
      run: dotnet build --configuration Release --no-restore
    - name: Test
      run: dotnet test --no-restore --verbosity normal

    # Publish one self-contained, single-file, trimmed executable per target runtime.
    - name: Publish
      run: |
        dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-linux -r linux-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true
        dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-osx -r osx-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true
        dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-win -r win-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true

    # Artifact paths are relative to the workspace (repository root) instead of absolute runner paths,
    # so the uploads keep working when the repository is forked or renamed.
    - name: Upload build artifacts - linux
      uses: actions/upload-artifact@v2
      with:
        name: discogsxml2db-linux-x64
        path: alternatives/dotnet/artifacts/discogs-linux
    - name: Upload build artifacts - macOS
      uses: actions/upload-artifact@v2
      with:
        name: discogsxml2db-osx-x64
        path: alternatives/dotnet/artifacts/discogs-osx
    - name: Upload build artifacts - Win
      uses: actions/upload-artifact@v2
      with:
        name: discogsxml2db-win-x64
        path: alternatives/dotnet/artifacts/discogs-win

README.md

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
# discogs-xml2db v2.0
1+
# discogs-xml2db v2
22

33
discogs-xml2db is a python program for importing [discogs data dumps](https://data.discogs.com/)
44
into several databases.
55

6-
Version 2.0 is a rewrite of the original *discogs-xml2db*
6+
Version 2 is a rewrite of the original *discogs-xml2db*
77
(referred to here as the *classic* version).
88
It is based on a [branch by RedApple](https://github.com/redapple/discogs-xml2db)
99
and it is several times faster.
@@ -12,6 +12,30 @@ Currently supports MySQL and PostgreSQL as target databases.
1212
Instructions for importing into MongoDB, though these are untested.
1313
Let us know how it goes!
1414

15+
## Experimental version
16+
17+
In parallel to the original Python codebase, we're working on a parser/exporter
18+
that's even faster. This is a complete rewrite in C# and initial results are highly
19+
promising:
20+
21+
| File | Record Count | Python | C# |
22+
| --- | ---: | :---: | :---: |
23+
| discogs_20200806_artists.xml.gz | 7,046,615 | 6:22 | 2:35 |
24+
| discogs_20200806_labels.xml.gz | 1,571,873 | 1:15 | 0:22 |
25+
| discogs_20200806_masters.xml.gz | 1,734,371 | 3:56 | 1:57 |
26+
| discogs_20200806_releases.xml.gz | 12,867,980 | 1:45:16 | 42:38 |
27+
28+
If you're interested in testing this version, read more about it
29+
in the [.NET Parser README](./alternatives/dotnet/README.md) or grab
30+
the appropriate binaries from the
31+
[Releases page](https://github.com/philipmat/discogs-xml2db/releases).
32+
33+
While this version does not yet have complete feature parity with the Python
34+
version, the core export-to-csv is there and it's likely it will
35+
eventually replace it.
36+
37+
![DotNet Build](https://github.com/philipmat/discogs-xml2db/workflows/DotNet%20Build/badge.svg)
38+
1539
## Running discogs-xml2db
1640

1741
![Build Status - develop](https://github.com/philipmat/discogs-xml2db/workflows/Python%20build%20check/badge.svg?branch=develop)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
// Use IntelliSense to find out which attributes exist for C# debugging
3+
// Use hover for the description of the existing attributes
4+
// For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"name": ".NET Core Launch (console)",
9+
"type": "coreclr",
10+
"request": "launch",
11+
"preLaunchTask": "build",
12+
// If you have changed target frameworks, make sure to update the program path.
13+
// "program": "${workspaceFolder}/dotnet.sln",
14+
"program": "${workspaceFolder}/discogs/bin/Debug/netcoreapp3.1/discogs.dll",
15+
"args": ["--verbose", "${input:runOptions}", "${input:testFiles}"],
16+
"cwd": "${workspaceFolder}",
17+
// For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console
18+
"console": "internalConsole",
19+
"stopAtEntry": false
20+
},
21+
{
22+
"name": ".NET Core Attach",
23+
"type": "coreclr",
24+
"request": "attach",
25+
"processId": "${command:pickProcess}"
26+
}
27+
],
28+
"inputs": [
29+
{
30+
"id": "runOptions",
31+
"description": "What options",
32+
"type": "pickString",
33+
"options": [
34+
"",
35+
"--dry-run",
36+
"--gz",
37+
]
38+
},
39+
{
40+
"id": "testFiles",
41+
"description": "What file to process?",
42+
"type": "pickString",
43+
"options": [
44+
"",
45+
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/artist.xml",
46+
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/label.xml",
47+
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/master.xml",
48+
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/release.xml",
49+
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_artists.xml.gz",
50+
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_labels.xml.gz",
51+
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_masters.xml.gz",
52+
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_releases.xml.gz",
53+
"/Users/af59986/Dev/tmp/discogs/discogs_20200806_labels.xml.gz",
54+
]
55+
}
56+
]
57+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"version": "2.0.0",
3+
"tasks": [
4+
{
5+
"label": "build",
6+
"command": "dotnet",
7+
"type": "process",
8+
"args": [
9+
"build",
10+
"${workspaceFolder}/dotnet.sln",
11+
"/property:GenerateFullPaths=true",
12+
"/consoleloggerparameters:NoSummary"
13+
],
14+
"problemMatcher": "$msCompile"
15+
},
16+
{
17+
"label": "publish",
18+
"command": "dotnet",
19+
"type": "process",
20+
"args": [
21+
"publish",
22+
"${workspaceFolder}/discogs/discogs.csproj",
23+
"/property:GenerateFullPaths=true",
24+
"/consoleloggerparameters:NoSummary"
25+
],
26+
"problemMatcher": "$msCompile"
27+
},
28+
{
29+
"label": "watch",
30+
"command": "dotnet",
31+
"type": "process",
32+
"args": [
33+
"watch",
34+
"run",
35+
"${workspaceFolder}/discogs/discogs.csproj",
36+
"/property:GenerateFullPaths=true",
37+
"/consoleloggerparameters:NoSummary"
38+
],
39+
"problemMatcher": "$msCompile"
40+
}
41+
]
42+
}

alternatives/dotnet/README.md

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Discogs .NET Parser
2+
3+
This alternative `discogsxml2db` is written in C# and runs on Microsoft .NET Core.
4+
5+
It provides a significant speedup over the python version:
6+
7+
| File | Record Count | Python | C# |
8+
| --- | ---: | :---: | :---: |
9+
| discogs_20200806_artists.xml.gz | 7,046,615 | 6:22 | 2:35 |
10+
| discogs_20200806_labels.xml.gz | 1,571,873 | 1:15 | 0:22 |
11+
| discogs_20200806_masters.xml.gz | 1,734,371 | 3:56 | 1:57 |
12+
| discogs_20200806_releases.xml.gz | 12,867,980 | 1:45:16 | 42:38 |
13+
14+
## Features
15+
16+
**Done**:
17+
18+
- parsing all four discogs dumps, both *.xml* and *.xml.gz*;
19+
- exporting to csv and compressed csv. Produces the exact same
20+
files that the Python version does;
21+
- displaying progress of import/export process;
22+
- "dry runs": only parsing the files and displaying counts,
23+
not producing any csv files;
24+
25+
**TODO**:
26+
27+
- option to track progress display against the most recently reported
28+
discogs record counts (`--api-counts` argument);
29+
- option to import the resulting csv files into the database;
30+
this process is currently manual or done through the python DB-specific
31+
scripts;
32+
- option to specify the output folder for csv files;
33+
34+
## Installing
35+
36+
Unlike the Python version, this version requires no installation.
37+
38+
Simply download the archive appropriate for your platform. Unzip,
39+
and you should have 2 files: a `discogs` executable (or `discogs.exe` on
40+
Windows) and a "discogs.pdb" support file.
41+
42+
That's it.
43+
44+
## Running
45+
46+
Executing `discogs` without any parameters or passing `--help` will
47+
output a list of available arguments:
48+
49+
```text
50+
Usage: discogs [options] [files...]
51+
52+
Options:
53+
54+
--dry-run Parse the files, output counts, but don't write any actual files
55+
--verbose More verbose output
56+
--gz Compress output files (gzip)
57+
files... Path to discogs_[date]_[type].xml, or .xml.gz files.
58+
Can specify multiple files.
59+
```
60+
61+
To export one or more discogs xml files to csv, simply pass them as parameters
62+
to the executable: `discogs /tmp/discogs_20200806_artists.xml.gz /tmp/discogs_20200806_labels.xml.gz`.
63+
64+
Currently, the program exports the csv files in the same folder as each of the
65+
original xml files. If you would like the csv files to be compressed to `.csv.gz`,
66+
pass the `--gz` argument: `discogs --gz /tmp/discogs_20200806_artists.xml.gz`.
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.IO.Compression;
5+
using System.Linq;
6+
using System.Threading.Tasks;
7+
8+
namespace discogs
9+
{
10+
/// <summary>
/// A disposable sink that accepts records one at a time and is notified
/// once the caller has no more records to hand over.
/// </summary>
/// <typeparam name="T">
/// Record type being exported; it must implement <c>IExportToCsv</c> and
/// expose a public parameterless constructor.
/// </typeparam>
public interface IExporter<T> : IDisposable where T : IExportToCsv, new()
{
    /// <summary>Finishes the export after the last record has been handed over.</summary>
    /// <param name="finalCount">Record count supplied by the caller.</param>
    /// <returns>A task that completes when the exporter has finished.</returns>
    Task CompleteExportAsync(int finalCount);

    /// <summary>Exports a single record.</summary>
    /// <param name="value">The record to export.</param>
    /// <returns>A task that completes when the record has been handed to the exporter.</returns>
    Task ExportAsync(T value);
}
16+
17+
/// <summary>
/// <see cref="IExporter{T}"/> implementation that writes records to CSV files.
/// One file is created per entry of the export scheme returned by
/// <c>T.GetCsvExportScheme()</c>; each file starts with that entry's header row.
/// </summary>
/// <typeparam name="T">Record type that knows how to render itself as CSV rows.</typeparam>
public class CsvExporter<T> : IExporter<T>
    where T : IExportToCsv, new()
{
    private const int BufferSize = 1024 * 1024;
    private readonly string _typeName;
    private readonly Dictionary<string, (string FilePath, StreamWriter FileStream)> _csvStreams;
    private bool _disposed;

    /// <summary>
    /// Creates the output files for <typeparamref name="T"/> and writes their header rows.
    /// </summary>
    /// <param name="outPutDirectory">Directory in which the CSV files are created.</param>
    /// <param name="compress">
    /// When <c>true</c>, files are gzip-compressed and get a <c>.csv.gz</c> extension;
    /// otherwise plain <c>.csv</c> files are written.
    /// </param>
    /// <param name="verbose">Currently not used by this class.</param>
    public CsvExporter(string outPutDirectory, bool compress = false, bool verbose = false)
    {
        _typeName = typeof(T).Name.Split('.')[^1];
        _csvStreams = GetCsvFilesFor(outPutDirectory, compress);
    }

    /// <summary>
    /// Flushes and closes every CSV file, then prints a one-line summary to the console.
    /// </summary>
    /// <param name="finalCount">Number of records to report in the summary line.</param>
    public async Task CompleteExportAsync(int finalCount)
    {
        var csvFileNames = string.Join("; ", _csvStreams.Select(kvp => kvp.Value.FilePath));
        foreach (var kvp in _csvStreams)
        {
            await kvp.Value.FileStream.FlushAsync();
            kvp.Value.FileStream.Close();
        }

        Console.WriteLine($"Found {finalCount:n0} {_typeName}s. Wrote them to {csvFileNames}.");
    }

    /// <summary>
    /// Writes every row produced by <paramref name="value"/> to the file registered
    /// under the row's stream name.
    /// </summary>
    /// <param name="value">The record to export.</param>
    /// <exception cref="KeyNotFoundException">
    /// The record produced a stream name that is not part of the export scheme.
    /// </exception>
    public async Task ExportAsync(T value)
    {
        IEnumerable<(string StreamName, string[] Row)> csvExports = value.ExportToCsv();
        foreach (var (streamName, row) in csvExports)
        {
            await _csvStreams[streamName].FileStream.WriteLineAsync(CsvExtensions.ToCsv(row));
        }
    }

    /// <summary>
    /// Opens one writer per entry of the export scheme and writes its header row.
    /// If any file fails to open or initialize, every writer opened so far is
    /// disposed before the exception propagates, so no file handles are leaked.
    /// </summary>
    private static Dictionary<string, (string FilePath, StreamWriter FileStream)> GetCsvFilesFor(string outPutDirectory, bool compress)
    {
        IReadOnlyDictionary<string, string[]> scheme = new T().GetCsvExportScheme();
        var extension = compress ? "csv.gz" : "csv";
        var csvFiles = new Dictionary<string, (string FilePath, StreamWriter FileStream)>(scheme.Count);

        try
        {
            foreach (var kvp in scheme)
            {
                var csvFile = Path.Combine(outPutDirectory, $"{kvp.Key}.{extension}");
                StreamWriter writer = OpenWriter(csvFile, compress);

                // Register the writer before touching it so the cleanup below also covers a failed header write.
                csvFiles.Add(kvp.Key, (csvFile, writer));
                writer.WriteLine(CsvExtensions.ToCsv(kvp.Value));
            }
        }
        catch
        {
            // The constructor is about to fail, so the caller will never get an instance to dispose.
            foreach (var (_, opened) in csvFiles.Values)
            {
                try
                {
                    opened.Dispose();
                }
                catch (IOException)
                {
                    // Best effort: the original failure rethrown below is the one worth reporting.
                }
            }

            throw;
        }

        return csvFiles;
    }

    /// <summary>
    /// Creates (or overwrites) <paramref name="csvFile"/> and returns a UTF-8 writer over it,
    /// wrapped in a gzip stream when <paramref name="compress"/> is set.
    /// </summary>
    private static StreamWriter OpenWriter(string csvFile, bool compress)
    {
        if (!compress)
        {
            return new StreamWriter(csvFile, append: false, encoding: System.Text.Encoding.UTF8, bufferSize: BufferSize);
        }

        FileStream fs = File.Create(csvFile, bufferSize: BufferSize);
        try
        {
            // leaveOpen: false makes the gzip stream own the file stream, so disposing the writer closes both.
            var gzStream = new GZipStream(fs, CompressionMode.Compress, leaveOpen: false);
            return new StreamWriter(gzStream, encoding: System.Text.Encoding.UTF8);
        }
        catch
        {
            fs.Dispose();
            throw;
        }
    }

    /// <summary>Disposes all CSV writers held by this exporter.</summary>
    public void Dispose()
    {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the writers on the first call; later calls do nothing.
    /// </summary>
    /// <param name="disposing">
    /// <c>true</c> when called from <see cref="Dispose()"/>; managed writers are only disposed in that case.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if (_disposed)
        {
            return;
        }

        if (disposing)
        {
            foreach (var (_, writer) in _csvStreams.Values)
            {
                writer.Dispose();
            }
        }

        _disposed = true;
    }
}
114+
}

0 commit comments

Comments
 (0)