diff --git a/README.md b/README.md index 78c44f4f3..7c7528683 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,83 @@ In order to load a psql session, use the following command: ./develop.sh psql +### Set up the MusicBrainz Server + +A MusicBrainz database containing all the MusicBrainz metadata is needed for +setting up your application. The ``mbdump.tar.bz2`` file is the core MusicBrainz +archive, which includes the tables for artist, release_group, etc. +The ``mbdump-derived.tar.bz2`` archive contains annotations, user tags and search indexes. +These archives include all the data required for setting up an instance of +AcousticBrainz. + +You can import the database dumps by downloading and importing the data in +a single command: + + docker-compose -f docker/docker-compose.dev.yml run musicbrainz_db + +**Note** + +One can also manually download the dumps and then import them: + + 1. For this, you have to download the dumps ``mbdump.tar.bz2`` and ``mbdump-derived.tar.bz2`` + from http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport/. + + **Warning** + + Make sure to get the latest dumps. + + 2. Then the environment variable ``DUMPS_DIR`` must be set to the path of the + folder containing the dumps. This can be done by: + + export DUMPS_DIR="Path of the folder containing the dumps" + + You can check that the variable ``DUMPS_DIR`` has been successfully assigned by running: + + echo $DUMPS_DIR + + This must display the path of your folder containing the database dumps. The folder must contain at least the file ``mbdump.tar.bz2``. + + 3. Then import the database dumps by this command: + + docker-compose -f docker/docker-compose.dev.yml run -v $DUMPS_DIR:/home/musicbrainz/dumps \ + -v $PWD/data/mbdata:/var/lib/postgresql/data/pgdata musicbrainz_db + +**Note** + + You can also use the smaller sample dumps available at http://ftp.musicbrainz.org/pub/musicbrainz/data/sample/ + to set up the MusicBrainz database.
However, note that these dumps are .tar.xz + dumps while AcousticBrainz currently only supports import of .tar.bz2 dumps. + So the sample dumps must be decompressed and then recompressed into .tar.bz2 dumps + before importing. This can be done using the following command: + + xzcat mbdump-sample.tar.xz | bzip2 > mbdump.tar.bz2 + +**Warning** + +Keep in mind that this process is very time-consuming, so make sure that you don't delete the ``data/mbdata`` directory accidentally. Also make sure that you have about 25GB of free space to keep the MusicBrainz data. + +Initialization of the AcousticBrainz database is also required: + + ./develop.sh run --rm webserver python2 manage.py init_db + +Then you can start all the services: + + ./develop.sh up --build + +## Initialize the MusicBrainz database: + + ./develop.sh run --rm webserver python2 manage.py init_mb_db + +## Import the MusicBrainz database into the AcousticBrainz database: + + ./develop.sh run --rm webserver python2 manage.py import_musicbrainz_db + +### Manually + +Full installation instructions are available in the [INSTALL.md](https://github.com/metabrainz/acousticbrainz-server/blob/master/INSTALL.md) file. After installing, continue with the following steps. + +## Configuration and development + ### Building static files We use webpack as our JavaScript/CSS build system.
diff --git a/admin/sql/create_musicbrainz_foreign_keys.sql b/admin/sql/create_musicbrainz_foreign_keys.sql new file mode 100644 index 000000000..d5280aadc --- /dev/null +++ b/admin/sql/create_musicbrainz_foreign_keys.sql @@ -0,0 +1,175 @@ +BEGIN; + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_type + FOREIGN KEY (type) + REFERENCES musicbrainz.artist_type(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_area + FOREIGN KEY (area) + REFERENCES musicbrainz.area(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_gender + FOREIGN KEY (gender) + REFERENCES musicbrainz.gender(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_begin_area + FOREIGN KEY (begin_area) + REFERENCES musicbrainz.area(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_end_area + FOREIGN KEY (end_area) + REFERENCES musicbrainz.area(id); + +ALTER TABLE musicbrainz.artist_credit_name + ADD CONSTRAINT artist_credit_name_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id) + ON DELETE CASCADE; + +ALTER TABLE musicbrainz.artist_credit_name + ADD CONSTRAINT artist_credit_name_fk_artist + FOREIGN KEY (artist) + REFERENCES musicbrainz.artist(id) + ON DELETE CASCADE; + +ALTER TABLE musicbrainz.artist_gid_redirect + ADD CONSTRAINT artist_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.artist(id); + +ALTER TABLE musicbrainz.area + ADD CONSTRAINT area_fk_type + FOREIGN KEY (type) + REFERENCES musicbrainz.area_type(id); + +ALTER TABLE musicbrainz.area_type + ADD CONSTRAINT area_type_fk_parent + FOREIGN KEY (parent) +REFERENCES musicbrainz.area_type(id); + +ALTER TABLE musicbrainz.recording + ADD CONSTRAINT recording_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id); + +ALTER TABLE musicbrainz.recording_gid_redirect + ADD CONSTRAINT recording_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.recording(id); + +ALTER 
TABLE musicbrainz.release + ADD CONSTRAINT release_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_release_group + FOREIGN KEY (release_group) + REFERENCES musicbrainz.release_group(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_status + FOREIGN KEY (status) + REFERENCES musicbrainz.release_status(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_packaging + FOREIGN KEY (packaging) + REFERENCES musicbrainz.release_packaging(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_language + FOREIGN KEY (language) + REFERENCES musicbrainz.language(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_script + FOREIGN KEY (script) + REFERENCES musicbrainz.script(id); + +ALTER TABLE musicbrainz.release_gid_redirect + ADD CONSTRAINT release_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.release(id); + +ALTER TABLE musicbrainz.track + ADD CONSTRAINT track_fk_recording + FOREIGN KEY (recording) + REFERENCES musicbrainz.recording(id); + +ALTER TABLE musicbrainz.track + ADD CONSTRAINT track_fk_medium + FOREIGN KEY (medium) + REFERENCES musicbrainz.medium(id); + +ALTER TABLE musicbrainz.track + ADD CONSTRAINT track_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id); + +ALTER TABLE musicbrainz.track_gid_redirect + ADD CONSTRAINT track_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.track(id); + +ALTER TABLE musicbrainz.release_group + ADD CONSTRAINT release_group_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id); + +ALTER TABLE musicbrainz.release_group + ADD CONSTRAINT release_group_fk_type + FOREIGN KEY (type) + REFERENCES musicbrainz.release_group_primary_type(id); + +ALTER TABLE musicbrainz.release_group_primary_type + ADD CONSTRAINT release_group_primary_type_fk_parent + FOREIGN 
KEY (parent) + REFERENCES musicbrainz.release_group_primary_type(id); + +ALTER TABLE musicbrainz.release_group_gid_redirect + ADD CONSTRAINT release_group_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.release_group(id); + +ALTER TABLE musicbrainz.medium + ADD CONSTRAINT medium_fk_release + FOREIGN KEY (release) + REFERENCES musicbrainz.release(id); + +ALTER TABLE musicbrainz.medium + ADD CONSTRAINT medium_fk_format + FOREIGN KEY (format) + REFERENCES musicbrainz.medium_format(id); + +ALTER TABLE musicbrainz.medium_format + ADD CONSTRAINT medium_format_fk_parent + FOREIGN KEY (parent) +REFERENCES musicbrainz.medium_format(id); + +ALTER TABLE musicbrainz.release_status + ADD CONSTRAINT release_status_fk_parent + FOREIGN KEY (parent) + REFERENCES musicbrainz.release_status(id); + +ALTER TABLE musicbrainz.release_packaging + ADD CONSTRAINT release_packaging_fk_parent + FOREIGN KEY (parent) + REFERENCES musicbrainz.release_packaging(id); + +ALTER TABLE musicbrainz.gender + ADD CONSTRAINT gender_fk_parent + FOREIGN KEY (parent) + REFERENCES musicbrainz.gender(id); + +ALTER TABLE musicbrainz.artist_type + ADD CONSTRAINT artist_type_fk_parent + FOREIGN KEY (parent) + REFERENCES musicbrainz.artist_type(id); + +COMMIT; diff --git a/admin/sql/create_musicbrainz_indexes.sql b/admin/sql/create_musicbrainz_indexes.sql new file mode 100644 index 000000000..8b5707655 --- /dev/null +++ b/admin/sql/create_musicbrainz_indexes.sql @@ -0,0 +1,67 @@ +BEGIN; + +CREATE UNIQUE INDEX artist_idx_gid ON musicbrainz.artist (gid); +CREATE INDEX artist_idx_name ON musicbrainz.artist (name); +CREATE INDEX artist_idx_sort_name ON musicbrainz.artist (sort_name); +CREATE INDEX artist_idx_area ON musicbrainz.artist (area); +CREATE INDEX artist_idx_begin_area ON musicbrainz.artist (begin_area); +CREATE INDEX artist_idx_end_area ON musicbrainz.artist (end_area); + +CREATE UNIQUE INDEX artist_idx_null_comment ON musicbrainz.artist (name) WHERE comment IS NULL; +CREATE UNIQUE INDEX 
artist_idx_uniq_name_comment ON musicbrainz.artist (name, comment) WHERE comment IS NOT NULL; + +CREATE UNIQUE INDEX area_type_idx_gid ON musicbrainz.area_type (gid); + +CREATE UNIQUE INDEX area_idx_gid ON musicbrainz.area (gid); +CREATE INDEX area_idx_name ON musicbrainz.area (name); + +CREATE INDEX artist_credit_name_idx_artist ON musicbrainz.artist_credit_name (artist); + +CREATE UNIQUE INDEX recording_idx_gid ON musicbrainz.recording (gid); +CREATE INDEX recording_idx_name ON musicbrainz.recording (name); +CREATE INDEX recording_idx_artist_credit ON musicbrainz.recording (artist_credit); + +CREATE UNIQUE INDEX release_idx_gid ON musicbrainz.release (gid); +CREATE INDEX release_idx_name ON musicbrainz.release (name); +CREATE INDEX release_idx_release_group ON musicbrainz.release (release_group); +CREATE INDEX release_idx_artist_credit ON musicbrainz.release (artist_credit); + +CREATE UNIQUE INDEX track_idx_gid ON musicbrainz.track (gid); +CREATE INDEX track_idx_recording ON musicbrainz.track (recording); +CREATE INDEX track_idx_name ON musicbrainz.track (name); +CREATE INDEX track_idx_artist_credit ON musicbrainz.track (artist_credit); + +CREATE INDEX artist_gid_redirect_idx_new_id ON musicbrainz.artist_gid_redirect (new_id); + +CREATE INDEX recording_gid_redirect_idx_new_id ON musicbrainz.recording_gid_redirect (new_id); + +CREATE INDEX release_gid_redirect_idx_new_id ON musicbrainz.release_gid_redirect (new_id); + +CREATE INDEX release_group_gid_redirect_idx_new_id ON musicbrainz.release_group_gid_redirect (new_id); + +CREATE INDEX track_gid_redirect_idx_new_id ON musicbrainz.track_gid_redirect (new_id); + +CREATE UNIQUE INDEX release_group_idx_gid ON musicbrainz.release_group (gid); +CREATE INDEX release_group_idx_name ON musicbrainz.release_group (name); +CREATE INDEX release_group_idx_artist_credit ON musicbrainz.release_group (artist_credit); + +CREATE INDEX medium_idx_track_count ON musicbrainz.medium (track_count); + +CREATE UNIQUE INDEX 
medium_format_idx_gid ON musicbrainz.medium_format (gid); + +CREATE UNIQUE INDEX release_status_idx_gid ON musicbrainz.release_status (gid); + +CREATE UNIQUE INDEX language_idx_iso_code_2b ON musicbrainz.language (iso_code_2b); +CREATE UNIQUE INDEX language_idx_iso_code_2t ON musicbrainz.language (iso_code_2t); +CREATE UNIQUE INDEX language_idx_iso_code_1 ON musicbrainz.language (iso_code_1); +CREATE UNIQUE INDEX language_idx_iso_code_3 ON musicbrainz.language (iso_code_3); + +CREATE UNIQUE INDEX release_packaging_idx_gid ON musicbrainz.release_packaging (gid); + +CREATE UNIQUE INDEX script_idx_iso_code ON musicbrainz.script (iso_code); + +CREATE UNIQUE INDEX gender_idx_gid ON musicbrainz.gender (gid); + +CREATE UNIQUE INDEX artist_type_idx_gid ON musicbrainz.artist_type (gid); + +COMMIT; diff --git a/admin/sql/create_musicbrainz_primary_keys.sql b/admin/sql/create_musicbrainz_primary_keys.sql new file mode 100644 index 000000000..ec595879e --- /dev/null +++ b/admin/sql/create_musicbrainz_primary_keys.sql @@ -0,0 +1,27 @@ +BEGIN; + +ALTER TABLE musicbrainz.artist ADD CONSTRAINT artist_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.artist_credit ADD CONSTRAINT artist_credit_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.artist_credit_name ADD CONSTRAINT artist_credit_name_pkey PRIMARY KEY (artist_credit, position); +ALTER TABLE musicbrainz.artist_gid_redirect ADD CONSTRAINT artist_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.area ADD CONSTRAINT area_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.area_type ADD CONSTRAINT area_type_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.recording ADD CONSTRAINT recording_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.recording_gid_redirect ADD CONSTRAINT recording_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.release ADD CONSTRAINT release_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_gid_redirect ADD CONSTRAINT release_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE 
musicbrainz.track ADD CONSTRAINT track_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.track_gid_redirect ADD CONSTRAINT track_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.release_group ADD CONSTRAINT release_group_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_group_gid_redirect ADD CONSTRAINT release_group_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.medium ADD CONSTRAINT medium_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.medium_format ADD CONSTRAINT medium_format_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_status ADD CONSTRAINT release_status_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_group_primary_type ADD CONSTRAINT release_group_primary_type_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.language ADD CONSTRAINT language_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_packaging ADD CONSTRAINT release_packaging_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.script ADD CONSTRAINT script_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.gender ADD CONSTRAINT gender_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.artist_type ADD CONSTRAINT artist_type_pkey PRIMARY KEY (id); + +COMMIT; diff --git a/admin/sql/create_musicbrainz_schema.sql b/admin/sql/create_musicbrainz_schema.sql new file mode 100644 index 000000000..2bbfa59fc --- /dev/null +++ b/admin/sql/create_musicbrainz_schema.sql @@ -0,0 +1,7 @@ +-- Create the musicbrainz schema. 
+ +BEGIN; + +CREATE SCHEMA IF NOT EXISTS musicbrainz; + +COMMIT; diff --git a/admin/sql/create_musicbrainz_tables.sql b/admin/sql/create_musicbrainz_tables.sql new file mode 100644 index 000000000..706b338db --- /dev/null +++ b/admin/sql/create_musicbrainz_tables.sql @@ -0,0 +1,275 @@ +BEGIN; + +CREATE TABLE musicbrainz.artist ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + sort_name VARCHAR NOT NULL, + begin_date_year SMALLINT, + begin_date_month SMALLINT, + begin_date_day SMALLINT, + end_date_year SMALLINT, + end_date_month SMALLINT, + end_date_day SMALLINT, + type INTEGER, -- references artist_type.id + area INTEGER, -- references area.id + gender INTEGER, -- references gender.id + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + ended BOOLEAN NOT NULL DEFAULT FALSE + CONSTRAINT artist_ended_check CHECK ( + ( + -- If any end date fields are not null, then ended must be true + (end_date_year IS NOT NULL OR + end_date_month IS NOT NULL OR + end_date_day IS NOT NULL) AND + ended = TRUE + ) OR ( + -- Otherwise, all end date fields must be null + (end_date_year IS NULL AND + end_date_month IS NULL AND + end_date_day IS NULL) + ) + ), + begin_area INTEGER, -- references area.id + end_area INTEGER -- references area.id +); + +CREATE TABLE musicbrainz.artist_credit ( + id SERIAL, + name VARCHAR NOT NULL, + artist_count SMALLINT NOT NULL, + ref_count INTEGER DEFAULT 0, + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.artist_credit_name ( + artist_credit INTEGER NOT NULL, -- PK, references artist_credit.id CASCADE + position SMALLINT NOT NULL, -- PK + artist INTEGER NOT NULL, -- references artist.id CASCADE + name VARCHAR NOT NULL, + join_phrase TEXT NOT NULL DEFAULT '' +); + +CREATE TABLE musicbrainz.artist_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references artist.id + 
created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.area ( + id SERIAL, -- PK + gid uuid NOT NULL, + name VARCHAR NOT NULL, + type INTEGER, -- references area_type.id + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >=0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + begin_date_year SMALLINT, + begin_date_month SMALLINT, + begin_date_day SMALLINT, + end_date_year SMALLINT, + end_date_month SMALLINT, + end_date_day SMALLINT, + ended BOOLEAN NOT NULL DEFAULT FALSE + CHECK ( + ( + -- If any end date fields are not null, then ended must be true + (end_date_year IS NOT NULL OR + end_date_month IS NOT NULL OR + end_date_day IS NOT NULL) AND + ended = TRUE + ) OR ( + -- Otherwise, all end date fields must be null + (end_date_year IS NULL AND + end_date_month IS NULL AND + end_date_day IS NULL) + ) + ), + comment VARCHAR(255) NOT NULL DEFAULT '' +); + +CREATE TABLE musicbrainz.area_type ( + id SERIAL, -- PK + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references area_type.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.recording ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + length INTEGER CHECK (length IS NULL OR length > 0), + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + video BOOLEAN NOT NULL DEFAULT FALSE +); + + +CREATE TABLE musicbrainz.recording_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references recording.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + release_group INTEGER NOT NULL, -- references release_group.id + status INTEGER, -- 
references release_status.id + packaging INTEGER, -- references release_packaging.id + language INTEGER, -- references language.id + script INTEGER, -- references script.id + barcode VARCHAR(255), + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + quality SMALLINT NOT NULL DEFAULT -1, + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references release.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.track ( + id SERIAL, + gid UUID NOT NULL, + recording INTEGER NOT NULL, -- references recording.id + medium INTEGER NOT NULL, -- references medium.id + position INTEGER NOT NULL, + number TEXT NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + length INTEGER CHECK (length IS NULL OR length > 0), + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + is_data_track BOOLEAN NOT NULL DEFAULT FALSE +); + +CREATE TABLE musicbrainz.track_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references track.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release_group ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + type INTEGER, -- references release_group_primary_type.id + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release_group_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references release_group.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.medium ( + id SERIAL, + release INTEGER NOT NULL, 
-- references release.id + position INTEGER NOT NULL, + format INTEGER, -- references medium_format.id + name VARCHAR NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + track_count INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE musicbrainz.medium_format ( + id SERIAL, + name VARCHAR(100) NOT NULL, + parent INTEGER, -- references medium_format.id + child_order INTEGER NOT NULL DEFAULT 0, + year SMALLINT, + has_discids BOOLEAN NOT NULL DEFAULT FALSE, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.release_status ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references release_status.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.release_group_primary_type ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references release_group_primary_type.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.language ( + id SERIAL, + iso_code_2t CHAR(3), -- ISO 639-2 (T) + iso_code_2b CHAR(3), -- ISO 639-2 (B) + iso_code_1 CHAR(2), -- ISO 639 + name VARCHAR(100) NOT NULL, + frequency INTEGER NOT NULL DEFAULT 0, + iso_code_3 CHAR(3) -- ISO 639-3 +); +ALTER TABLE musicbrainz.language + ADD CONSTRAINT iso_code_check + CHECK (iso_code_2t IS NOT NULL OR iso_code_3 IS NOT NULL); + +CREATE TABLE musicbrainz.release_packaging ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references release_packaging.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.script ( + id SERIAL, + iso_code CHAR(4) NOT NULL, -- ISO 15924 + iso_number CHAR(3) NOT NULL, -- ISO 15924 + name VARCHAR(100) NOT NULL, + frequency INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE musicbrainz.gender ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent 
INTEGER, -- references gender.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.artist_type ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references artist_type.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.replication_control ( + id SERIAL, + current_replication_sequence INTEGER, + last_replication_date TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +COMMIT; diff --git a/admin/sql/drop_musicbrainz_schema.sql b/admin/sql/drop_musicbrainz_schema.sql new file mode 100644 index 000000000..f12ed0757 --- /dev/null +++ b/admin/sql/drop_musicbrainz_schema.sql @@ -0,0 +1 @@ +DROP SCHEMA IF EXISTS musicbrainz CASCADE; diff --git a/admin/updates/20180525-musicbrainz-schema.sql b/admin/updates/20180525-musicbrainz-schema.sql new file mode 100644 index 000000000..cc1bb328a --- /dev/null +++ b/admin/updates/20180525-musicbrainz-schema.sql @@ -0,0 +1,531 @@ +BEGIN; + +CREATE SCHEMA IF NOT EXISTS musicbrainz; + +CREATE TABLE musicbrainz.artist ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + sort_name VARCHAR NOT NULL, + begin_date_year SMALLINT, + begin_date_month SMALLINT, + begin_date_day SMALLINT, + end_date_year SMALLINT, + end_date_month SMALLINT, + end_date_day SMALLINT, + type INTEGER, -- references artist_type.id + area INTEGER, -- references area.id + gender INTEGER, -- references gender.id + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + ended BOOLEAN NOT NULL DEFAULT FALSE + CONSTRAINT artist_ended_check CHECK ( + ( + -- If any end date fields are not null, then ended must be true + (end_date_year IS NOT NULL OR + end_date_month IS NOT NULL OR + end_date_day IS NOT NULL) AND + ended = TRUE + ) OR ( + -- Otherwise, all end date fields must be null + (end_date_year IS NULL AND + 
end_date_month IS NULL AND + end_date_day IS NULL) + ) + ), + begin_area INTEGER, -- references area.id + end_area INTEGER -- references area.id +); + +CREATE TABLE musicbrainz.artist_credit ( + id SERIAL, + name VARCHAR NOT NULL, + artist_count SMALLINT NOT NULL, + ref_count INTEGER DEFAULT 0, + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.artist_credit_name ( + artist_credit INTEGER NOT NULL, -- PK, references artist_credit.id CASCADE + position SMALLINT NOT NULL, -- PK + artist INTEGER NOT NULL, -- references artist.id CASCADE + name VARCHAR NOT NULL, + join_phrase TEXT NOT NULL DEFAULT '' +); + +CREATE TABLE musicbrainz.artist_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references artist.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.area ( + id SERIAL, -- PK + gid uuid NOT NULL, + name VARCHAR NOT NULL, + type INTEGER, -- references area_type.id + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >=0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + begin_date_year SMALLINT, + begin_date_month SMALLINT, + begin_date_day SMALLINT, + end_date_year SMALLINT, + end_date_month SMALLINT, + end_date_day SMALLINT, + ended BOOLEAN NOT NULL DEFAULT FALSE + CHECK ( + ( + -- If any end date fields are not null, then ended must be true + (end_date_year IS NOT NULL OR + end_date_month IS NOT NULL OR + end_date_day IS NOT NULL) AND + ended = TRUE + ) OR ( + -- Otherwise, all end date fields must be null + (end_date_year IS NULL AND + end_date_month IS NULL AND + end_date_day IS NULL) + ) + ), + comment VARCHAR(255) NOT NULL DEFAULT '' +); + +CREATE TABLE musicbrainz.area_type ( + id SERIAL, -- PK + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references area_type.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.recording ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + 
artist_credit INTEGER NOT NULL, -- references artist_credit.id + length INTEGER CHECK (length IS NULL OR length > 0), + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + video BOOLEAN NOT NULL DEFAULT FALSE +); + + +CREATE TABLE musicbrainz.recording_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references recording.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + release_group INTEGER NOT NULL, -- references release_group.id + status INTEGER, -- references release_status.id + packaging INTEGER, -- references release_packaging.id + language INTEGER, -- references language.id + script INTEGER, -- references script.id + barcode VARCHAR(255), + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + quality SMALLINT NOT NULL DEFAULT -1, + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references release.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.track ( + id SERIAL, + gid UUID NOT NULL, + recording INTEGER NOT NULL, -- references recording.id + medium INTEGER NOT NULL, -- references medium.id + position INTEGER NOT NULL, + number TEXT NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + length INTEGER CHECK (length IS NULL OR length > 0), + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + is_data_track BOOLEAN NOT NULL DEFAULT FALSE +); + +CREATE TABLE musicbrainz.track_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id 
INTEGER NOT NULL, -- references track.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release_group ( + id SERIAL, + gid UUID NOT NULL, + name VARCHAR NOT NULL, + artist_credit INTEGER NOT NULL, -- references artist_credit.id + type INTEGER, -- references release_group_primary_type.id + comment VARCHAR(255) NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.release_group_gid_redirect ( + gid UUID NOT NULL, -- PK + new_id INTEGER NOT NULL, -- references release_group.id + created TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +CREATE TABLE musicbrainz.medium ( + id SERIAL, + release INTEGER NOT NULL, -- references release.id + position INTEGER NOT NULL, + format INTEGER, -- references medium_format.id + name VARCHAR NOT NULL DEFAULT '', + edits_pending INTEGER NOT NULL DEFAULT 0 CHECK (edits_pending >= 0), + last_updated TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + track_count INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE musicbrainz.medium_format ( + id SERIAL, + name VARCHAR(100) NOT NULL, + parent INTEGER, -- references medium_format.id + child_order INTEGER NOT NULL DEFAULT 0, + year SMALLINT, + has_discids BOOLEAN NOT NULL DEFAULT FALSE, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.release_status ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references release_status.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.release_group_primary_type ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references release_group_primary_type.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.language ( + id SERIAL, + iso_code_2t CHAR(3), -- ISO 639-2 (T) + iso_code_2b CHAR(3), -- ISO 639-2 (B) + iso_code_1 CHAR(2), -- ISO 639 
+ name VARCHAR(100) NOT NULL, + frequency INTEGER NOT NULL DEFAULT 0, + iso_code_3 CHAR(3) -- ISO 639-3 +); +ALTER TABLE musicbrainz.language + ADD CONSTRAINT iso_code_check + CHECK (iso_code_2t IS NOT NULL OR iso_code_3 IS NOT NULL); + +CREATE TABLE musicbrainz.release_packaging ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references release_packaging.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.script ( + id SERIAL, + iso_code CHAR(4) NOT NULL, -- ISO 15924 + iso_number CHAR(3) NOT NULL, -- ISO 15924 + name VARCHAR(100) NOT NULL, + frequency INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE musicbrainz.gender ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references gender.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +CREATE TABLE musicbrainz.artist_type ( + id SERIAL, + name VARCHAR(255) NOT NULL, + parent INTEGER, -- references artist_type.id + child_order INTEGER NOT NULL DEFAULT 0, + description TEXT, + gid uuid NOT NULL +); + +ALTER TABLE musicbrainz.artist ADD CONSTRAINT artist_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.artist_credit ADD CONSTRAINT artist_credit_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.artist_credit_name ADD CONSTRAINT artist_credit_name_pkey PRIMARY KEY (artist_credit, position); +ALTER TABLE musicbrainz.artist_gid_redirect ADD CONSTRAINT artist_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.area ADD CONSTRAINT area_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.area_type ADD CONSTRAINT area_type_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.recording ADD CONSTRAINT recording_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.recording_gid_redirect ADD CONSTRAINT recording_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.release ADD CONSTRAINT release_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_gid_redirect ADD CONSTRAINT 
release_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.track ADD CONSTRAINT track_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.track_gid_redirect ADD CONSTRAINT track_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.release_group ADD CONSTRAINT release_group_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_group_gid_redirect ADD CONSTRAINT release_group_gid_redirect_pkey PRIMARY KEY (gid); +ALTER TABLE musicbrainz.medium ADD CONSTRAINT medium_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.medium_format ADD CONSTRAINT medium_format_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_status ADD CONSTRAINT release_status_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_group_primary_type ADD CONSTRAINT release_group_primary_type_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.language ADD CONSTRAINT language_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.release_packaging ADD CONSTRAINT release_packaging_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.script ADD CONSTRAINT script_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.gender ADD CONSTRAINT gender_pkey PRIMARY KEY (id); +ALTER TABLE musicbrainz.artist_type ADD CONSTRAINT artist_type_pkey PRIMARY KEY (id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_type + FOREIGN KEY (type) + REFERENCES musicbrainz.artist_type(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_area + FOREIGN KEY (area) + REFERENCES musicbrainz.area(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_gender + FOREIGN KEY (gender) + REFERENCES musicbrainz.gender(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_begin_area + FOREIGN KEY (begin_area) + REFERENCES musicbrainz.area(id); + +ALTER TABLE musicbrainz.artist + ADD CONSTRAINT artist_fk_end_area + FOREIGN KEY (end_area) + REFERENCES musicbrainz.area(id); + +ALTER TABLE musicbrainz.artist_credit_name + ADD CONSTRAINT artist_credit_name_fk_artist_credit + FOREIGN KEY 
(artist_credit) + REFERENCES musicbrainz.artist_credit(id) + ON DELETE CASCADE; + +ALTER TABLE musicbrainz.artist_credit_name + ADD CONSTRAINT artist_credit_name_fk_artist + FOREIGN KEY (artist) + REFERENCES musicbrainz.artist(id) + ON DELETE CASCADE; + +ALTER TABLE musicbrainz.artist_gid_redirect + ADD CONSTRAINT artist_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.artist(id); + +ALTER TABLE musicbrainz.area + ADD CONSTRAINT area_fk_type + FOREIGN KEY (type) + REFERENCES musicbrainz.area_type(id); + +ALTER TABLE musicbrainz.area_type + ADD CONSTRAINT area_type_fk_parent + FOREIGN KEY (parent) +REFERENCES musicbrainz.area_type(id); + +ALTER TABLE musicbrainz.recording + ADD CONSTRAINT recording_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id); + +ALTER TABLE musicbrainz.recording_gid_redirect + ADD CONSTRAINT recording_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.recording(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_artist_credit + FOREIGN KEY (artist_credit) + REFERENCES musicbrainz.artist_credit(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_release_group + FOREIGN KEY (release_group) + REFERENCES musicbrainz.release_group(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_status + FOREIGN KEY (status) + REFERENCES musicbrainz.release_status(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_packaging + FOREIGN KEY (packaging) + REFERENCES musicbrainz.release_packaging(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_language + FOREIGN KEY (language) + REFERENCES musicbrainz.language(id); + +ALTER TABLE musicbrainz.release + ADD CONSTRAINT release_fk_script + FOREIGN KEY (script) + REFERENCES musicbrainz.script(id); + +ALTER TABLE musicbrainz.release_gid_redirect + ADD CONSTRAINT release_gid_redirect_fk_new_id + FOREIGN KEY (new_id) + REFERENCES musicbrainz.release(id); + 
-- Self-referencing foreign key: a primary type's parent is another row of the
-- same table. The referenced column is spelled out like every other foreign
-- key in this file instead of relying on the implicit primary-key default.
ALTER TABLE musicbrainz.release_group_primary_type
  ADD CONSTRAINT release_group_primary_type_fk_parent
  FOREIGN KEY (parent)
  REFERENCES musicbrainz.release_group_primary_type(id);
-- Indexes on musicbrainz.area: a unique index on gid and a plain index on name.
CREATE UNIQUE INDEX area_idx_gid ON musicbrainz.area (gid);
-- This statement needs its own terminating semicolon; without it the parser
-- reads it together with the following CREATE INDEX and the script fails.
CREATE INDEX area_idx_name ON musicbrainz.area (name);
release_group_gid_redirect_idx_new_id ON musicbrainz.release_group_gid_redirect (new_id); + +CREATE INDEX track_gid_redirect_idx_new_id ON musicbrainz.track_gid_redirect (new_id); + +CREATE UNIQUE INDEX release_group_idx_gid ON musicbrainz.release_group (gid); +CREATE INDEX release_group_idx_name ON musicbrainz.release_group (name); +CREATE INDEX release_group_idx_artist_credit ON musicbrainz.release_group (artist_credit); + +CREATE INDEX medium_idx_track_count ON musicbrainz.medium (track_count); + +CREATE INDEX artist_idx_begin_area ON musicbrainz.artist (begin_area); +CREATE INDEX artist_idx_end_area ON musicbrainz.artist (end_area); + +CREATE UNIQUE INDEX area_type_idx_gid ON musicbrainz.area_type (gid); + +CREATE UNIQUE INDEX medium_format_idx_gid ON musicbrainz.medium_format (gid); + +CREATE UNIQUE INDEX release_status_idx_gid ON musicbrainz.release_status (gid); + +CREATE UNIQUE INDEX language_idx_iso_code_2b ON musicbrainz.language (iso_code_2b); +CREATE UNIQUE INDEX language_idx_iso_code_2t ON musicbrainz.language (iso_code_2t); +CREATE UNIQUE INDEX language_idx_iso_code_1 ON musicbrainz.language (iso_code_1); +CREATE UNIQUE INDEX language_idx_iso_code_3 ON musicbrainz.language (iso_code_3); + +CREATE UNIQUE INDEX release_packaging_idx_gid ON musicbrainz.release_packaging (gid); + +CREATE UNIQUE INDEX script_idx_iso_code ON musicbrainz.script (iso_code); + +CREATE UNIQUE INDEX gender_idx_gid ON musicbrainz.gender (gid); + +CREATE UNIQUE INDEX artist_type_idx_gid ON musicbrainz.artist_type (gid); + +COMMIT; diff --git a/config.py.example b/config.py.example index 2a202d6a5..9354e5781 100644 --- a/config.py.example +++ b/config.py.example @@ -11,6 +11,9 @@ SECRET_KEY = "CHANGE_ME" # Primary database SQLALCHEMY_DATABASE_URI = "postgresql://acousticbrainz@db/acousticbrainz" +# MusicBrainz Database +MB_DATABASE_URI = "postgresql://musicbrainz:musicbrainz@musicbrainz_db:5432/musicbrainz_db" + # URI to connect to an empty database as the superuser 
def get_new_recordings_from_lowlevel():
    """Fetch one batch of lowlevel gids that have no row in musicbrainz.recording.

    The batch size is read from the ``RECORDINGS_FETCHED_PER_BATCH`` setting of
    the current Flask application, and rows are taken in ``lowlevel.id`` order.

    Returns:
        List of gid values (first column of every fetched row).
    """
    # LEFT JOIN + "IS NULL" keeps only the lowlevel rows without a matching recording
    statement = text("""SELECT lowlevel.gid
                          FROM lowlevel
                     LEFT JOIN musicbrainz.recording
                            ON lowlevel.gid = musicbrainz.recording.gid
                         WHERE musicbrainz.recording.gid is NULL
                      ORDER BY lowlevel.id
                         LIMIT :rows_to_fetch
    """)
    with db.engine.begin() as connection:
        batch_size = current_app.config['RECORDINGS_FETCHED_PER_BATCH']
        rows = connection.execute(statement, {"rows_to_fetch": batch_size}).fetchall()
        return [row[0] for row in rows]
def get_replication_sequence_from_mb_schema():
    """Fetch the replication sequence stored in the musicbrainz schema of the
    AB database.

    Reads ``current_replication_sequence`` from the first row returned for
    ``musicbrainz.replication_control``.

    Returns:
        The stored replication sequence number, or None when the
        replication_control table has no row yet.
    """
    with db.engine.begin() as connection:
        query = text("""
            SELECT current_replication_sequence
              FROM musicbrainz.replication_control
        """)
        result = connection.execute(query)
        sequence = result.fetchone()

    # fetchone() gives None for an empty table; indexing that would raise
    # TypeError, so report "nothing stored" explicitly instead.
    if sequence is None:
        return None
    return sequence[0]
def get_mbids_from_gid_redirect_tables_from_MB_db():
    """Fetch all redirected recording MBIDs from the MusicBrainz database.

    Reads the ``gid`` column of ``musicbrainz.recording_gid_redirect`` over the
    direct MusicBrainz connection (``musicbrainz_db.engine``). The redirects
    are not resolved here; only the redirected gids themselves are returned.

    Returns:
        List with the gid of every row, each converted to a string.
    """
    with musicbrainz_db.engine.begin() as connection:
        query = text("""
            SELECT gid
              FROM musicbrainz.recording_gid_redirect
        """)
        result = connection.execute(query)
        mbids = result.fetchall()

    # Every row has a single column; normalise it to a plain string
    return [str(mbid[0]) for mbid in mbids]
def load_recording_data_from_MB_db(lowlevel_data):
    """Fetch recording rows from the MusicBrainz database over the direct
    connection for the given gids.

    Args:
        lowlevel_data: iterable of gids of the data present in lowlevel table.

    Returns:
        List of rows of the recording table whose gid is one of the given
        gids. An empty list is returned, without querying, when no gids are
        given.
    """
    gids = tuple(lowlevel_data)
    if not gids:
        # An empty tuple is rendered as "IN ()", which PostgreSQL rejects as
        # a syntax error; nothing can match anyway.
        return []

    with musicbrainz_db.engine.begin() as connection:
        query = text("""
            SELECT *
              FROM recording
             WHERE recording.gid in :gids
        """)
        result = connection.execute(query, {"gids": gids})
        rec_data = result.fetchall()
    return rec_data
def load_lowlevel_data_from_dataset(dataset_id):
    """Fetch lowlevel rows joined with their musicbrainz.recording rows for
    the members of one dataset.

    A lowlevel row is selected when its gid matches a recording gid and is
    the mbid of a class member of the dataset. At most 10000 rows are
    returned.

    Args:
        dataset_id: value compared against dataset.id.

    Returns:
        List of rows holding all lowlevel columns followed by all
        musicbrainz.recording columns.
    """
    statement = text("""
        SELECT lowlevel.*, musicbrainz.recording.*
          FROM lowlevel
    INNER JOIN musicbrainz.recording
            ON musicbrainz.recording.gid = lowlevel.gid
    INNER JOIN dataset_class_member
            ON dataset_class_member.mbid = lowlevel.gid
    INNER JOIN dataset_class
            ON dataset_class.id = dataset_class_member.class
    INNER JOIN dataset
            ON dataset.id = dataset_class.dataset
         WHERE dataset.id = :id
         LIMIT 10000
    """)
    with db.engine.begin() as connection:
        rows = connection.execute(statement, {"id": dataset_id})
        return rows.fetchall()
def load_musicbrainz_schema_data(connection, table_name):
    """General function to load all the data from the specified
    musicbrainz schema table name.

    Args:
        connection: database connection to execute the query.
        table_name: Name of the table from musicbrainz schema. It is formatted
            straight into the SQL text (a table name cannot be a bind
            parameter), so it must come from trusted code.

    Returns:
        List of all rows of the specified table.
    """
    query = text("""SELECT * FROM musicbrainz.{table_name}""".format(table_name=table_name))
    # The statement has no placeholders, so no bind values are passed
    result = connection.execute(query)
    return result.fetchall()


def join_columns(columns):
    """Build the placeholder list used in the VALUES part of an insert query.

    Every column name gets a colon prefix and the results are joined with
    commas, e.g. ``['id', 'gid']`` becomes ``':id,:gid'``.

    Args:
        columns: A list of all the columns of any table. It is left unchanged.

    Returns:
        A string of colon-prefixed column names separated by commas; an empty
        string when there are no columns.
    """
    # Build new strings rather than rewriting columns[0] in place: callers
    # reuse the same list for the column list of the query.
    return ','.join(':' + column for column in columns)


def insert_data_into_musicbrainz_schema(connection, transaction, table_name, columns, values):
    """Insert data into musicbrainz schema tables whose table_name, column names and
    data values are specified, then commit the given transaction.

    Args:
        connection: database connection to execute the query.
        transaction: transaction for every write operation; committed after
            the insert has been executed.
        table_name: Name of the table to apply the insert query on.
        columns: Name of all the columns of the given table.
        values: Data values of the rows to insert into the tables. Passed to
            ``connection.execute`` as the bind values of the ``:column``
            placeholders, so it is expected to be keyed by column name
            (TODO confirm against callers).
    """
    # Table and column names cannot be bound, so they are formatted into the
    # statement; only the row values travel as bind parameters.
    query = text("""
        INSERT INTO musicbrainz.{table_name} ({columns})
             VALUES ({column_values})
    """.format(table_name=table_name,
               columns=','.join(columns),
               column_values=join_columns(columns)))

    connection.execute(query, values)
    transaction.commit()


def get_data_from_musicbrainz(table_name, data, column='id'):
    """Fetch data from main MusicBrainz database for the given column name,
    data value and table name.

    Args:
        table_name: Table name whose data is to be fetched.
        data: data value whose corresponding row is fetched.
        column: Column name the value is compared with. Take default as 'id'
            if not specified.

    Returns:
        Tuple of the table name, the list of column names of the fetched row
        and a dictionary mapping those column names to the row's values.

    Raises:
        TypeError: if no row matches (``fetchone()`` returns None).
    """
    with musicbrainz_db.engine.begin() as connection:
        # Identifiers (table and column) cannot be bind parameters; binding
        # them would quote them as string literals. They are formatted in and
        # must come from trusted code; only the value is bound.
        query = text("""
            SELECT *
              FROM {table_name}
             WHERE {column} = :data
        """.format(table_name=table_name, column=column))

        result = connection.execute(query, {'data': data})
        values = dict(result.fetchone())

    columns = list(values)
    return table_name, columns, values
+ MB_track_data (of type - sqlalchemy.resultproxy): data retrieved from the track table of the + MusicBrainz database (should contain artist credit values). + MB_artist_credit_name_data (of type - sqlalchemy.resultproxy): data retrieved from the artist_credit_name + table of the MusicBrainz database (should contain artist credit values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + artist_credit data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to artist_credit column in release table + MB_release_fk_artist_credit = list({value['artist_credit'] for value in MB_release_data}) + + # Get data corresponding to artist_credit column in release_group table + MB_release_group_fk_artist_credit = list({value['artist_credit'] for value in MB_release_group_data}) + + # Get data corresponding to artist_credit column in track table + MB_track_fk_artist_credit = list({value['artist_credit'] for value in MB_track_data}) + + # Get data corresponding to artist_credit column in artist_credit_name table + MB_artist_credit_name_fk_artist_credit = list({value['artist_credit'] for value in MB_artist_credit_name_data}) + + if artist_credit_from_recording: + filters.append("artist_credit.id in :ids") + filter_data["ids"] = tuple(artist_credit_from_recording) + + if MB_release_data: + filters.append("artist_credit.id in :release_data") + filter_data["release_data"] = tuple(MB_release_fk_artist_credit) + + if MB_release_group_data: + filters.append("artist_credit.id in :release_group_data") + filter_data["release_group_data"] = tuple(MB_release_group_fk_artist_credit) + + if MB_track_data: + filters.append("artist_credit.id in :track_data") + filter_data["track_data"] = tuple(MB_track_fk_artist_credit) + + if MB_artist_credit_name_data: + filters.append("artist_credit.id in :artist_credit_name_data") + 
def _load_area_type_for_artist_area(connection, artist_credit_from_recording, artist_area_column):
    """Fetch the area_type rows of the areas stored in one area column of the
    artists that belong to the given artist credits.

    Args:
        connection: database connection to execute the query.
        artist_credit_from_recording: list of artist_credit ids.
        artist_area_column: name of the artist column holding the area id
            ('begin_area' or 'end_area'). It is formatted into the SQL text,
            so only trusted constants may be passed.

    Returns:
        List of distinct area_type rows; an empty list, without querying,
        when no artist credits are given.
    """
    artist_credits = tuple(artist_credit_from_recording)
    if not artist_credits:
        # An empty tuple is rendered as "IN ()", a syntax error in PostgreSQL
        return []

    # Artists are tied to an artist credit through artist_credit_name
    # (artist_credit_name.artist -> artist.id and
    # artist_credit_name.artist_credit -> artist_credit.id); artist.id and
    # artist_credit.id are unrelated keys and must not be compared directly.
    area_type_query = text("""
        SELECT DISTINCT area_type.id,
               area_type.name,
               area_type.parent,
               area_type.child_order,
               area_type.description,
               area_type.gid
          FROM area_type
    INNER JOIN area
            ON area.type = area_type.id
    INNER JOIN artist
            ON area.id = artist.{artist_area_column}
    INNER JOIN artist_credit_name
            ON artist_credit_name.artist = artist.id
         WHERE artist_credit_name.artist_credit in :data
    """.format(artist_area_column=artist_area_column))
    result = connection.execute(area_type_query, {'data': artist_credits})
    return result.fetchall()


def load_begin_area_type(connection, artist_credit_from_recording):
    """Fetch area_type table data from MusicBrainz database for the
    recording MBIDs in AcousticBrainz database for the begin area column
    in artist table.

    Args:
        connection: database connection to execute the query.
        artist_credit_from_recording: list of artist_credit data from recording data retrieved from
        recording table of MusicBrainz database.
    Returns:
        begin_area_type data fetched from MusicBrainz database (empty list
        when no artist credits are given).
    """
    return _load_area_type_for_artist_area(connection, artist_credit_from_recording, 'begin_area')


def load_end_area_type(connection, artist_credit_from_recording):
    """Fetch area_type table data from MusicBrainz database for the
    recording MBIDs in AcousticBrainz database for the end area column in
    artist table.

    Args:
        connection: database connection to execute the query.
        artist_credit_from_recording: list of artist_credit data from recording data retrieved from
        recording table of MusicBrainz database.
    Returns:
        end_area_type data fetched from MusicBrainz database (empty list
        when no artist credits are given).
    """
    return _load_area_type_for_artist_area(connection, artist_credit_from_recording, 'end_area')
def load_language(connection, MB_release_data, artist_credit_from_recording):
    """Fetch the language rows used by releases of the MusicBrainz database.

    A language is selected when a release using it carries one of the given
    artist credits, or when its id is one of the language values of the
    release rows passed in. Without any of the two inputs no WHERE clause is
    added at all.

    Args:
        connection: database connection to execute the query.
        MB_release_data (of type - sqlalchemy.resultproxy): rows of the
            release table of the MusicBrainz database; each must provide a
            'language' value.
        artist_credit_from_recording: list of artist_credit values taken from
            the recording table of the MusicBrainz database.
    Returns:
        List of distinct language rows fetched from MusicBrainz database.
    """
    # Distinct language ids referenced by the release rows
    release_languages = tuple({row['language'] for row in MB_release_data})

    conditions = []
    bind_values = {}
    if artist_credit_from_recording:
        conditions.append("release.artist_credit in :ids")
        bind_values["ids"] = tuple(artist_credit_from_recording)
    if MB_release_data:
        conditions.append("language.id in :data")
        bind_values["data"] = release_languages

    where_clause = " WHERE " + " OR ".join(conditions) if conditions else ""

    language_query = text("""
        SELECT DISTINCT language.id,
               language.iso_code_2t,
               language.iso_code_2b,
               language.iso_code_1,
               language.name,
               language.frequency,
               language.iso_code_3
          FROM language
    INNER JOIN release
            ON release.language = language.id
          {filterstr}
    """.format(filterstr=where_clause)
    )
    return connection.execute(language_query, bind_values).fetchall()
+ """ + filters = [] + filter_data = {} + + # Get data corresponding to language column in release table + MB_release_fk_script = list({value['script'] for value in MB_release_data}) + + if artist_credit_from_recording: + filters.append("release.artist_credit in :ids") + filter_data["ids"] = tuple(artist_credit_from_recording) + + if MB_release_data: + filters.append("script.id in :data") + filter_data["data"] = tuple(MB_release_fk_script) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + script_query = text(""" + SELECT DISTINCT script.id, + script.iso_code, + script.iso_number, + script.name, + script.frequency + FROM script + INNER JOIN release + ON release.script = script.id + {filterstr} + """.format(filterstr=filterstr) + ) + result = connection.execute(script_query, filter_data) + MB_script_data = result.fetchall() + + return MB_script_data + + +def load_gender(connection): + """ Fetch gender table data from MusicBrainz database for the recording MBIDs in + AcousticBrainz database. Retrieving complete data because the rows in MusicBrainz + database for this table are much less in number. + + Args: + connection: database connection to execute the query. + Returns: + gender data fetched from MusicBrainz database. + """ + gender_query = text(""" + SELECT * + FROM gender + ORDER BY id + """) + result = connection.execute(gender_query) + MB_gender_data = result.fetchall() + + return MB_gender_data + + +def load_area(connection, MB_artist_data, artist_credit_from_recording): + """ Fetch area table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Also fetch data corresponding to artist table. + + Args: + connection: database connection to execute the query. + MB_artist_data (of type - sqlalchemy.resultproxy): data retrieved from the + artist table of the MusicBrainz database (should contain area values). 
+ artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + area data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to area column in artist table + MB_artist_fk_area = list({value['area'] for value in MB_artist_data}) + + if artist_credit_from_recording: + filters.append("artist_credit.id in :credit") + filter_data["credit"] = tuple(artist_credit_from_recording) + + if MB_artist_data: + filters.append("area.id in :data") + filter_data["data"] = tuple(MB_artist_fk_area) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + area_query = text(""" + SELECT DISTINCT area.id, + area.gid, + area.name, + area.type, + area.edits_pending, + area.last_updated, + area.begin_date_year, + area.begin_date_month, + area.begin_date_day, + area.end_date_year, + area.end_date_month, + area.end_date_day, + area.ended, + area.comment + FROM area + INNER JOIN artist + ON area.id = artist.area + INNER JOIN artist_credit + ON artist.id = artist_credit.id + {filterstr} + """.format(filterstr=filterstr) + ) + + result = connection.execute(area_query, filter_data) + MB_area_data = result.fetchall() + + return MB_area_data + + +def load_begin_area(connection, MB_artist_data, artist_credit_from_recording): + """Fetch area table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database for begin area column. + + Also fetch data corresponding to artist table. + + Args: + connection: database connection to execute the query. + MB_artist_data (of type - sqlalchemy.resultproxy): data retrieved from the + artist table of the MusicBrainz database (should contain begin_area values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + begin_area data fetched from MusicBrainz database. 
+ """ + filters = [] + filter_data = {} + + # Get data corresponding to begin_area column in artist table + MB_artist_fk_begin_area = list({value['begin_area'] for value in MB_artist_data}) + + if artist_credit_from_recording: + filters.append("artist_credit.id in :credit") + filter_data["credit"] = tuple(artist_credit_from_recording) + + if MB_artist_data: + filters.append("area.id in :data") + filter_data["data"] = tuple(MB_artist_fk_begin_area) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + begin_area_query = text(""" + SELECT DISTINCT area.id, + area.gid, + area.name, + area.type, + area.edits_pending, + area.last_updated, + area.begin_date_year, + area.begin_date_month, + area.begin_date_day, + area.end_date_year, + area.end_date_month, + area.end_date_day, + area.ended, + area.comment + FROM area + INNER JOIN artist + ON area.id = artist.begin_area + INNER JOIN artist_credit + ON artist.id = artist_credit.id + {filterstr} + """.format(filterstr=filterstr) + ) + + result = connection.execute(begin_area_query, filter_data) + MB_begin_area_data = result.fetchall() + + return MB_begin_area_data + + +def load_end_area(connection, MB_artist_data, artist_credit_from_recording): + """Fetch area table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database for end area column. + + Also fetch data corresponding to artist table. + + Args: + connection: database connection to execute the query. + MB_artist_data (of type - sqlalchemy.resultproxy): data retrieved from the + artist table of the MusicBrainz database (should contain end_area values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + end_area data fetched from MusicBrainz database. 
+ """ + filters = [] + filter_data = {} + + # Get data corresponding to end_area column in artist table + MB_artist_fk_end_area = list({value['end_area'] for value in MB_artist_data}) + + if artist_credit_from_recording: + filters.append("artist_credit.id in :ids") + filter_data["ids"] = tuple(artist_credit_from_recording) + + if MB_artist_data: + filters.append("area.id in :data") + filter_data["data"] = tuple(MB_artist_fk_end_area) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + end_area_query = text(""" + SELECT DISTINCT area.id, + area.gid, + area.name, + area.type, + area.edits_pending, + area.last_updated, + area.begin_date_year, + area.begin_date_month, + area.begin_date_day, + area.end_date_year, + area.end_date_month, + area.end_date_day, + area.ended, + area.comment + FROM area + INNER JOIN artist + ON area.id = artist.end_area + INNER JOIN artist_credit + ON artist.id = artist_credit.id + {filterstr} + """.format(filterstr=filterstr) + ) + result = connection.execute(end_area_query, filter_data) + MB_end_area_data = result.fetchall() + + return MB_end_area_data + + +def load_artist_credit_name(connection, artist_credit_from_recording): + """Fetch artist_credit_name table data from MusicBrainz database + for the recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + artist_credit_name data fetched from MusicBrainz database. 
+ """ + artist_credit_name_query = text(""" + SELECT DISTINCT artist_credit_name.artist_credit, + artist_credit_name.position, + artist_credit_name.artist, + artist_credit_name.name, + artist_credit_name.join_phrase + FROM artist_credit_name + INNER JOIN artist_credit + ON artist_credit_name.artist_credit = artist_credit.id + WHERE artist_credit.id in :data + """) + result = connection.execute(artist_credit_name_query, {'data': tuple(artist_credit_from_recording)}) + MB_artist_credit_name_data = result.fetchall() + + return MB_artist_credit_name_data + + +def load_artist(connection, MB_artist_credit_name_data, MB_artist_gid_redirect_data, artist_credit_from_recording): + """Fetch artist table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Also fetch data corresponding to artist_credit_name table. + + Args: + connection: database connection to execute the query. + MB_artist_credit_name_data (of type - sqlalchemy.resultproxy): data retrieved from the + artist_credit_name table of the MusicBrainz database (should contain artist values). + MB_artist_gid_redirect_data (of type - sqlalchemy.resultproxy): data retrieved from the + artist_gid_redirect table of the MusicBrainz database (should contain artist values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + artist data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to artist column in artist_credit_name table. + MB_artist_credit_name_fk_artist = list({value['artist'] for value in MB_artist_credit_name_data}) + + # Get data corresponding to new_id column in artist_gid_redirect table. 
+ MB_artist_gid_redirect_fk_artist = list({value['new_id'] for value in MB_artist_gid_redirect_data}) + + if artist_credit_from_recording: + filters.append("artist_credit.id in :credit") + filter_data["credit"] = tuple(artist_credit_from_recording) + + if MB_artist_credit_name_data: + filters.append("artist.id in :data") + filter_data["data"] = tuple(MB_artist_credit_name_fk_artist) + + if MB_artist_gid_redirect_data: + filters.append("artist.id in :redirect_data") + filter_data["redirect_data"] = tuple(MB_artist_gid_redirect_fk_artist) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + artist_query = text(""" + SELECT DISTINCT artist.id, artist.gid, artist.name, artist.sort_name, artist.begin_date_year, + artist.begin_date_month, artist.begin_date_day, artist.end_date_year, artist.end_date_month, + artist.end_date_day, artist.type, artist.area, artist.gender, artist.comment, artist.edits_pending, + artist.last_updated, artist.ended, artist.begin_area, artist.end_area + FROM artist + INNER JOIN artist_credit + ON artist_credit.id = artist.id + {filterstr} + """.format(filterstr=filterstr) + ) + + result = connection.execute(artist_query, filter_data) + MB_artist_data = result.fetchall() + + return MB_artist_data + + +def load_artist_gid_redirect(connection, artist_credit_from_recording): + """Fetch artist_gid_redirect table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + artist_gid_redirect data fetched from MusicBrainz database. 
+ """ + artist_gid_redirect_query = text(""" + SELECT DISTINCT artist_gid_redirect.gid, + artist_gid_redirect.new_id, + artist_gid_redirect.created + FROM artist_gid_redirect + INNER JOIN artist + ON artist.id = artist_gid_redirect.new_id + INNER JOIN artist_credit + ON artist.id = artist_credit.id + WHERE artist_credit.id in :data + """) + result = connection.execute(artist_gid_redirect_query, {'data': tuple(artist_credit_from_recording)}) + MB_artist_gid_redirect_data = result.fetchall() + + return MB_artist_gid_redirect_data + + +def load_recording(connection, gids_in_AB, MB_recording_gid_redirect_data): + """Fetch recording table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + gids_in_AB: list of recordings mbids present in lowlevel table in AB database. + MB_recording_gid_redirect_data (of type - sqlalchemy.resultproxy): data retrieved from the + recording_gid_redirect table of the MusicBrainz database (should contain recording values). + Returns: + recording data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to new_id column in recording_gid_redirect table. 
+ MB_recording_gid_redirect_fk_recording = list({value['new_id'] for value in MB_recording_gid_redirect_data}) + + if gids_in_AB: + filters.append("recording.gid in :gids") + filter_data["gids"] = tuple(gids_in_AB) + + if MB_recording_gid_redirect_data: + filters.append("recording.id in :redirect_data") + filter_data["redirect_data"] = tuple(MB_recording_gid_redirect_fk_recording) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + recording_query = text(""" + SELECT DISTINCT recording.id, recording.gid, recording.name, recording.artist_credit, + recording.length, recording.comment, recording.edits_pending, recording.last_updated, + recording.video + FROM recording + {filterstr} + """.format(filterstr=filterstr) + ) + result = connection.execute(recording_query, filter_data) + MB_recording_data = result.fetchall() + + return MB_recording_data + + +def load_recording_gid_redirect(connection, gids_in_AB): + """Fetch recording_gid_redirect table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + gids_in_AB: list of recordings mbids present in lowlevel table in AB database. + Returns: + recording_gid_redirect data fetched from MusicBrainz database. 
+ """ + recording_gid_redirect_query = text(""" + SELECT DISTINCT recording_gid_redirect.gid, + recording_gid_redirect.new_id, + recording_gid_redirect.created + FROM recording_gid_redirect + INNER JOIN recording + ON recording.id = recording_gid_redirect.new_id + WHERE recording.gid in :gids + """) + result = connection.execute(recording_gid_redirect_query, {'gids': tuple(gids_in_AB)}) + MB_recording_gid_redirect_data = result.fetchall() + + return MB_recording_gid_redirect_data + + +def load_release_group(connection, MB_release_group_gid_redirect_data, MB_release_data, artist_credit_from_recording): + """Fetch release_group table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Also fetch data corresponding to release_group_gid_redirect and + release table. + + Args: + connection: database connection to execute the query. + MB_release_group_gid_redirect_data (of type - sqlalchemy.resultproxy): data retrieved from the + release_group_gid_redirect table of the MusicBrainz database + (should contain release_group values). + MB_release_data (of type - sqlalchemy.resultproxy): data retrieved from the + release table of the MusicBrainz database (should contain release_group values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + release_group data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to release_group column in release_group_gid_redirect table. + MB_release_group_gid_redirect_fk_release_group = list({value['new_id'] for value in MB_release_group_gid_redirect_data}) + + # Get data corresponding to release_group column in release table. 
+ MB_release_fk_release_group = list({value['release_group'] for value in MB_release_data}) + + if artist_credit_from_recording: + filters.append("release_group.artist_credit in :credit") + filter_data["credit"] = tuple(artist_credit_from_recording) + + if MB_release_group_gid_redirect_data: + filters.append("release_group.id in :redirect_data") + filter_data["redirect_data"] = tuple(MB_release_group_gid_redirect_fk_release_group) + + if MB_release_data: + filters.append("release_group.id in :release_data") + filter_data["release_data"] = tuple(MB_release_fk_release_group) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + release_group_query = text(""" + SELECT DISTINCT release_group.id, + release_group.gid, + release_group.name, + release_group.artist_credit, + release_group.type, + release_group.comment, + release_group.edits_pending, + release_group.last_updated + FROM release_group + {filterstr} + """.format(filterstr=filterstr) + ) + + result = connection.execute(release_group_query, filter_data) + MB_release_group_data = result.fetchall() + + return MB_release_group_data + + +def load_release_group_gid_redirect(connection, artist_credit_from_recording): + """Fetch release_group_gid_redirect table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + release_group_gid_redirect data fetched from MusicBrainz database. 
+ """ + release_group_gid_redirect_query = text(""" + SELECT DISTINCT release_group_gid_redirect.gid, + release_group_gid_redirect.new_id, + release_group_gid_redirect.created + FROM release_group_gid_redirect + INNER JOIN release_group + ON release_group.id = release_group_gid_redirect.new_id + WHERE release_group.artist_credit in :data + """) + result = connection.execute(release_group_gid_redirect_query, {'data': tuple(artist_credit_from_recording)}) + MB_release_group_gid_redirect_data = result.fetchall() + + return MB_release_group_gid_redirect_data + + +def load_release(connection, MB_medium_data, MB_release_gid_redirect_data, artist_credit_from_recording): + """Fetch release table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Also fetch data corresponding to medium and release_gid_redirect table. + + Args: + connection: database connection to execute the query. + MB_medium_data (of type - sqlalchemy.resultproxy): data retrieved from the + medium table of the MusicBrainz database (should contain release values). + MB_release_gid_redirect_data (of type - sqlalchemy.resultproxy): data retrieved from the + release_gid_redirect table of the MusicBrainz database + (should contain release values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + release data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to release column in medium table. + MB_medium_fk_release = list({value['release'] for value in MB_medium_data}) + + # Get data corresponding to new_id column in release_gid_redirect table. 
+ MB_release_gid_redirect_fk_release = list({value['new_id'] for value in MB_release_gid_redirect_data}) + + if artist_credit_from_recording: + filters.append("release.artist_credit in :credit") + filter_data["credit"] = tuple(artist_credit_from_recording) + + if MB_medium_data: + filters.append("release.id in :medium_data") + filter_data["medium_data"] = tuple(MB_medium_fk_release) + + if MB_release_gid_redirect_data: + filters.append("release.id in :redirect_data") + filter_data["redirect_data"] = tuple(MB_release_gid_redirect_fk_release) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + release_query = text(""" + SELECT DISTINCT release.id, + release.gid, + release.name, + release.artist_credit, + release.release_group, + release.status, + release.packaging, + release.language, + release.script, + release.barcode, + release.comment, + release.edits_pending, + release.quality, + release.last_updated + FROM release + {filterstr} + """.format(filterstr=filterstr) + ) + + result = connection.execute(release_query, filter_data) + MB_release_data = result.fetchall() + + return MB_release_data + + +def load_release_gid_redirect(connection, artist_credit_from_recording): + """Fetch release_gid_redirect table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + release_gid_redirect data fetched from MusicBrainz database. 
+ """ + release_gid_redirect_query = text(""" + SELECT DISTINCT release_gid_redirect.gid, + release_gid_redirect.new_id, + release_gid_redirect.created + FROM release_gid_redirect + INNER JOIN release + ON release.id = release_gid_redirect.new_id + WHERE release.artist_credit in :data + """) + result = connection.execute(release_gid_redirect_query, {'data': tuple(artist_credit_from_recording)}) + MB_release_gid_redirect_data = result.fetchall() + + return MB_release_gid_redirect_data + + +def load_medium(connection, MB_track_data, artist_credit_from_recording): + """Fetch medium table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Also fetch data corresponding to track table. + + Args: + connection: database connection to execute the query. + MB_track_data (of type - sqlalchemy.resultproxy): data retrieved from the + track table of the MusicBrainz database (should contain medium values). + artist_credit_from_recording: list of artist_credit data from recording data retrieved from + recording table of MusicBrainz database. + Returns: + medium data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to medium column in track table. 
+ MB_track_fk_medium = list({value['medium'] for value in MB_track_data}) + + if artist_credit_from_recording: + filters.append("release.artist_credit in :credit") + filter_data["credit"] = tuple(artist_credit_from_recording) + + if MB_track_data: + filters.append("medium.id in :data") + filter_data["data"] = tuple(MB_track_fk_medium) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + medium_query = text(""" + SELECT DISTINCT medium.id, + medium.release, + medium.position, + medium.format, + medium.name, + medium.edits_pending, + medium.last_updated, + medium.track_count + FROM medium + INNER JOIN release + ON release.id = medium.release + {filterstr} + """.format(filterstr=filterstr) + ) + + result = connection.execute(medium_query, filter_data) + MB_medium_data = result.fetchall() + + return MB_medium_data + + +def load_track(connection, MB_track_gid_redirect_data, id_from_recording): + """Fetch track table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Also fetch data corresponding to track_gid_redirect table. + + Args: + connection: database connection to execute the query. + MB_track_gid_redirect_data (of type - sqlalchemy.resultproxy): data retrieved from the + track_gid_redirect table of the MusicBrainz database (should contain track values). + id_from_recording: list of recording ids from recording data fetched from MusicBrainz database. + Returns: + track data fetched from MusicBrainz database. + """ + filters = [] + filter_data = {} + + # Get data corresponding to new_id column in track_gid_redirect table. 
+ MB_track_gid_redirect_fk_track = list({value['new_id'] for value in MB_track_gid_redirect_data}) + + if id_from_recording: + filters.append("track.recording in :ids") + filter_data["ids"] = tuple(id_from_recording) + + if MB_track_gid_redirect_data: + filters.append("track.id in :redirect_data") + filter_data["redirect_data"] = tuple(MB_track_gid_redirect_fk_track) + + filterstr = " OR ".join(filters) + if filterstr: + filterstr = " WHERE " + filterstr + + track_query = text(""" + SELECT DISTINCT track.id, + track.gid, + track.recording, + track.medium, + track.position, + track.number, + track.name, + track.artist_credit, + track.length, + track.edits_pending, + track.last_updated, + track.is_data_track + FROM track + {filterstr} + """.format(filterstr=filterstr) + ) + result = connection.execute(track_query, filter_data) + MB_track_data = result.fetchall() + + return MB_track_data + + +def load_track_gid_redirect(connection, id_from_recording): + """Fetch track_gid_redirect table data from MusicBrainz database for the + recording MBIDs in AcousticBrainz database. + + Args: + connection: database connection to execute the query. + id_from_recording: list of recording ids from recording data fetched from MusicBrainz database. + Returns: + track_gid_redirect data fetched from MusicBrainz database. + """ + track_gid_redirect_query = text(""" + SELECT DISTINCT track_gid_redirect.gid, + track_gid_redirect.new_id, + track_gid_redirect.created + FROM track_gid_redirect + INNER JOIN track + ON track.id = track_gid_redirect.new_id + WHERE track.recording in :ids + """) + result = connection.execute(track_gid_redirect_query, {'ids': tuple(id_from_recording)}) + MB_track_gid_redirect_data = result.fetchall() + + return MB_track_gid_redirect_data + + +def write_artist_credit(connection, MB_artist_credit_data): + """Insert data into artist_credit table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. 
+ MB_artist_credit_data: list of artist_credit data fetched from MusicBrainz database. + """ + artist_credit_query = text(""" + INSERT INTO musicbrainz.artist_credit + VALUES (:id, :name, :artist_count, :ref_count, :created) + ON CONFLICT (id) DO NOTHING + """) + values = [{ + "id" : value[0], + "name" : value[1], + "artist_count" : value[2], + "ref_count" : value[3], + "created" : value[4]} for value in MB_artist_credit_data + ] + connection.execute(artist_credit_query, values) + logging.info('Inserted %d rows in artist credit table!' % len(MB_artist_credit_data)) + + +def write_artist_type(connection, MB_artist_type_data): + """Insert data in artist_type table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_artist_type_data: list of artist_type data fetched from MusicBrainz database. + """ + artist_type_query = text(""" + INSERT INTO musicbrainz.artist_type(id, name, parent, child_order, description, gid) + VALUES (:id, :name, :parent, :child_order, :description, :gid) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "id": value[0], + "name" : value[01], + "parent" : value[2], + "child_order" : value[3], + "description" : value[4], + "gid" : value[5]} for value in MB_artist_type_data + ] + connection.execute(artist_type_query, values) + logging.info('Inserted %d rows in artist type table!' % len(MB_artist_type_data)) + + +def write_area_type(connection, MB_area_type_data): + """Insert data in area_type table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_area_type_data: list of area_type data fetched from MusicBrainz database. 
+ """ + area_type_query = text(""" + INSERT INTO musicbrainz.area_type + VALUES (:id, :name, :parent, :child_order, :description, :gid) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "id": value[0], + "name": value[1], + "parent": value[2], + "child_order": value[3], + "description": value[4], + "gid": value[5]} for value in MB_area_type_data + ] + connection.execute(area_type_query, values) + logging.info('Inserted %d rows in area type table!' % len(MB_area_type_data)) + + +def write_begin_area_type(connection, MB_begin_area_type_data): + """Insert data in area_type table in musicbrainz schema in + AcousticBrainz database for begin_area column in artist table. + + Args: + connection: database connection to execute the query. + MB_begin_area_type_data: list of begin_area_type data fetched from MusicBrainz database. + """ + begin_area_type_query = text(""" + INSERT INTO musicbrainz.area_type + VALUES (:id, :name, :parent, :child_order, :description, :gid) + ON CONFLICT (id) DO NOTHING + """) + values = [{ + "id": value[0], + "name": value[1], + "parent": value[2], + "child_order": value[3], + "description": value[4], + "gid": value[5]} for value in MB_begin_area_type_data + ] + connection.execute(begin_area_type_query, values) + logging.info('Inserted %d rows in area type table for begin area data!' % len(MB_begin_area_type_data)) + + +def write_end_area_type(connection, MB_end_area_type_data): + """Insert data in area_type table in musicbrainz schema in + AcousticBrainz database for end area column in artist table. + + Args: + connection: database connection to execute the query. + MB_end_area_type_data: list of end_area_type data fetched from MusicBrainz database. 
+ """ + end_area_type_query = text(""" + INSERT INTO musicbrainz.area_type(id, name, parent, child_order, description, gid) + VALUES (:id, :name, :parent, :child_order, :description, :gid) + ON CONFLICT (id) DO NOTHING + """) + values = [{ + "id": value[0], + "name": value[1], + "parent": value[2], + "child_order": value[3], + "description": value[4], + "gid": value[5]} for value in MB_end_area_type_data + ] + connection.execute(end_area_type_query, values) + logging.info('Inserted %d rows in area type table for end area data!' % len(MB_end_area_type_data)) + + +def write_release_status(connection, MB_release_status_data): + """Insert data in release_status table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_release_status_data: list of release_status data fetched from MusicBrainz database. + """ + release_status_query = text(""" + INSERT INTO musicbrainz.release_status + VALUES (:id, :name, :parent, :child_order, :description, :gid) + ON CONFLICT (gid) DO NOTHING + """) + values= [{ + "id": value[0], + "name": value[1], + "parent": value[2], + "child_order": value[3], + "description": value[4], + "gid": value[5]} for value in MB_release_status_data + ] + result = connection.execute(release_status_query, values) + logging.info('Inserted %d rows in release status table!' % len(MB_release_status_data)) + + +def write_release_group_primary_type(connection, MB_release_group_primary_type_data): + """Insert data in release_group_primary_type table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_release_group_primary_type_data: list of release_group_primary_type data fetched from MusicBrainz database. 
+ """ + release_group_primary_type_query = text(""" + INSERT INTO musicbrainz.release_group_primary_type + VALUES (:id, :name, :parent, :child_order, :description, :gid) + ON CONFLICT (id) DO NOTHING + """) + values = [{ + "id": value[0], + "name": value[1], + "parent": value[2], + "child_order": value[3], + "description": value[4], + "gid": value[5]} for value in MB_release_group_primary_type_data + ] + connection.execute(release_group_primary_type_query, values) + logging.info('Inserted %d rows in release group primary type table!' % len(MB_release_group_primary_type_data)) + + +def write_medium_format(connection, MB_medium_format_data): + """Insert data in medium_format table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_medium_format_data: list of medium_format data fetched from MusicBrainz database. + """ + medium_format_query = text(""" + INSERT INTO musicbrainz.medium_format + VALUES (:id, :name, :parent, :child_order, :year, :has_discids, :description, :gid) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "id": value[0], + "name": value[1], + "parent": value[2], + "child_order": value[3], + "year": value[4], + "has_discids": value[5], + "description": value[6], + "gid": value[7]} for value in MB_medium_format_data + ] + connection.execute(text("""ALTER TABLE musicbrainz.medium_format DROP CONSTRAINT IF EXISTS medium_format_fk_parent""")) + connection.execute(medium_format_query, values) + logging.info('Inserted %d rows in medium format table!' % len(MB_medium_format_data)) + + +def write_release_packaging(connection, MB_release_packaging_data): + """Insert data in release_packaging table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_release_packaging_data: list of release_packaging data fetched from MusicBrainz database. 
def write_language(connection, MB_language_data):
    """Insert rows into the language table of the musicbrainz schema
    in the AcousticBrainz database.

    Rows whose iso_code_2b is already present are left untouched
    (ON CONFLICT (iso_code_2b) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_language_data: list of language rows fetched from the MusicBrainz
            database. Each row is read positionally as
            (id, iso_code_2t, iso_code_2b, iso_code_1, name, frequency, iso_code_3).
    """
    if not MB_language_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    language_query = text("""
        INSERT INTO musicbrainz.language
             VALUES (:id, :iso_code_2t, :iso_code_2b, :iso_code_1, :name, :frequency, :iso_code_3)
        ON CONFLICT (iso_code_2b) DO NOTHING
    """)
    columns = ("id", "iso_code_2t", "iso_code_2b", "iso_code_1", "name",
               "frequency", "iso_code_3")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_language_data
    ]
    connection.execute(language_query, values)
    logging.info('Inserted %d rows in language table!', len(MB_language_data))
def write_gender(connection, MB_gender_data):
    """Insert rows into the gender table of the musicbrainz schema
    in the AcousticBrainz database.

    Rows whose gid is already present are left untouched
    (ON CONFLICT (gid) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_gender_data: list of gender rows fetched from the MusicBrainz
            database. Each row is read positionally as
            (id, name, parent, child_order, description, gid).
    """
    if not MB_gender_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    gender_query = text("""
        INSERT INTO musicbrainz.gender
             VALUES (:id, :name, :parent, :child_order, :description, :gid)
        ON CONFLICT (gid) DO NOTHING
    """)
    columns = ("id", "name", "parent", "child_order", "description", "gid")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_gender_data
    ]
    connection.execute(gender_query, values)
    logging.info('Inserted %d rows in gender table!', len(MB_gender_data))
+ """ + area_query = text(""" + INSERT INTO musicbrainz.area + VALUES (:id, :gid, :name, :type, :edits_pending, :last_updated, :begin_date_year, + :begin_date_month, :begin_date_day, :end_date_year, :end_date_month, :end_date_day, + :ended, :comment) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "id": value[0], + "gid": value[1], + "name": value[2], + "type": value[3], + "edits_pending": value[4], + "last_updated": value[5], + "begin_date_year": value[6], + "begin_date_month": value[7], + "begin_date_day": value[8], + "end_date_year": value[9], + "end_date_month": value[10], + "end_date_day": value[11], + "ended": value[12], + "comment": value[13]} for value in MB_area_data + ] + connection.execute(area_query, values) + logging.info('Inserted %d rows in area table!' % len(MB_area_data)) + + +def write_begin_area(connection, MB_begin_area_data): + """Insert data in area table in musicbrainz schema in + AcousticBrainz database for begin_area column in artist + table. + + Args: + connection: database connection to execute the query. + MB_begin_area_data: list of begin_area data fetched from MusicBrainz database. + """ + begin_area_query = text(""" + INSERT INTO musicbrainz.area + VALUES (:id, :gid, :name, :type, :edits_pending, :last_updated, :begin_date_year, + :begin_date_month, :begin_date_day, :end_date_year, :end_date_month, :end_date_day, + :ended, :comment) + ON CONFLICT (id) DO NOTHING + """) + values = [{ + "id": value[0], + "gid": value[1], + "name": value[2], + "type": value[3], + "edits_pending": value[4], + "last_updated": value[5], + "begin_date_year": value[6], + "begin_date_month": value[7], + "begin_date_day": value[8], + "end_date_year": value[9], + "end_date_month": value[10], + "end_date_day": value[11], + "ended": value[12], + "comment": value[13]} for value in MB_begin_area_data + ] + connection.execute(begin_area_query, values) + logging.info('Inserted %d rows in area table for begin area data!' 
def write_end_area(connection, MB_end_area_data):
    """Insert rows into the area table of the musicbrainz schema in the
    AcousticBrainz database for the end_area column of the artist table.

    Rows whose id is already present are left untouched
    (ON CONFLICT (id) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_end_area_data: list of end_area rows fetched from the MusicBrainz
            database. Each row is read positionally as
            (id, gid, name, type, edits_pending, last_updated,
            begin_date_year, begin_date_month, begin_date_day,
            end_date_year, end_date_month, end_date_day, ended, comment).
    """
    if not MB_end_area_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    end_area_query = text("""
        INSERT INTO musicbrainz.area
             VALUES (:id, :gid, :name, :type, :edits_pending, :last_updated, :begin_date_year,
                     :begin_date_month, :begin_date_day, :end_date_year, :end_date_month, :end_date_day,
                     :ended, :comment)
        ON CONFLICT (id) DO NOTHING
    """)
    columns = ("id", "gid", "name", "type", "edits_pending", "last_updated",
               "begin_date_year", "begin_date_month", "begin_date_day",
               "end_date_year", "end_date_month", "end_date_day",
               "ended", "comment")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_end_area_data
    ]
    connection.execute(end_area_query, values)
    logging.info('Inserted %d rows in area table for end area data!', len(MB_end_area_data))
+ """ + artist_query = text(""" + INSERT INTO musicbrainz.artist + VALUES (:id, :gid, :name, :sort_name, :begin_date_year, :begin_date_month, :begin_date_day, + :end_date_year, :end_date_month, :end_date_day, :type, :area, :gender, :comment, :edits_pending, + :last_updated, :ended, :begin_area, :end_area) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "id": value[0], + "gid": value[1], + "name": value[2], + "sort_name": value[3], + "begin_date_year": value[4], + "begin_date_month": value[5], + "begin_date_day": value[6], + "end_date_year": value[7], + "end_date_month": value[8], + "end_date_day": value[9], + "type": value[10], + "area": value[11], + "gender": value[12], + "comment": value[13], + "edits_pending": value[14], + "last_updated": value[15], + "ended": value[16], + "begin_area": value[17], + "end_area": value[18]} for value in MB_artist_data + ] + connection.execute(artist_query, values) + logging.info('Inserted %d rows in artist table!' % len(MB_artist_data)) + + +def write_artist_credit_name(connection, MB_artist_credit_name_data): + """Insert data in artist_credit_name table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_artist_credit_name_data: list of artist_credit_name data fetched from MusicBrainz database. + """ + artist_credit_name_query = text(""" + INSERT INTO musicbrainz.artist_credit_name + VALUES (:artist_credit, :position, :artist, :name, :join_phrase) + ON CONFLICT (artist_credit, position) DO NOTHING + """) + values = [{ + "artist_credit": value[0], + "position": value[1], + "artist": value[2], + "name": value[3], + "join_phrase": value[4]} for value in MB_artist_credit_name_data + ] + connection.execute(artist_credit_name_query, values) + logging.info('Inserted %d rows in artist credit name table!' 
def write_recording(connection, MB_recording_data):
    """Insert rows into the recording table of the musicbrainz schema
    in the AcousticBrainz database.

    Rows whose gid is already present are left untouched
    (ON CONFLICT (gid) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_recording_data: list of recording rows fetched from the
            MusicBrainz database. Each row is read positionally as
            (id, gid, name, artist_credit, length, comment, edits_pending,
            last_updated, video).
    """
    if not MB_recording_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    recording_query = text("""
        INSERT INTO musicbrainz.recording
             VALUES (:id, :gid, :name, :artist_credit, :length, :comment, :edits_pending, :last_updated, :video)
        ON CONFLICT (gid) DO NOTHING
    """)
    columns = ("id", "gid", "name", "artist_credit", "length", "comment",
               "edits_pending", "last_updated", "video")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_recording_data
    ]
    connection.execute(recording_query, values)
    logging.info('Inserted %d rows in recording table!', len(MB_recording_data))
def write_release_group(connection, MB_release_group_data):
    """Insert rows into the release_group table of the musicbrainz schema
    in the AcousticBrainz database.

    Rows whose gid is already present are left untouched
    (ON CONFLICT (gid) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_release_group_data: list of release_group rows fetched from the
            MusicBrainz database. Each row is read positionally as
            (id, gid, name, artist_credit, type, comment, edits_pending,
            last_updated).
    """
    if not MB_release_group_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    release_group_query = text("""
        INSERT INTO musicbrainz.release_group
             VALUES (:id, :gid, :name, :artist_credit, :type, :comment, :edits_pending, :last_updated)
        ON CONFLICT (gid) DO NOTHING
    """)
    columns = ("id", "gid", "name", "artist_credit", "type", "comment",
               "edits_pending", "last_updated")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_release_group_data
    ]
    connection.execute(release_group_query, values)
    logging.info('Inserted %d rows in release group table!', len(MB_release_group_data))
def write_release(connection, MB_release_data):
    """Insert rows into the release table of the musicbrainz schema
    in the AcousticBrainz database.

    Rows whose gid is already present are left untouched
    (ON CONFLICT (gid) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_release_data: list of release rows fetched from the MusicBrainz
            database. Each row is read positionally as
            (id, gid, name, artist_credit, release_group, status, packaging,
            language, script, barcode, comment, edits_pending, quality,
            last_updated).
    """
    if not MB_release_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    release_query = text("""
        INSERT INTO musicbrainz.release
             VALUES (:id, :gid, :name, :artist_credit, :release_group, :status, :packaging, :language,
                     :script, :barcode, :comment, :edits_pending, :quality, :last_updated)
        ON CONFLICT (gid) DO NOTHING
    """)
    columns = ("id", "gid", "name", "artist_credit", "release_group", "status",
               "packaging", "language", "script", "barcode", "comment",
               "edits_pending", "quality", "last_updated")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_release_data
    ]
    connection.execute(release_query, values)
    logging.info('Inserted %d rows in release table!', len(MB_release_data))
def write_medium(connection, MB_medium_data):
    """Insert rows into the medium table of the musicbrainz schema
    in the AcousticBrainz database.

    Rows whose id is already present are left untouched
    (ON CONFLICT (id) DO NOTHING).

    Args:
        connection: database connection to execute the query.
        MB_medium_data: list of medium rows fetched from the MusicBrainz
            database. Each row is read positionally as
            (id, release, position, format, name, edits_pending,
            last_updated, track_count).
    """
    if not MB_medium_data:
        # Nothing to insert: an empty list supplies no bind values for the
        # statement, so skip the round trip entirely.
        return

    medium_query = text("""
        INSERT INTO musicbrainz.medium
             VALUES (:id, :release, :position, :format, :name, :edits_pending, :last_updated, :track_count)
        ON CONFLICT (id) DO NOTHING
    """)
    columns = ("id", "release", "position", "format", "name", "edits_pending",
               "last_updated", "track_count")
    values = [
        {column: row[index] for index, column in enumerate(columns)}
        for row in MB_medium_data
    ]
    connection.execute(medium_query, values)
    logging.info('Inserted %d rows in medium table!', len(MB_medium_data))
+ """ + track_query = text(""" + INSERT INTO musicbrainz.track + VALUES (:id, :gid, :recording, :medium, :position, :number, :name, :artist_credit, :length, + :edits_pending, :last_updated, :is_data_track) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "id": value[0], + "gid": value[1], + "recording": value[2], + "medium": value[3], + "position": value[4], + "number": value[5], + "name": value[6], + "artist_credit": value[7], + "length": value[8], + "edits_pending": value[9], + "last_updated": value[10], + "is_data_track": value[11]} for value in MB_track_data + ] + connection.execute(track_query, values) + logging.info('Inserted %d rows in track table!' % len(MB_track_data)) + + +def write_track_gid_redirect(connection, MB_track_gid_redirect_data): + """Insert data in track_gid_redirect table in musicbrainz schema in + AcousticBrainz database. + + Args: + connection: database connection to execute the query. + MB_track_gid_redirect_data: list of track_gid_redirect data fetched from MusicBrainz database. + """ + track_gid_redirect_query = text(""" + INSERT INTO musicbrainz.track_gid_redirect + VALUES (:gid, :new_id, :created) + ON CONFLICT (gid) DO NOTHING + """) + values = [{ + "gid": value[0], + "new_id": value[1], + "created": value[2]} for value in MB_track_gid_redirect_data + ] + connection.execute(track_gid_redirect_query, values) + logging.info('Inserted %d rows in track gid redirect table!' 
def _load_or_empty(start_message, table_label, loader, *args):
    """Run one MusicBrainz loader, falling back to an empty list.

    Args:
        start_message: progress message logged before the loader runs.
        table_label: table name used in the "No Data found" log message.
        loader: load_* callable to invoke.
        *args: positional arguments forwarded to ``loader``.

    Returns:
        Whatever ``loader`` returns, or ``[]`` if it raised ValueError.
    """
    logging.info(start_message)
    try:
        return loader(*args)
    except ValueError:
        logging.info("No Data found from %s table for the recordings" % table_label)
        return []


def fetch_and_insert_musicbrainz_data(gids_in_AB):
    """Fetch MusicBrainz rows for a batch of recordings and store them in
    the musicbrainz schema of the AcousticBrainz database.

    Every loader runs inside one MusicBrainz transaction; the results are
    then written inside one AcousticBrainz transaction. A loader that raises
    ValueError is logged and treated as having returned no rows, so the
    remaining tables are still processed (previously the result variable
    stayed unbound and a later use raised UnboundLocalError).

    Args:
        gids_in_AB: list of recording gids taken from the AcousticBrainz
            lowlevel table for this batch.
    """
    # Get MusicBrainz data
    logging.info('Getting %d recordings data at a time...\n' % (len(gids_in_AB)))
    with musicbrainz_db.engine.begin() as connection:
        MB_recording_gid_redirect_data = _load_or_empty(
            'Getting recording gid redirect data...', 'recording gid redirect',
            load_recording_gid_redirect, connection, gids_in_AB)

        MB_recording_data = _load_or_empty(
            'Getting recording data...', 'recording',
            load_recording, connection, gids_in_AB, MB_recording_gid_redirect_data)
        # Column 3 of a recording row is artist_credit, column 0 is id.
        artist_credit_from_recording = [value[3] for value in MB_recording_data]
        id_from_recording = [value[0] for value in MB_recording_data]

        MB_track_gid_redirect_data = _load_or_empty(
            'Getting track gid redirect data...', 'track gid redirect',
            load_track_gid_redirect, connection, id_from_recording)

        MB_track_data = _load_or_empty(
            'Getting track data...', 'track',
            load_track, connection, MB_track_gid_redirect_data, id_from_recording)

        MB_medium_data = _load_or_empty(
            'Getting medium data...', 'medium',
            load_medium, connection, MB_track_data, artist_credit_from_recording)

        MB_release_gid_redirect_data = _load_or_empty(
            'Getting release gid redirect data...', 'release gid redirect',
            load_release_gid_redirect, connection, artist_credit_from_recording)

        MB_release_data = _load_or_empty(
            'Getting release data...', 'release',
            load_release, connection, MB_medium_data, MB_release_gid_redirect_data,
            artist_credit_from_recording)

        MB_artist_credit_name_data = _load_or_empty(
            'Getting artist credit name data...', 'artist credit name',
            load_artist_credit_name, connection, artist_credit_from_recording)

        MB_artist_gid_redirect_data = _load_or_empty(
            'Getting artist gid redirect data...', 'artist gid redirect',
            load_artist_gid_redirect, connection, artist_credit_from_recording)

        MB_artist_data = _load_or_empty(
            'Getting artist data...', 'artist',
            load_artist, connection, MB_artist_credit_name_data,
            MB_artist_gid_redirect_data, artist_credit_from_recording)

        MB_artist_type_data = _load_or_empty(
            'Getting artist type data...', 'artist type',
            load_artist_type, connection)

        MB_area_data = _load_or_empty(
            'Getting area data...', 'area',
            load_area, connection, MB_artist_data, artist_credit_from_recording)

        MB_begin_area_data = _load_or_empty(
            'Getting begin area data...', 'area',
            load_begin_area, connection, MB_artist_data, artist_credit_from_recording)

        MB_end_area_data = _load_or_empty(
            'Getting end area data...', 'area',
            load_end_area, connection, MB_artist_data, artist_credit_from_recording)

        MB_area_type_data = _load_or_empty(
            'Getting area type data...', 'area type',
            load_area_type, connection)

        MB_begin_area_type_data = _load_or_empty(
            'Getting begin area type data...', 'area type',
            load_begin_area_type, connection, artist_credit_from_recording)

        # The progress message used to read 'Getting end area data...',
        # duplicating the end_area step above.
        MB_end_area_type_data = _load_or_empty(
            'Getting end area type data...', 'area type',
            load_end_area_type, connection, artist_credit_from_recording)

        MB_gender_data = _load_or_empty(
            'Getting gender data...', 'gender',
            load_gender, connection)

        MB_language_data = _load_or_empty(
            'Getting language data...', 'language',
            load_language, connection, MB_release_data, artist_credit_from_recording)

        MB_medium_format_data = _load_or_empty(
            'Getting medium format data...', 'medium format',
            load_medium_format, connection)

        MB_release_group_gid_redirect_data = _load_or_empty(
            'Getting release group gid redirect data...', 'release group gid redirect',
            load_release_group_gid_redirect, connection, artist_credit_from_recording)

        MB_release_group_data = _load_or_empty(
            'Getting release group data...', 'release group',
            load_release_group, connection, MB_release_group_gid_redirect_data,
            MB_release_data, artist_credit_from_recording)

        MB_artist_credit_data = _load_or_empty(
            'Getting artist credit data...', 'artist credit',
            load_artist_credit, connection, MB_release_data, MB_release_group_data,
            MB_track_data, MB_artist_credit_name_data, artist_credit_from_recording)

        MB_release_group_primary_type_data = _load_or_empty(
            'Getting release group primary type data...', 'release group primary type',
            load_release_group_primary_type, connection)

        MB_release_packaging_data = _load_or_empty(
            'Getting release packaging data...', 'release packaging',
            load_release_packaging, connection)

        MB_release_status_data = _load_or_empty(
            'Getting release status data...', 'release status',
            load_release_status, connection)

        MB_script_data = _load_or_empty(
            'Getting script data...\n', 'script',
            load_script, connection, MB_release_data, artist_credit_from_recording)

    # Write MusicBrainz data into AcousticBrainz database
    logging.info('Inserting %d recordings data at a time...\n' % (len(gids_in_AB)))
    # The write order is kept exactly as before.
    # NOTE(review): presumably referenced tables must be written before the
    # tables that point at them - confirm before reordering.
    write_plan = (
        (write_artist_credit, MB_artist_credit_data),
        (write_artist_type, MB_artist_type_data),
        (write_area_type, MB_area_type_data),
        (write_begin_area_type, MB_begin_area_type_data),
        (write_end_area_type, MB_end_area_type_data),
        (write_release_status, MB_release_status_data),
        (write_release_group_primary_type, MB_release_group_primary_type_data),
        (write_medium_format, MB_medium_format_data),
        (write_release_packaging, MB_release_packaging_data),
        (write_language, MB_language_data),
        (write_script, MB_script_data),
        (write_gender, MB_gender_data),
        (write_area, MB_area_data),
        (write_begin_area, MB_begin_area_data),
        (write_end_area, MB_end_area_data),
        (write_artist, MB_artist_data),
        (write_artist_credit_name, MB_artist_credit_name_data),
        (write_artist_gid_redirect, MB_artist_gid_redirect_data),
        (write_recording, MB_recording_data),
        (write_recording_gid_redirect, MB_recording_gid_redirect_data),
        (write_release_group, MB_release_group_data),
        (write_release_group_gid_redirect, MB_release_group_gid_redirect_data),
        (write_release, MB_release_data),
        (write_release_gid_redirect, MB_release_gid_redirect_data),
        (write_medium, MB_medium_data),
        (write_track, MB_track_data),
        (write_track_gid_redirect, MB_track_gid_redirect_data),
    )
    with db.engine.begin() as connection:
        for writer, rows in write_plan:
            # Skip tables for which nothing was fetched.
            if rows:
                writer(connection, rows)
contain similar items""" + recording_mbids = [uuid.UUID('ceec2751-44fe-44ff-b281-de00df9117d8'), uuid.UUID('575519b3-c06b-4157-b172-5d7ca80a8382')] + one = {"data": "one", "metadata": {"audio_properties": {"lossless": True}, "version": {"essentia_build_sha": "x"}}} + two = {"data": "two", "metadata": {"audio_properties": {"lossless": True}, "version": {"essentia_build_sha": "x"}}} + db.data.write_low_level(recording_mbids[0], one, gid_types.GID_TYPE_MBID) + db.data.write_low_level(recording_mbids[1], two, gid_types.GID_TYPE_MBID) + + self.assertEqual(recording_mbids, db.data.get_new_recordings_from_lowlevel()) + + class DataUtilTestCase(AcousticbrainzTestCase): """ Tests for utility methods in db/data. Should be moved out of db at some time. """ diff --git a/db/test/test_import_mb_data.py b/db/test/test_import_mb_data.py new file mode 100644 index 000000000..283e68b5b --- /dev/null +++ b/db/test/test_import_mb_data.py @@ -0,0 +1,875 @@ +from db.testing import DatabaseTestCase, TEST_DATA_PATH, gid_types +from brainzutils import musicbrainz_db +import db +import db.exceptions +import db.import_mb_data +import os.path +import mock +import uuid +import datetime +import psycopg2 + + +class DataMusicBrainzDBTestCase(DatabaseTestCase): + + def setUp(self): + super(DataMusicBrainzDBTestCase, self).setUp() + + + def test_load_and_write_area(self): + """Writing and loading data for area table using values from referenced area_type table""" + + # area_type + data = [(1, u'Country', None, 1, u'Country is used for areas included (or previously included) in ISO 3166-1, e.g. 
United States.', uuid.UUID('06dd0ae4-8c74-30bb-b43d-95dcedf961de')), + (3, u'City', None, 3, u'City is used for settlements of any size, including towns and villages.', uuid.UUID('6fd8f29a-3d0a-32fc-980d-ea697b69da78')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area_type')) + + # area + data = [(24482, uuid.UUID('915a5576-b30c-4160-93cd-e1185cebb6ac'), u'Smithville', 3, 0, + datetime.datetime(2013, 11, 14, 1, 33, 0, 377353, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (222, uuid.UUID('489ce91b-6658-3307-9877-795b68554c98'), u'United States', 1, 0, + datetime.datetime(2013, 6, 15, 18, 6, 39, 593230, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (5099, uuid.UUID('29a709d8-0320-493e-8d0c-f2c386662b7f'), u'Chicago', 3, 0, + datetime.datetime(2013, 5, 24, 20, 27, 13, 405462, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u'') + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area')) + + + def test_load_and_write_artist(self): + """Writing and loading data for artist table using values from referenced area_type, area, + artist_type & gender tables. + """ + + # area_type + data = [(1, u'Country', None, 1, u'Country is used for areas included (or previously included) in ISO 3166-1, e.g. 
United States.', uuid.UUID('06dd0ae4-8c74-30bb-b43d-95dcedf961de')), + (3, u'City', None, 3, u'City is used for settlements of any size, including towns and villages.', uuid.UUID('6fd8f29a-3d0a-32fc-980d-ea697b69da78')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area_type')) + + # area + data = [(24482, uuid.UUID('915a5576-b30c-4160-93cd-e1185cebb6ac'), u'Smithville', 3, 0, + datetime.datetime(2013, 11, 14, 1, 33, 0, 377353, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (222, uuid.UUID('489ce91b-6658-3307-9877-795b68554c98'), u'United States', 1, 0, + datetime.datetime(2013, 6, 15, 18, 6, 39, 593230, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (5099, uuid.UUID('29a709d8-0320-493e-8d0c-f2c386662b7f'), u'Chicago', 3, 0, + datetime.datetime(2013, 5, 24, 20, 27, 13, 405462, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u'') + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area')) + + # artist_type + data = [(1, u'Person', None, 1, None, uuid.UUID('b6e035f4-3ce9-331c-97df-83397230b0df')), + (2, u'Group', None, 2, None, uuid.UUID('e431f5f6-b5d2-343d-8b36-72607fffb74b')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_type')) + + # gender + data = [(1, u'Male', None, 1, None, uuid.UUID('36d3d30a-839d-3eda-8cb3-29be4384e4a9')), + (2, u'Female', None, 2, None, uuid.UUID('93452b5a-a947-30c8-934f-6a4056b151c2')) + ] + + with db.engine.begin() as connection: + 
db.import_mb_data.write_gender(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'gender')) + + # artist + data = [(6747, uuid.UUID('1b62df85-00d2-464f-81bc-a5c0cdcad278'), u'Tampa Red', u'Tampa Red', 1904, 1, 8, 1981, 3, 19, 1, 222, 1, u'', 0, + datetime.datetime(2016, 8, 21, 5, 0, 58, 662928, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), True, 24482, 5099) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist')) + + + def test_load_and_write_artist_gid_redirect(self): + """Writing and loading data for artist_gid_redirect table using values from referenced area_type, + area, artist_type, gender & artist tables. + """ + + # area_type + data = [(1, u'Country', None, 1, u'Country is used for areas included (or previously included) in ISO 3166-1, e.g. United States.', uuid.UUID('06dd0ae4-8c74-30bb-b43d-95dcedf961de')), + (3, u'City', None, 3, u'City is used for settlements of any size, including towns and villages.', uuid.UUID('6fd8f29a-3d0a-32fc-980d-ea697b69da78')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area_type')) + + # area + data = [(24482, uuid.UUID('915a5576-b30c-4160-93cd-e1185cebb6ac'), u'Smithville', 3, 0, + datetime.datetime(2013, 11, 14, 1, 33, 0, 377353, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (222, uuid.UUID('489ce91b-6658-3307-9877-795b68554c98'), u'United States', 1, 0, + datetime.datetime(2013, 6, 15, 18, 6, 39, 593230, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (5099, uuid.UUID('29a709d8-0320-493e-8d0c-f2c386662b7f'), u'Chicago', 3, 0, + datetime.datetime(2013, 5, 
24, 20, 27, 13, 405462, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u'') + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area')) + + # artist_type + data = [(1, u'Person', None, 1, None, uuid.UUID('b6e035f4-3ce9-331c-97df-83397230b0df')), + (2, u'Group', None, 2, None, uuid.UUID('e431f5f6-b5d2-343d-8b36-72607fffb74b')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_type')) + + # gender + data = [(1, u'Male', None, 1, None, uuid.UUID('36d3d30a-839d-3eda-8cb3-29be4384e4a9')), + (2, u'Female', None, 2, None, uuid.UUID('93452b5a-a947-30c8-934f-6a4056b151c2')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_gender(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'gender')) + + # artist + data = [(6747, uuid.UUID('1b62df85-00d2-464f-81bc-a5c0cdcad278'), u'Tampa Red', u'Tampa Red', 1904, 1, 8, 1981, 3, 19, 1, 222, 1, u'', 0, + datetime.datetime(2016, 8, 21, 5, 0, 58, 662928, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), True, 24482, 5099) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist')) + + # artist_gid_redirect + data = [(uuid.UUID('6873559d-8cb9-494d-9f78-4c1eeab1f851'), 6747, datetime.datetime(2016, 3, 13, 23, 0, 21, 981437, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)))] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_gid_redirect(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_gid_redirect')) + + + 
def test_load_and_write_artist_credit(self): + """Writing and loading data for artist_credit table.""" + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + + def test_load_and_write_artist_credit_name(self): + """Writing and loading data for artist_credit_name table using values from referenced artist_credit, + area_type, area, artist_type, gender & artist tables. 
+ """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # area_type + data = [(1, u'Country', None, 1, u'Country is used for areas included (or previously included) in ISO 3166-1, e.g. 
United States.', uuid.UUID('06dd0ae4-8c74-30bb-b43d-95dcedf961de')), + (3, u'City', None, 3, u'City is used for settlements of any size, including towns and villages.', uuid.UUID('6fd8f29a-3d0a-32fc-980d-ea697b69da78')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area_type')) + + # area + data = [(24482, uuid.UUID('915a5576-b30c-4160-93cd-e1185cebb6ac'), u'Smithville', 3, 0, + datetime.datetime(2013, 11, 14, 1, 33, 0, 377353, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (222, uuid.UUID('489ce91b-6658-3307-9877-795b68554c98'), u'United States', 1, 0, + datetime.datetime(2013, 6, 15, 18, 6, 39, 593230, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u''), + (5099, uuid.UUID('29a709d8-0320-493e-8d0c-f2c386662b7f'), u'Chicago', 3, 0, + datetime.datetime(2013, 5, 24, 20, 27, 13, 405462, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), None, None, None, None, None, None, False, u'') + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_area(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'area')) + + # artist_type + data = [(1, u'Person', None, 1, None, uuid.UUID('b6e035f4-3ce9-331c-97df-83397230b0df')), + (2, u'Group', None, 2, None, uuid.UUID('e431f5f6-b5d2-343d-8b36-72607fffb74b')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_type')) + + # gender + data = [(1, u'Male', None, 1, None, uuid.UUID('36d3d30a-839d-3eda-8cb3-29be4384e4a9')), + (2, u'Female', None, 2, None, uuid.UUID('93452b5a-a947-30c8-934f-6a4056b151c2')) + ] + + with db.engine.begin() as connection: + 
db.import_mb_data.write_gender(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'gender')) + + # artist + data = [(6747, uuid.UUID('1b62df85-00d2-464f-81bc-a5c0cdcad278'), u'Tampa Red', u'Tampa Red', 1904, 1, 8, 1981, 3, 19, 1, 222, 1, u'', 0, + datetime.datetime(2016, 8, 21, 5, 0, 58, 662928, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), True, 24482, 5099) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist')) + + # artist_credit_name + data = [(6747, 0, 6747, u'Tampa Red', u'')] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit_name(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit_name')) + + + def test_load_and_write_recording(self): + """Writing and loading data for recording table using values from referenced artist_credit table.""" + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + 
+ + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # recording + data = [(11768371, uuid.UUID('d51cf7fb-97e1-4070-a40b-b03707f91c92'), u'(Rinaldo, HWV 7: Act I. "Combatti da forte" (Almirena)', 73502, 203000, u'', 0, None, False), + (8598260, uuid.UUID('9086b742-358b-4f73-9a14-84cb1a9ce4ce'), u'Love Story', 399541, 235000, u'', 0, None, False) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_recording(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'recording')) + + + def test_load_and_write_recording_gid_redirect(self): + """Writing and loading data for recording_gid_redirect table using values from referenced + artist_credit & recording tables. + """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, 
db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # recording + data = [(11768371, uuid.UUID('d51cf7fb-97e1-4070-a40b-b03707f91c92'), u'(Rinaldo, HWV 7: Act I. "Combatti da forte" (Almirena)', 73502, 203000, u'', 0, None, False), + (8598260, uuid.UUID('9086b742-358b-4f73-9a14-84cb1a9ce4ce'), u'Love Story', 399541, 235000, u'', 0, None, False) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_recording(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'recording')) + + # recording_gid_redirect + data = [(uuid.UUID('05e1ab2e-f54f-464b-a1fd-fcc6bceaaa20'), 8598260, datetime.datetime(2011, 5, 16, 16, 8, 20, 288158, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)))] + + with db.engine.begin() as connection: + db.import_mb_data.write_recording_gid_redirect(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'recording_gid_redirect')) + + + def test_load_and_write_release_group(self): + """Writing and loading data for release_group table using values from referenced artist_credit + & release_group_primary_type tables. 
+ """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + 
+ with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + + def test_load_and_write_release_group_gid_redirect(self): + """Writing and loading data for release_group_gid_redirect table using values from referenced artist_credit, + release_group_primary_type & release_group tables. + """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, 
uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + # release_group_gid_redirect + data = [(uuid.UUID('21f0a3e8-c37b-33a1-b769-daf16e4e252e'), 617137, datetime.datetime(2011, 5, 16, 14, 57, 6, 530063, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_gid_redirect(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_gid_redirect')) + + + def test_load_and_write_release(self): + """Writing and loading data for release table using values from referenced artist_credit, + release_group_primary_type, release_group, language, script, release_status & release_packaging + tables. 
+ """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + 
+ with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + # language + data = [(120, u'eng', u'eng', u'en', u'English', 2, None)] + + with db.engine.begin() as connection: + db.import_mb_data.write_language(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'language')) + + # release_status + data = [(1, u'Official', None, 1, u'Any release officially sanctioned by the artist and/or their record company. Most releases will fit into this category.', + uuid.UUID('4e304316-386d-3409-af2e-78857eec5cfe'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_status(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_status')) + + # release_packaging + data = [(1, u'Jewel Case', None, 0, u'The traditional CD case, made of hard, brittle plastic.', + uuid.UUID('ec27701a-4a22-37f4-bfac-6616e0f9750a')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_packaging(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_packaging')) + + # script + data = [(28, u'Latn', u'215', u'Latin', 4)] + + with db.engine.begin() as connection: + db.import_mb_data.write_script(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'script')) + + # release + data = [(692283, uuid.UUID('a830f892-6be0-35f0-a392-b91d89d89a94'), u'The Masterworks', 847994, 631361, 1, None, 120, 28, u'5028421923901', u'', 0, -1, + datetime.datetime(2018, 5, 13, 11, 0, 22, 832493, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release(connection, data) + self.assertEqual(data, 
db.import_mb_data.load_musicbrainz_schema_data(connection, 'release')) + + + def test_load_and_write_release_gid_redirect(self): + """Writing and loading data for release_gid_redirect table using values from referenced artist_credit, + release_group_primary_type, release_group, language, release_status, release_packaging, script + & release tables. + """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, 
None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + # language + data = [(120, u'eng', u'eng', u'en', u'English', 2, None)] + + with db.engine.begin() as connection: + db.import_mb_data.write_language(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'language')) + + # release_status + data = [(1, u'Official', None, 1, u'Any release officially sanctioned by the artist and/or their record company. 
Most releases will fit into this category.', + uuid.UUID('4e304316-386d-3409-af2e-78857eec5cfe'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_status(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_status')) + + # release_packaging + data = [(1, u'Jewel Case', None, 0, u'The traditional CD case, made of hard, brittle plastic.', + uuid.UUID('ec27701a-4a22-37f4-bfac-6616e0f9750a')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_packaging(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_packaging')) + + # script + data = [(28, u'Latn', u'215', u'Latin', 4)] + + with db.engine.begin() as connection: + db.import_mb_data.write_script(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'script')) + + # release + data = [(692283, uuid.UUID('a830f892-6be0-35f0-a392-b91d89d89a94'), u'The Masterworks', 847994, 631361, 1, None, 120, 28, u'5028421923901', u'', 0, -1, + datetime.datetime(2018, 5, 13, 11, 0, 22, 832493, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release')) + + # release_gid_redirect (points at release id 692283 written just above) + data = [(uuid.UUID('03c44c5d-cbe5-32b2-af20-376a30fd98a0'), 692283, datetime.datetime(2011, 5, 16, 15, 59, 0, 785958, + tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_gid_redirect(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_gid_redirect')) + + + def test_load_and_write_medium(self): + """Writing and loading data for medium table using values from referenced medium_format, + artist_credit, release_group_primary_type, release_group, language, release_status, release_packaging, + script & release tables. 
+ """ + + # medium_format + data = [(1, u'CD', None, 0, 1982, True, None, uuid.UUID('9712d52a-4509-3d4b-a1a2-67c88c643e31'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_medium_format(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'medium_format')) + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 
718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + # language + data = [(120, u'eng', u'eng', u'en', u'English', 2, None)] + + with db.engine.begin() as connection: + db.import_mb_data.write_language(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'language')) + + # release_status + data = [(1, u'Official', None, 1, u'Any release officially sanctioned by the artist and/or their record company. Most releases will fit into this category.', + uuid.UUID('4e304316-386d-3409-af2e-78857eec5cfe'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_status(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_status')) + + # release_packaging + data = [(1, u'Jewel Case', None, 0, u'The traditional CD case, made of hard, brittle plastic.', + uuid.UUID('ec27701a-4a22-37f4-bfac-6616e0f9750a')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_packaging(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_packaging')) + + # script + data = [(28, u'Latn', u'215', u'Latin', 4)] + + with db.engine.begin() as connection: + db.import_mb_data.write_script(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'script')) + + # release + data = [(692283, uuid.UUID('a830f892-6be0-35f0-a392-b91d89d89a94'), u'The Masterworks', 847994, 631361, 1, None, 120, 
28, u'5028421923901', u'', 0, -1, + datetime.datetime(2018, 5, 13, 11, 0, 22, 832493, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release')) + + # medium + data = [(1089027, 692283, 20, 1, u'Rinaldo, Part 1', 0, datetime.datetime(2011, 10, 24, 21, 0, 13, 19209, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 21)] + + with db.engine.begin() as connection: + db.import_mb_data.write_medium(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'medium')) + + + def test_and_write_track(self): + """Writing and loading data for track table using values from referenced artist_credit, + recording, medium_format, release_group_primary_type, release_group, language, release_status, + release_packaging, script, release & medium tables. + """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + 
db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # recording + data = [(11768371, uuid.UUID('d51cf7fb-97e1-4070-a40b-b03707f91c92'), u'(Rinaldo, HWV 7: Act I. "Combatti da forte" (Almirena)', 73502, 203000, u'', 0, None, False), + (8598260, uuid.UUID('9086b742-358b-4f73-9a14-84cb1a9ce4ce'), u'Love Story', 399541, 235000, u'', 0, None, False) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_recording(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'recording')) + + # medium_format + data = [(1, u'CD', None, 0, 1982, True, None, uuid.UUID('9712d52a-4509-3d4b-a1a2-67c88c643e31'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_medium_format(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'medium_format')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, 
db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + # language + data = [(120, u'eng', u'eng', u'en', u'English', 2, None)] + + with db.engine.begin() as connection: + db.import_mb_data.write_language(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'language')) + + # release_status + data = [(1, u'Official', None, 1, u'Any release officially sanctioned by the artist and/or their record company. Most releases will fit into this category.', + uuid.UUID('4e304316-386d-3409-af2e-78857eec5cfe'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_status(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_status')) + + # release_packaging + data = [(1, u'Jewel Case', None, 0, u'The traditional CD case, made of hard, brittle plastic.', + uuid.UUID('ec27701a-4a22-37f4-bfac-6616e0f9750a')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_packaging(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_packaging')) + + # script + data = [(28, u'Latn', u'215', u'Latin', 4)] + + with db.engine.begin() as connection: + db.import_mb_data.write_script(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'script')) + + # release + data = [(692283, uuid.UUID('a830f892-6be0-35f0-a392-b91d89d89a94'), u'The Masterworks', 847994, 631361, 1, None, 120, 28, u'5028421923901', u'', 0, -1, + datetime.datetime(2018, 5, 13, 11, 0, 22, 832493, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release')) + + # medium + data = [(1089027, 692283, 20, 1, u'Rinaldo, Part 1', 0, datetime.datetime(2011, 10, 24, 
21, 0, 13, 19209, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 21)] + + with db.engine.begin() as connection: + db.import_mb_data.write_medium(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'medium')) + + # track + data = [(11261020, uuid.UUID('e0537cb9-4720-3eb3-a07a-d8a7477519ea'), 11768371, 1089027, 5, u'5', u'Rinaldo, HWV 7: Act I. "Combatti da forte" (Almirena)', + 831440, 203000, 0, datetime.datetime(2013, 7, 13, 11, 0, 38, 285946, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), False)] + + with db.engine.begin() as connection: + db.import_mb_data.write_track(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'track')) + + + def test_load_and_write_track_gid_redirect(self): + """Writing and loading data for track_gid_redirect table using values from referenced artist_credit, + recording, medium_format, release_group_primary_type, release_group, language, release_status, + release_packaging, script, release, medium & track tables. 
+ """ + + # artist_credit + data = [(1418, u'Tangerine Dream', 1, 13729, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (399541, u'Taylor Swift', 1, 3139, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (73502, u'Georg Friedrich Handel', 1, 27041, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (831440, u'George Frideric Handel', 1, 24955, datetime.datetime(2011, 6, 19, 7, 36, 56, 8576, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (847994, u'Handel', 1, 717, datetime.datetime(2011, 8, 11, 19, 43, 1, 279447, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (6747, u'Tampa Red', 1, 1600, datetime.datetime(2011, 5, 16, 16, 32, 11, 963929, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_artist_credit(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'artist_credit')) + + # recording + data = [(11768371, uuid.UUID('d51cf7fb-97e1-4070-a40b-b03707f91c92'), u'(Rinaldo, HWV 7: Act I. 
"Combatti da forte" (Almirena)', 73502, 203000, u'', 0, None, False), + (8598260, uuid.UUID('9086b742-358b-4f73-9a14-84cb1a9ce4ce'), u'Love Story', 399541, 235000, u'', 0, None, False) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_recording(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'recording')) + + # medium_format + data = [(1, u'CD', None, 0, 1982, True, None, uuid.UUID('9712d52a-4509-3d4b-a1a2-67c88c643e31'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_medium_format(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'medium_format')) + + # release_group_primary_type + data = [(1, u'Album', None, 1, None, uuid.UUID('f529b476-6e62-324f-b0aa-1f3e33d313fc'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group_primary_type(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group_primary_type')) + + # release_group + data = [(631361, uuid.UUID('2ec35fc4-6797-3324-b775-9a3df3d4723a'), u'The Masterworks', 73502, 1, u'', 0, + datetime.datetime(2012, 5, 15, 19, 1, 58, 718541, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))), + (617137, uuid.UUID('c834f5ee-d362-3da7-966b-8915a86e808c'), u'1981-08-29: Tangerine Tree Volume 53: Berlin 1981', 1418, 1, u'', 0, + datetime.datetime(2016, 9, 21, 23, 0, 26, 94608, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_group(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_group')) + + # language + data = [(120, u'eng', u'eng', u'en', u'English', 2, None)] + + with db.engine.begin() as connection: + db.import_mb_data.write_language(connection, data) + self.assertEqual(data, 
db.import_mb_data.load_musicbrainz_schema_data(connection, 'language')) + + # release_status + data = [(1, u'Official', None, 1, u'Any release officially sanctioned by the artist and/or their record company. Most releases will fit into this category.', + uuid.UUID('4e304316-386d-3409-af2e-78857eec5cfe'))] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_status(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_status')) + + # release_packaging + data = [(1, u'Jewel Case', None, 0, u'The traditional CD case, made of hard, brittle plastic.', + uuid.UUID('ec27701a-4a22-37f4-bfac-6616e0f9750a')) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release_packaging(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release_packaging')) + + # script + data = [(28, u'Latn', u'215', u'Latin', 4)] + + with db.engine.begin() as connection: + db.import_mb_data.write_script(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'script')) + + # release + data = [(692283, uuid.UUID('a830f892-6be0-35f0-a392-b91d89d89a94'), u'The Masterworks', 847994, 631361, 1, None, 120, 28, u'5028421923901', u'', 0, -1, + datetime.datetime(2018, 5, 13, 11, 0, 22, 832493, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))) + ] + + with db.engine.begin() as connection: + db.import_mb_data.write_release(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'release')) + + # medium + data = [(1089027, 692283, 20, 1, u'Rinaldo, Part 1', 0, datetime.datetime(2011, 10, 24, 21, 0, 13, 19209, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 21)] + + with db.engine.begin() as connection: + db.import_mb_data.write_medium(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'medium')) 
+ + # track + data = [(11261020, uuid.UUID('e0537cb9-4720-3eb3-a07a-d8a7477519ea'), 11768371, 1089027, 5, u'5', u'Rinaldo, HWV 7: Act I. "Combatti da forte" (Almirena)', + 831440, 203000, 0, datetime.datetime(2013, 7, 13, 11, 0, 38, 285946, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), False)] + + with db.engine.begin() as connection: + db.import_mb_data.write_track(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'track')) + + # track_gid_redirect + data = [(uuid.UUID('67a0d0cd-fd61-328d-80a2-ca888c5fd15c'), 11261020, datetime.datetime(2014, 10, 15, 0, 0, 9, 772435, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)))] + + with db.engine.begin() as connection: + db.import_mb_data.write_track_gid_redirect(connection, data) + self.assertEqual(data, db.import_mb_data.load_musicbrainz_schema_data(connection, 'track_gid_redirect')) diff --git a/db/testing.py b/db/testing.py new file mode 100644 index 000000000..3bfd009ff --- /dev/null +++ b/db/testing.py @@ -0,0 +1,133 @@ +import db +import db.data +import json +import os +import random +from db import gid_types + +from webserver import create_app + +from flask_testing import TestCase + + +ADMIN_SQL_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'admin', 'sql') +TEST_DATA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_data') + + +class DatabaseTestCase(TestCase): + + @staticmethod + def create_app(): + return create_app() + + def setUp(self): + self.reset_db() + + def tearDown(self): + pass + + def reset_db(self): + self.drop_tables() + self.drop_types() + self.init_db() + + def init_db(self): + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_types.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_tables.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_primary_keys.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_foreign_keys.sql')) + 
db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_indexes.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_schema.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_tables.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_primary_keys.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_foreign_keys.sql')) + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_indexes.sql')) + + def drop_tables(self): + with db.engine.connect() as connection: + # TODO(roman): See if there's a better way to drop all tables. + connection.execute('DROP TABLE IF EXISTS highlevel_model CASCADE;') + connection.execute('DROP TABLE IF EXISTS highlevel_meta CASCADE;') + connection.execute('DROP TABLE IF EXISTS highlevel CASCADE;') + connection.execute('DROP TABLE IF EXISTS model CASCADE;') + connection.execute('DROP TABLE IF EXISTS lowlevel_json CASCADE;') + connection.execute('DROP TABLE IF EXISTS lowlevel CASCADE;') + connection.execute('DROP TABLE IF EXISTS version CASCADE;') + connection.execute('DROP TABLE IF EXISTS statistics CASCADE;') + connection.execute('DROP TABLE IF EXISTS incremental_dumps CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset_snapshot CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset_eval_jobs CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset_class_member CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset_class CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset_eval_sets CASCADE;') + connection.execute('DROP TABLE IF EXISTS "user" CASCADE;') + connection.execute('DROP TABLE IF EXISTS api_key CASCADE;') + connection.execute('DROP TABLE IF EXISTS challenge CASCADE;') + connection.execute('DROP TABLE IF EXISTS dataset_eval_challenge CASCADE;') + connection.execute('DROP TABLE IF EXISTS feedback CASCADE;') + connection.execute('DROP TABLE 
IF EXISTS musicbrainz.artist CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.artist_credit CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.artist_credit_name CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.artist_gid_redirect CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.area CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.area_type CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.recording CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.recording_gid_redirect CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release_gid_redirect CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.track CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.track_gid_redirect CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release_group CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release_group_gid_redirect CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.medium CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.medium_format CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release_group_primary_type CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release_status CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.release_packaging CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.language CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.script CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.gender CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.artist_type CASCADE;') + connection.execute('DROP TABLE IF EXISTS musicbrainz.replication_control CASCADE;') + + def drop_types(self): + with db.engine.connect() as connection: + connection.execute('DROP TYPE IF EXISTS 
eval_job_status CASCADE;') + connection.execute('DROP TYPE IF EXISTS model_status CASCADE;') + connection.execute('DROP TYPE IF EXISTS version_type CASCADE;') + connection.execute('DROP TYPE IF EXISTS eval_location_type CASCADE;') + connection.execute('DROP TYPE IF EXISTS gid_type CASCADE;') + + def data_filename(self, mbid): + """ Get the expected filename of a test datafile given its mbid """ + return os.path.join(TEST_DATA_PATH, mbid + '.json') + + def load_low_level_data(self, mbid): + """Loads low-level data from JSON file in `test_data` directory into + the database. + """ + with open(self.data_filename(mbid)) as json_file: + db.data.submit_low_level_data(mbid, json.loads(json_file.read()), gid_types.GID_TYPE_MBID) + + def submit_fake_low_level_data(self, mbid): + """Generate a minimal dataset to be submitted in tests for a given + MBID. Several calls to this function generate distinct entries by using + a random value for the 'average_loudness' field""" + db.data.submit_low_level_data( + mbid, + {"lowlevel": {"average_loudness": random.random()}, + "metadata": {"audio_properties": {"length": None, + "bit_rate": None, + "codec": None, + "lossless": True}, + "tags": {"file_name": "fake", + "musicbrainz_recordingid": [mbid]}, + "version": {"essentia": None, + "essentia_build_sha": "", + "essentia_git_sha": None, + "extractor": None}}, + "rhythm": {}, + "tonal": {} + }, + gid_types.GID_TYPE_MBID) diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index c900ff68c..5b5368417 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -35,10 +35,21 @@ services: depends_on: - db - redis + - musicbrainz_db redis: image: redis:4.0-alpine + musicbrainz_db: + image: metabrainz/musicbrainz-test-database:schema-change-2017-q2 + volumes: + - ../data/mbdata:/var/lib/postgresql/data/pgdata + environment: + PGDATA: /var/lib/postgresql/data/pgdata + MB_IMPORT_DUMPS: "true" + ports: + - "5430:5432" + hl_extractor: build: context: .. 
@@ -62,3 +73,15 @@ services: - ../data/files:/data/files depends_on: - db + + musicbrainz_importer: + build: + context: .. + dockerfile: ./docker/Dockerfile.gaia + command: python2 worker_manage.py musicbrainz_importer + volumes: + - ../:/code + - ../data/app:/data + depends_on: + - db + - musicbrainz_db diff --git a/manage.py b/manage.py index 2f38a0980..ca7513332 100644 --- a/manage.py +++ b/manage.py @@ -16,7 +16,14 @@ import db.exceptions import db.stats import db.user +import db.import_mb_data import webserver +from brainzutils import musicbrainz_db +from db.testing import DatabaseTestCase +import musicbrainz_importer.apply_replication_changes + +import webserver.external.get_entities +import webserver.external.evaluate_mbdatabase_access ADMIN_SQL_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'admin', 'sql') @@ -87,8 +94,47 @@ def init_db(archive, force, skip_create_db=False): current_app.logger.info("Done!") -@cli.command(name='import_data') +@cli.command(name='init_mb_db') @click.option("--drop-constraints", "-d", is_flag=True, help="Drop primary and foreign keys before importing.") +@click.option("--force", "-f", is_flag=True, help="Drop existing MusicBrainz schema and tables.") +def init_mb_db(drop_constraints, force): + """Initialize the MusicBrainz database. + + This process involves several steps: + 1. MusicBrainz schema is created. + 2. MusicBrainz Table structure is created. + 3. Primary keys and foreign keys are created. + 4. Indexes are created. + """ + + musicbrainz_db.init_db_engine(current_app.config['MB_DATABASE_URI']) + + if force: + print('Dropping MusicBrainz schema...') + res = db.run_sql_script_without_transaction(os.path.join(ADMIN_SQL_DIR, 'drop_musicbrainz_schema.sql')) + if not res: + raise Exception('Failed to drop existing musicbrainz schema and tables! 
Exit code: %i' % res) + + print('Creating MusicBrainz schema...') + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_schema.sql')) + + print('Creating MusicBrainz tables...') + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_tables.sql')) + + print('Creating MusicBrainz primary keys...') + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_primary_keys.sql')) + + if not drop_constraints: + print('Creating MusicBrainz foreign keys...') + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_foreign_keys.sql')) + + print('Creating MusicBrainz indexes...') + db.run_sql_script(os.path.join(ADMIN_SQL_DIR, 'create_musicbrainz_indexes.sql')) + + print("Done!") + + +@cli.command() @click.argument("archive", type=click.Path(exists=True)) def import_data(archive, drop_constraints=False): """Imports data dump into the database.""" @@ -254,6 +300,68 @@ def set_rate_limits(per_ip, window_size): current_app.logger.info("Window size (s): %s" % window_size) +@cli.command() +def import_musicbrainz_db(): + print("\nImporting MusicBrainz data...") + db.import_mb_data.start_import() + + +@cli.command() +def get_entities(): + print('Redirecting mbids to original entities...') + webserver.external.get_entities.main() + + +@cli.command() +def apply_replication_changes(): + print("\nUpdating musicbrainz schema by applying replication packets...") + musicbrainz_importer.apply_replication_changes.main() + + +@cli.command(help="Time imported data from AB first, then time data by directly accessing AB and MB") +def evaluate_access_methods(): + print('Evaluating both MusicBrainz database access methods...') + webserver.external.evaluate_mbdatabase_access.get_AB_and_MB_imported() + webserver.external.evaluate_mbdatabase_access.get_AB_and_MB_direct() + + +@cli.command(help="Time imported data from AB") +def evaluate_import(): + webserver.external.evaluate_mbdatabase_access.get_AB_and_MB_imported() + + +@cli.command(help="Time data by 
directly accessing AB and MB") +def evaluate_direct(): + webserver.external.evaluate_mbdatabase_access.get_AB_and_MB_direct() + + +@cli.command(help="Time data by directly accessing but only AB") +def evaluate_direct_AB_only(): + webserver.external.evaluate_mbdatabase_access.get_AB_only_direct() + + +@cli.command(help="Time data by importing but using exists clause") +def evaluate_import_exists(): + webserver.external.evaluate_mbdatabase_access.get_AB_only_direct() + +@cli.command(help="Time imported data from AB using given dataset") +@click.argument("dataset", required=True) +def evaluate_import_dataset(dataset): + webserver.external.evaluate_mbdatabase_access.get_AB_and_MB_imported_from_dataset(dataset) + + +@cli.command(help="Time data by directly accessing AB and MB using given dataset") +@click.argument("dataset", required=True) +def evaluate_direct_dataset(dataset): + webserver.external.evaluate_mbdatabase_access.get_AB_and_MB_direct_from_dataset(dataset) + + +@cli.command(help="Time data by directly accessing but only AB using given dataset") +@click.argument("dataset", required=True) +def evaluate_direct_AB_only_dataset(dataset): + webserver.external.evaluate_mbdatabase_access.get_AB_only_direct_from_dataset(dataset) + + # Please keep additional sets of commands down there cli.add_command(db.dump_manage.cli, name="dump") diff --git a/musicbrainz_importer/__init__.py b/musicbrainz_importer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/musicbrainz_importer/apply_replication_changes.py b/musicbrainz_importer/apply_replication_changes.py new file mode 100644 index 000000000..f7f4d8c09 --- /dev/null +++ b/musicbrainz_importer/apply_replication_changes.py @@ -0,0 +1,364 @@ +""" +The MIT License for apply_replication_changes script + +Copyright (c) 2018 Rashi Sah +Copyright (c) 2018 Lukas Lalinsky + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the 
"Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from __future__ import print_function +import tarfile +import os +import re +import urllib2 +import shutil +import tempfile +from flask import current_app +import db +from brainzutils import musicbrainz_db +from sqlalchemy import text +from sqlalchemy.exc import IntegrityError +import db.import_mb_data +import db.data + +include_tables = ['language', 'artist_credit_name', 'artist', 'artist_gid_redirect', 'area', 'area_type', 'recording_gid_redirect', \ + 'script', 'release_gid_redirect', 'recording', 'track', 'artist_credit', 'release_group_primary_type', 'release_group', \ + 'release_group_gid_redirect', 'release', 'medium', 'medium_format', 'release_status', 'release_packaging', 'gender', \ + 'artist_type'] + +ESCAPES = (('\\b', '\b'), ('\\f', '\f'), ('\\n', '\n'), ('\\r', '\r'), + ('\\t', '\t'), ('\\v', '\v'), ('\\\\', '\\')) + +def parse_name(table): + """Store schema name and table name separately in different variables. + + Args: + table: A combined schema and table name of the form - schema.table + + Returns: + separate schema and table names. + """ + if '.' 
def parse_name(table):
    """Split a table reference from a replication packet into schema and table.

    Args:
        table: A table name, optionally schema-qualified and quoted, e.g.
            ``"musicbrainz"."artist"`` or ``artist``.

    Returns:
        A ``(schema, table)`` tuple. Any schema prefix is dropped and the
        schema is always reported as ``'musicbrainz'``; surrounding double
        quotes are stripped from the table name.
    """
    if '.' in table:
        # NOTE(review): the prefix is discarded, so tables of every schema are
        # reported as 'musicbrainz' -- confirm that this is intended.
        _, table = table.split('.', 1)
    return 'musicbrainz', table.strip('"')


# One `"column"='value' ` pair of a dbmirror_pendingdata row; the value part
# is absent when the column is NULL.
_FIELD_RE = re.compile(r'''"([^"]+)"=('(?:''|[^'])*')? ''')

# Escape sequences of the text dump format, keyed by the character that
# follows the backslash. Mirrors the module-level ESCAPES pairs.
_UNESCAPE_CHARS = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r',
                   't': '\t', 'v': '\v', '\\': '\\'}
_UNESCAPE_RE = re.compile(r'\\([bfnrtv\\])')


def parse_data_fields(s):
    """Parse the column/value pairs of one dbmirror_pendingdata row.

    Args:
        s: Text of the form ``"col"='value' "other"= `` (every pair is
            followed by a space).

    Returns:
        Dictionary mapping column names to values. Values lose their
        surrounding quotes, doubled single quotes and doubled backslashes are
        collapsed, and a pair without a value maps to None.
    """
    fields = {}
    for name, value in _FIELD_RE.findall(s):
        if not value:
            fields[name] = None
        else:
            fields[name] = value[1:-1].replace("''", "'").replace("\\\\", "\\")
    return fields


def parse_bool(s):
    """Return True when ``s`` is the text dump representation of true ('t')."""
    return s == 't'


def unescape(s):
    """Decode the backslash escapes of one text dump field.

    The field is scanned once from left to right, so an escaped backslash
    followed by a letter (``\\\\n``) stays a backslash plus that letter instead
    of being misread as a newline escape.

    Args:
        s: Raw field text.

    Returns:
        The decoded string, or None when the field is the NULL marker ``\\N``.
    """
    if s == '\\N':
        return None
    return _UNESCAPE_RE.sub(lambda match: _UNESCAPE_CHARS[match.group(1)], s)


def read_psql_dump(fp, types):
    """Iterate over the rows of a tab-separated dump file.

    Args:
        fp: Iterable of text lines (a member of the replication packet).
        types: Sequence of callables, one per column, used to convert every
            non-NULL field.

    Yields:
        One list of converted values per line; NULL fields are None.
    """
    for line in fp:
        # A real list is required because the values are replaced by index.
        values = [unescape(field) for field in line.rstrip('\r\n').split('\t')]
        for i, value in enumerate(values):
            if value is not None:
                values[i] = types[i](value)
        yield values


def _error_message(exc):
    """Return the text of an exception on both Python 2 and Python 3."""
    return getattr(exc, 'message', None) or str(exc)


def get_table_and_data(message):
    """Extract the key column and value named in an IntegrityError message.

    The message is expected to contain a fragment such as
    ``Key (artist_credit)=(42)``.

    Args:
        message: SQLAlchemy integrity error message.

    Returns:
        A ``(column, data)`` tuple of strings with the parentheses removed.

    Raises:
        ValueError: If the message contains no ``Key`` word.
    """
    words = message.split(' ')
    key_spec = words[words.index('Key') + 1]
    column, data = key_spec.split('=', 1)
    return column.strip('()'), data.strip('()')


def insert_new_row(table, data, main_connection, main_transaction, sql, params, todo_list=None):
    """Insert the row whose absence made an update fail, then retry the update.

    Rows are kept on a stack: when inserting a row fails with another
    IntegrityError, the row named in that error is pushed and inserted first.
    Once the stack is empty the original statement is executed again through
    update_row().

    Args:
        table: name of the table in which the data is to be inserted.
        data: values to be inserted.
        main_connection: sql connection to write into the database.
        main_transaction: transaction for every write operation.
        sql: insert query.
        params: values for the query.
        todo_list: a list of tuples of type (table, data) still waiting to be
            inserted; the list is modified in place.

    Raises:
        IntegrityError: If inserting a row fails because of a row that is
            already waiting on the stack (retrying would never finish).
    """
    pending = [] if todo_list is None else todo_list
    if (table, data) not in pending:
        pending.append((table, data))

    while pending:
        table, data = pending[-1]
        table_name, columns, values = db.import_mb_data.get_data_from_musicbrainz(table, data)
        with db.engine.connect() as conn:
            trans = conn.begin()
            try:
                db.import_mb_data.insert_data_into_musicbrainz_schema(conn, trans, table_name, columns, values)
            except IntegrityError as e:
                trans.rollback()
                dependency = get_table_and_data(_error_message(e))
                if dependency in pending:
                    # Pushing the same row again would loop forever.
                    raise
                pending.append(dependency)
            else:
                pending.pop()

    # update_row() rolls its transaction back before calling this function,
    # so the retry needs a live transaction to commit.
    if not main_transaction.is_active:
        main_transaction = main_connection.begin()
    update_row(sql, params, main_connection, main_transaction)


def update_row(sql, params, main_connection, main_transaction):
    """Execute one statement of a replication packet and commit it.

    When the statement violates a constraint, the transaction is rolled back
    and insert_new_row() is asked to insert the row named in the error before
    the statement is retried.

    Args:
        sql: update query.
        params: parameter values for the query.
        main_connection: sql connection to write into the database.
        main_transaction: transaction for every write operation.
    """
    try:
        main_connection.execute(sql, params)
        main_transaction.commit()
    except IntegrityError as e:
        main_transaction.rollback()
        table, data = get_table_and_data(_error_message(e))
        insert_new_row(table, data, main_connection, main_transaction, sql, params)


class PacketImporter(object):
    """Collects the pending changes of one replication packet and applies
    the updates and deletions to the database.
    """

    def __init__(self, replication_seq):
        """Create an empty importer for the packet numbered ``replication_seq``."""
        # (row id, is_key) -> {column: value}
        self._data = {}
        # transaction id -> [(row id, schema, table, operation), ...]
        self._transactions = {}
        self._replication_seq = replication_seq

    def load_pending_data(self, fp):
        """Load id, key flag and values from a dbmirror_pendingdata file
        and store them in the data dictionary.

        Args:
            fp: iterable of lines of the pendingdata member of the packet.
        """
        dump = read_psql_dump(fp, [int, parse_bool, parse_data_fields])
        for row_id, is_key, values in dump:
            self._data[(row_id, is_key)] = values

    def load_pending(self, fp):
        """Load the changed tables from a dbmirror_pending file and group
        them by transaction id.

        Args:
            fp: iterable of lines of the pending member of the packet.
        """
        dump = read_psql_dump(fp, [int, str, str, int])
        for row_id, table, operation, xid in dump:
            schema, table = parse_name(table)
            self._transactions.setdefault(xid, []).append((row_id, schema, table, operation))

    def process(self):
        """Apply the loaded updates ('u') and deletions ('d') to the tables
        listed in include_tables; every other change is skipped.
        """
        with db.engine.connect() as connection:
            stats = {}
            for xid in sorted(self._transactions):
                print('Running transaction ' + str(xid) + '...')
                for row_id, schema, table, operation in sorted(self._transactions[xid]):
                    if schema != 'musicbrainz' or table not in include_tables:
                        print('Skipping changes, ' + table + ' table not found in the database')
                        continue

                    fulltable = '%s.%s' % (schema, table)
                    counters = stats.setdefault(fulltable, {'d': 0, 'u': 0})
                    if operation not in ('d', 'u'):
                        continue
                    counters[operation] += 1

                    keys = self._data.get((row_id, True), {})
                    values = self._data.get((row_id, False), {})
                    if not (keys or values):
                        continue

                    # Table and column names cannot be bound as parameters,
                    # so they are interpolated; all values are passed as
                    # parameters.
                    if operation == 'd':
                        sql = 'DELETE FROM %s' % (fulltable,)
                        params = []
                    else:
                        sql_values = ', '.join('%s=%%s' % column for column in values)
                        sql = 'UPDATE %s SET %s' % (fulltable, sql_values)
                        params = list(values.values())
                    sql += ' WHERE ' + ' AND '.join(
                        '%s%s%%s' % (column, ' IS ' if keys[column] is None else '=')
                        for column in keys)
                    params.extend(keys.values())

                    # Begin the transaction only when a statement is really
                    # executed, so that no transaction is left open for
                    # skipped rows.
                    trans = connection.begin()
                    if operation == 'd':
                        try:
                            connection.execute(sql, params)
                            trans.commit()
                            print('Deleted rows from ' + table + ' table')
                        except IntegrityError as e:
                            trans.rollback()
                            print('Could not delete rows from ' + table + ' table: ' + _error_message(e))
                    else:
                        update_row(sql, params, connection, trans)
                        print('Updated rows in ' + table + ' table')


def process_tar(fileobj, expected_schema_seq, replication_seq):
    """Load a compressed replication packet and apply it to the database.

    The pending and pendingdata members are loaded into a PacketImporter,
    whose process() method is then called.

    Args:
        fileobj: tar.bz2 file object of the replication packet.
        expected_schema_seq: The expected schema sequence that should be matched with the
            one listed in replication packets.
        replication_seq: The number of the replication packet.

    Raises:
        Exception: If the SCHEMA_SEQUENCE member differs from
            ``expected_schema_seq``.
    """
    print("Processing", fileobj.name)
    importer = PacketImporter(replication_seq)
    # The context manager closes the archive even when loading fails.
    with tarfile.open(fileobj=fileobj, mode='r:bz2') as tar:
        for member in tar:
            if member.name == 'SCHEMA_SEQUENCE':
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != expected_schema_seq:
                    raise Exception("Mismatched schema sequence, %d (database) vs %d (replication packet)" % (expected_schema_seq, schema_seq))
            elif member.name == 'TIMESTAMP':
                ts = tar.extractfile(member).read().strip()
                print(' - Packet was produced at', ts)
            elif member.name in ('mbdump/Pending', 'mbdump/dbmirror_pending'):
                importer.load_pending(tar.extractfile(member))
            elif member.name in ('mbdump/PendingData', 'mbdump/dbmirror_pendingdata'):
                importer.load_pending_data(tar.extractfile(member))
        importer.process()


def download_packet(base_url, token, replication_seq):
    """Download the replication packet for the specified replication sequence
    into a temporary tar.bz2 file.

    Args:
        base_url: The URL to download the replication packets from.
        token: An access token to allow download of the packets from MetaBrainz
            website. For more information, visit - https://metabrainz.org/api/
        replication_seq: The number of the replication packet.

    Returns:
        A temporary file positioned at its start, or None when the server
        answers 404 for the packet.
    """
    url = base_url.rstrip("/") + "/replication-%d.tar.bz2" % replication_seq
    # Print before the token is appended so that the secret is not logged.
    print("Downloading", url)
    if token:
        url += '?token=' + token
    try:
        data = urllib2.urlopen(url, timeout=60)
    except urllib2.HTTPError as e:
        if e.code == 404:
            return None
        raise
    try:
        tmp = tempfile.NamedTemporaryFile(suffix='.tar.bz2')
        try:
            shutil.copyfileobj(data, tmp)
            tmp.seek(0)
        except Exception:
            # Do not leave the temporary file behind on a failed download.
            tmp.close()
            raise
    finally:
        data.close()
    return tmp


def main():
    """Apply every replication packet that follows the stored replication
    sequence, stopping at the first packet that cannot be downloaded.
    """
    base_url = current_app.config['REPLICATION_PACKETS_URL']
    token = current_app.config.get('ACCESS_TOKEN') or None

    schema_seq, mb_replication_seq = db.data.get_current_schema_and_replication_sequence()
    ab_replication_seq = db.data.get_replication_sequence_from_mb_schema()

    if ab_replication_seq is None or ab_replication_seq < mb_replication_seq:
        replication_seq = mb_replication_seq
        db.data.write_replication_control(replication_seq)
    else:
        replication_seq = ab_replication_seq

    while True:
        replication_seq += 1
        print("Replication Sequence:", replication_seq)
        tmp = download_packet(base_url, token, replication_seq)
        if tmp is None:
            print('Not found, stopping')
            break
        try:
            process_tar(tmp, schema_seq, replication_seq)
        finally:
            tmp.close()
        db.data.update_replication_sequence(replication_seq)
    print('Done applying all the replication packets till last hour')
def main():
    """Run the MusicBrainz importer loop.

    Repeatedly asks db.data.get_new_recordings_from_lowlevel() for recordings
    and passes them to db.import_mb_data.fetch_and_insert_musicbrainz_data().
    After a batch the loop sleeps for the configured BATCH_SLEEP_DURATION;
    when nothing was returned it sleeps for the configured SLEEP_DURATION.
    The loop never returns.
    """
    logging.info("musicbrainz importer started")
    while True:
        new_gids = db.data.get_new_recordings_from_lowlevel()

        if not new_gids:
            # Nothing to import: wait before polling again.
            idle_seconds = current_app.config['SLEEP_DURATION']
            logging.info("No new recording found. Sleeping %s seconds." % idle_seconds)
            time.sleep(idle_seconds)
            continue

        logging.info("Importing MusicBrainz data...")
        logging.info('Inserting data for %d recordings...' % (len(new_gids)))
        db.import_mb_data.fetch_and_insert_musicbrainz_data(new_gids)

        pause_seconds = current_app.config['BATCH_SLEEP_DURATION']
        logging.info("Sleeping %s seconds before starting next batch's import." % pause_seconds)
        time.sleep(pause_seconds)
# --- webserver/external/evaluate_mbdatabase_access.py ---


def get_AB_and_MB_imported():
    """Time db.data.load_lowlevel_and_recording_data() and log the duration."""
    logging.info("Querying directly from AcousticBrainz database for import MB database method...")
    start_time = time.time()

    db.data.load_lowlevel_and_recording_data()

    first_time_taken = time.time() - start_time
    logging.info('Data imported from AcousticBrainz database in %.2f seconds.' % first_time_taken)


def get_AB_and_MB_direct():
    """Time loading the lowlevel rows and then fetching the recordings for
    their distinct gids with db.data.load_recording_data_from_MB_db().
    """
    logging.info("Separate queries from AcousticBrainz and MusicBrainz databases over the direct connection...")
    start_time = time.time()

    lowlevel_rows = db.data.load_lowlevel_data()
    # The set comprehension removes duplicate gids.
    gids = list({row['gid'] for row in lowlevel_rows})
    db.data.load_recording_data_from_MB_db(gids)

    second_time_taken = time.time() - start_time
    logging.info('Data imported from direct connection to MusicBrainz database in %.2f seconds.' % second_time_taken)


def get_AB_only_direct():
    """Time loading the lowlevel rows and then fetching the recordings for
    their distinct gids with db.data.load_recording_data_from_AB_db().
    """
    # NOTE(review): both log messages mention the MusicBrainz direct
    # connection although the recordings are read with
    # load_recording_data_from_AB_db -- confirm the wording.
    logging.info("Separate queries from AcousticBrainz and MusicBrainz databases over the direct connection...")
    start_time = time.time()

    lowlevel_rows = db.data.load_lowlevel_data()
    gids = list({row['gid'] for row in lowlevel_rows})
    db.data.load_recording_data_from_AB_db(gids)

    second_time_taken = time.time() - start_time
    logging.info('Data imported from direct connection to MusicBrainz database in %.2f seconds.' % second_time_taken)


def get_AB_and_MB_imported_with_exists():
    """Time db.data.load_lowlevel_and_recording_data_using_exists() and log
    the duration.
    """
    # NOTE(review): the closing log message mentions the MusicBrainz direct
    # connection while the opening one describes the import method --
    # confirm the wording.
    logging.info("Querying directly from AcousticBrainz database for import MB database method using EXISTS clause...")
    start_time = time.time()

    db.data.load_lowlevel_and_recording_data_using_exists()

    second_time_taken = time.time() - start_time
    logging.info('Data imported from direct connection to MusicBrainz database in %.2f seconds.' % second_time_taken)


def get_AB_and_MB_imported_from_dataset(dataset):
    """Time db.data.load_lowlevel_data_from_dataset(dataset) and log the duration.

    Args:
        dataset: value passed through to the db.data function.
    """
    logging.info("Querying directly from AcousticBrainz database for import MB database method...")
    start_time = time.time()

    db.data.load_lowlevel_data_from_dataset(dataset)

    first_time_taken = time.time() - start_time
    logging.info('Data imported from AcousticBrainz database in %.2f seconds.' % first_time_taken)


def get_AB_and_MB_direct_from_dataset(dataset):
    """Time collecting the distinct mbids of a dataset and fetching their
    recordings with db.data.load_recording_data_from_MB_db().

    Args:
        dataset: value passed to db.data.get_all_recordings_in_dataset().
    """
    logging.info("Separate queries from AcousticBrainz and MusicBrainz databases over the direct connection...")
    start_time = time.time()

    dataset_rows = db.data.get_all_recordings_in_dataset(dataset)
    mbids = list({row['mbid'] for row in dataset_rows})
    db.data.load_recording_data_from_MB_db(mbids)

    second_time_taken = time.time() - start_time
    logging.info('Data imported from direct connection to MusicBrainz database in %.2f seconds.' % second_time_taken)


def get_AB_only_direct_from_dataset(dataset):
    """Time collecting the distinct mbids of a dataset and fetching their
    recordings with db.data.load_recording_data_from_AB_db().

    Args:
        dataset: value passed to db.data.get_all_recordings_in_dataset().
    """
    # NOTE(review): both log messages mention the MusicBrainz direct
    # connection although the recordings are read with
    # load_recording_data_from_AB_db -- confirm the wording.
    logging.info("Separate queries from AcousticBrainz and MusicBrainz databases over the direct connection...")
    start_time = time.time()

    dataset_rows = db.data.get_all_recordings_in_dataset(dataset)
    mbids = list({row['mbid'] for row in dataset_rows})
    db.data.load_recording_data_from_AB_db(mbids)

    second_time_taken = time.time() - start_time
    logging.info('Data imported from direct connection to MusicBrainz database in %.2f seconds.' % second_time_taken)


# --- webserver/external/get_entities.py ---


def get_original_entity(database):
    """Get original entity information after applying MBID redirect
    to many mbids.

    Args:
        database (str): 'MB' to read the mbids with
            db.data.get_mbids_from_gid_redirect_tables_from_MB_db(); any other
            value reads them with db.data.get_mbids_from_gid_redirect_tables().
    Returns:
        Dictionary containing the redirected original entity ids with MBIDs as keys.
            - mbid: Recording mbids of the entities
            - id: Original redirected ids of the entities after mbid redirect
    """
    if database == 'MB':
        mbids = db.data.get_mbids_from_gid_redirect_tables_from_MB_db()
    else:
        mbids = db.data.get_mbids_from_gid_redirect_tables()

    with mb_session() as mb_db:
        recordings = get_entities_by_gids(
            query=mb_db.query(Recording),
            entity_type='recording',
            mbids=mbids,
        )
        # Built while the session is still open.
        return {gid: recording.id for gid, recording in recordings.items()}


def main():
    """Time get_original_entity() for the 'AB' and for the 'MB' source and
    log both durations.
    """
    # Testing with the MusicBrainz schema in AB
    start_time = time.time()
    get_original_entity('AB')
    first_time_taken = time.time() - start_time
    logging.info('Data imported from AcousticBrainz database in %.2f seconds.' % first_time_taken)

    # Testing with the original MusicBrainz database over the direct
    # connection: this run must select the 'MB' branch, otherwise the 'AB'
    # path is simply measured twice.
    start_time = time.time()
    get_original_entity('MB')
    second_time_taken = time.time() - start_time
    logging.info('Data imported from direct connection to MusicBrainz database in %.2f seconds.' % second_time_taken)