Skip to content

Commit 9a21925

Browse files
authored
GH-47650: [Archery][Integration] Add option to generate gold files (#47651)
### Rationale for this change Generating "gold" IPC files for integration/regression testing between Arrow implementations is currently a tedious (and undocumented) manual process. ### What changes are included in this PR? Add an option to `archery integration` to automate generation of gold files in the right format, using the Arrow implementation selected on the command line. ### Are these changes tested? Only manually. ### Are there any user-facing changes? No. * GitHub Issue: #47650 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 6a0ea6a commit 9a21925

File tree

4 files changed

+70
-24
lines changed

4 files changed

+70
-24
lines changed

dev/archery/archery/cli.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,8 @@ def _set_default(opt, default):
693693
envvar="ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS")
694694
@click.option('--write-generated-json', default="",
695695
help='Generate test JSON to indicated path')
696+
@click.option('--write-gold-files', default="",
697+
help='Generate gold files to indicated path')
696698
@click.option('--run-ipc', is_flag=True, default=False,
697699
help='Run IPC integration tests')
698700
@click.option('--run-flight', is_flag=True, default=False,
@@ -715,7 +717,7 @@ def _set_default(opt, default):
715717
help=("Substring for test names to include in run, "
716718
"e.g. -k primitive"))
717719
def integration(with_all=False, random_seed=12345, write_generated_json="",
718-
**args):
720+
write_gold_files="", **args):
719721
"""If you don't specify the "--target-implementations" option nor
720722
the "ARCHERY_INTEGRATION_TARGET_IMPLEMENTATIONS" environment
721723
variable, test patterns are product of all specified
@@ -774,8 +776,9 @@ def integration(with_all=False, random_seed=12345, write_generated_json="",
774776
775777
"""
776778

777-
from .integration.datagen import get_generated_json_files
778-
from .integration.runner import run_all_tests
779+
from .integration.datagen import (
780+
get_generated_json_files, generate_gold_files)
781+
from .integration.runner import run_all_tests, select_testers
779782
import numpy as np
780783

781784
# FIXME(bkietz) Include help strings for individual testers.
@@ -784,33 +787,32 @@ def integration(with_all=False, random_seed=12345, write_generated_json="",
784787
# Make runs involving data generation deterministic
785788
np.random.seed(random_seed)
786789

787-
implementations = ['cpp', 'dotnet', 'java', 'js', 'go', 'nanoarrow', 'rust']
788790
formats = ['ipc', 'flight', 'c_data']
789791

790-
enabled_implementations = 0
791-
for lang in implementations:
792-
param = f'with_{lang}'
793-
if with_all:
794-
args[param] = with_all
795-
enabled_implementations += args[param]
796-
797792
enabled_formats = 0
798793
for fmt in formats:
799794
param = f'run_{fmt}'
800795
enabled_formats += args[param]
801796

797+
testers, other_testers = select_testers(**args)
798+
802799
if write_generated_json:
803800
os.makedirs(write_generated_json, exist_ok=True)
804801
get_generated_json_files(tempdir=write_generated_json)
802+
elif write_gold_files:
803+
if len(testers) != 1 or len(other_testers) != 0:
804+
raise click.UsageError(
805+
"Need exactly one implementation to generate gold files; try --help")
806+
generate_gold_files(testers[0], write_gold_files)
805807
else:
806808
if enabled_formats == 0:
807809
raise click.UsageError(
808810
"Need to enable at least one format to test "
809811
"(IPC, Flight, C Data Interface); try --help")
810-
if enabled_implementations == 0:
812+
if len(testers) == 0:
811813
raise click.UsageError(
812814
"Need to enable at least one implementation to test; try --help")
813-
run_all_tests(**args)
815+
run_all_tests(testers, other_testers, **args)
814816

815817

816818
@archery.command()

dev/archery/archery/integration/datagen.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717

1818
from collections import namedtuple, OrderedDict
1919
import binascii
20+
import gzip
2021
import json
2122
import os
2223
import random
2324
import tempfile
25+
import shutil
2426

2527
import numpy as np
2628

@@ -1885,9 +1887,6 @@ def generate_extension_case():
18851887
def get_generated_json_files(tempdir=None):
18861888
tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-')
18871889

1888-
def _temp_path():
1889-
return
1890-
18911890
file_objs = [
18921891
generate_primitive_case([], name='primitive_no_batches'),
18931892
generate_primitive_case([17, 20], name='primitive'),
@@ -2003,3 +2002,25 @@ def _temp_path():
20032002
generated_paths.append(file_obj)
20042003

20052004
return generated_paths
2005+
2006+
2007+
def generate_gold_files(tester, gold_dir):
2008+
os.makedirs(gold_dir, exist_ok=True)
2009+
2010+
# Generate JSON files
2011+
files = get_generated_json_files(gold_dir)
2012+
for f in files:
2013+
# For each JSON file, convert it to Arrow IPC file and stream
2014+
json_path = os.path.join(gold_dir, 'generated_' +
2015+
f.name + '.json')
2016+
arrow_file_path = os.path.join(gold_dir, 'generated_' +
2017+
f.name + '.arrow_file')
2018+
stream_path = os.path.join(gold_dir, 'generated_' +
2019+
f.name + '.stream')
2020+
tester.json_to_file(json_path, arrow_file_path)
2021+
tester.file_to_stream(arrow_file_path, stream_path)
2022+
# And GZip-compress the JSON file
2023+
with open(json_path, 'rb') as f_in:
2024+
with gzip.open(json_path + '.gz', 'wb') as f_out:
2025+
shutil.copyfileobj(f_in, f_out)
2026+
os.unlink(json_path)

dev/archery/archery/integration/runner.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -589,14 +589,12 @@ def get_static_json_files():
589589
]
590590

591591

592-
def run_all_tests(with_cpp=True, with_java=True, with_js=True,
593-
with_dotnet=True, with_go=True, with_rust=False,
594-
with_nanoarrow=False, run_ipc=False, run_flight=False,
595-
run_c_data=False, tempdir=None, target_implementations="",
596-
**kwargs):
597-
tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-')
598-
target_implementations = \
599-
target_implementations.split(",") if target_implementations else []
592+
def select_testers(with_cpp=True, with_java=True, with_js=True,
593+
with_dotnet=True, with_go=True, with_rust=False,
594+
with_nanoarrow=False, target_implementations="",
595+
**kwargs):
596+
target_implementations = (target_implementations.split(",")
597+
if target_implementations else [])
600598

601599
testers: List[Tester] = []
602600
other_testers: List[Tester] = []
@@ -635,6 +633,14 @@ def append_tester(implementation, tester):
635633
from .tester_rust import RustTester
636634
append_tester("rust", RustTester(**kwargs))
637635

636+
return testers, other_testers
637+
638+
639+
def run_all_tests(testers: List[Tester], other_testers: List[Tester],
640+
run_ipc=False, run_flight=False, run_c_data=False,
641+
tempdir=None, **kwargs):
642+
tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-')
643+
638644
static_json_files = get_static_json_files()
639645
generated_json_files = datagen.get_generated_json_files(tempdir=tempdir)
640646
json_files = static_json_files + generated_json_files

docs/source/format/Integration.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,3 +621,20 @@ utility. Below are the test cases which are covered by them:
621621
- ZSTD
622622

623623
* Batches with Shared Dictionaries
624+
625+
Generating new Gold Files
626+
'''''''''''''''''''''''''
627+
628+
From time to time, it is desirable to add new gold files, for example when the
629+
Columnar format or the IPC specification is updated. Archery provides a dedicated
630+
option to do that.
631+
632+
It is recommended to generate gold files using a well-known version of an Arrow
633+
implementation. For example, if a build of Arrow C++ exists in ``./build/release/``,
634+
one can generate new gold files in the ``/tmp/gold-files`` directory using the
635+
following command:
636+
637+
.. code-block:: shell
638+
639+
export ARROW_CPP_EXE_PATH=./build/release/
640+
archery integration --with-cpp 1 --write-gold-files=/tmp/gold-files

0 commit comments

Comments
 (0)