Skip to content

Commit 379bb31

Browse files
authored
Merge pull request #175 from dathere/druf-phase1-wip
twdh-0.6-release
2 parents 44af9ff + 77011ee commit 379bb31

File tree

1 file changed

+34
-2
lines changed

1 file changed

+34
-2
lines changed

ckanext/datapusher_plus/jobs.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
POSTGRES_BIGINT_MAX = 9223372036854775807
7676
POSTGRES_BIGINT_MIN = -9223372036854775808
7777

78-
MINIMUM_QSV_VERSION = "0.133.0"
78+
MINIMUM_QSV_VERSION = "2.1.0"
7979
MAX_CONTENT_LENGTH = tk.config.get("ckanext.datapusher_plus.max_content_length")
8080

8181
DATASTORE_URLS = {
@@ -915,6 +915,38 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None):
915915
raise utils.JobError(
916916
"Cannot infer data types and compile statistics: {}".format(e)
917917
)
918+
919+
# remove the last four rows. Do this using the qsv slice command
920+
# the last four rows are qsv__rowcount, qsv__columncount, qsv__filesize_bytes, qsv__fingerprint_hash
921+
# they'll be used in later phases of DRUF; until then, remove them
922+
qsv_slice_csv = os.path.join(temp_dir, "qsv_slice.csv")
923+
try:
924+
subprocess.run(
925+
[
926+
qsv_bin,
927+
"slice",
928+
"--start",
929+
"-4",
930+
"--invert",
931+
qsv_stats_csv,
932+
"--output",
933+
qsv_slice_csv,
934+
],
935+
check=True,
936+
)
937+
except subprocess.CalledProcessError as e:
938+
raise utils.JobError("Cannot slice CSV: {}".format(e))
939+
940+
# read the sliced CSV and remove the qsv__value column (the last column).
941+
# Do this using the qsv select command
942+
try:
943+
subprocess.run(
944+
[qsv_bin, "select", "!_", qsv_slice_csv, "--output", qsv_stats_csv],
945+
check=True,
946+
)
947+
except subprocess.CalledProcessError as e:
948+
raise utils.JobError("Cannot select CSV: {}".format(e))
949+
918950
with open(qsv_stats_csv, mode="r") as inp:
919951
reader = csv.DictReader(inp)
920952
for row in reader:
@@ -1402,7 +1434,7 @@ def _push_to_datastore(task_id, input, dry_run=False, temp_dir=None):
14021434
except psycopg2.Error as e:
14031435
logger.warning("Could not TRUNCATE: {}".format(e))
14041436

1405-
col_names_list = [h["id"] for h in headers_dicts if not h["id"].startswith("qsv_")]
1437+
col_names_list = [h["id"] for h in headers_dicts]
14061438
column_names = sql.SQL(",").join(sql.Identifier(c) for c in col_names_list)
14071439
copy_sql = sql.SQL(
14081440
"COPY {} ({}) FROM STDIN "

0 commit comments

Comments
 (0)