Skip to content

Commit 7362714

Browse files
Merge branch 'main' into qa
2 parents 63eee82 + 0872b09 commit 7362714

File tree

5 files changed

+69
-181
lines changed

5 files changed

+69
-181
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 2024-11-22 -- v2.0.1
2+
### Added
3+
- Use nypl-py-utils patron_data_helper methods to get Sierra and Redshift patron info
4+
15
## 2024-11-05 -- v2.0.0
26
### Added
37
- Rewrite Sierra barcode --> patron_id query to use the more efficient phrase_entry table

helpers/query_helper.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,8 @@
99
ORDER BY pcrDateTime, pcrKey
1010
LIMIT {limit};"""
1111

12-
_SIERRA_QUERY = """
13-
SELECT
14-
barcode, id, ptype_code, pcode3,
15-
CASE WHEN LENGTH(TRIM(home_library_code)) = 0
16-
OR TRIM(home_library_code) = 'none' THEN NULL
17-
ELSE TRIM(home_library_code) END
18-
FROM sierra_view.patron_view
19-
WHERE id IN (
20-
SELECT record_id
21-
FROM sierra_view.phrase_entry
22-
WHERE index_tag || index_entry IN ({})
23-
);"""
24-
25-
_REDSHIFT_QUERY = """
26-
SELECT patron_id, postal_code, geoid
27-
FROM {table}
28-
WHERE patron_id IN ({ids});"""
29-
3012

3113
def build_envisionware_query(date_time, key):
3214
return _ENVISIONWARE_QUERY.format(
3315
date_time=date_time, key=key, limit=os.environ["ENVISIONWARE_BATCH_SIZE"]
3416
)
35-
36-
37-
def build_sierra_query(barcodes):
38-
return _SIERRA_QUERY.format(barcodes)
39-
40-
41-
def build_redshift_query(table, ids):
42-
return _REDSHIFT_QUERY.format(table=table, ids=ids)

main.py

Lines changed: 17 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,7 @@
33
import pandas as pd
44

55
from concurrent.futures import ThreadPoolExecutor
6-
from helpers.query_helper import (
7-
build_envisionware_query,
8-
build_redshift_query,
9-
build_sierra_query,
10-
)
6+
from helpers.query_helper import build_envisionware_query
117
from nypl_py_utils.classes.avro_client import AvroEncoder
128
from nypl_py_utils.classes.kinesis_client import KinesisClient
139
from nypl_py_utils.classes.mysql_client import MySQLClient
@@ -17,7 +13,10 @@
1713
from nypl_py_utils.functions.config_helper import load_env_file
1814
from nypl_py_utils.functions.log_helper import create_log
1915
from nypl_py_utils.functions.obfuscation_helper import obfuscate
20-
16+
from nypl_py_utils.functions.patron_data_helper import (
17+
get_redshift_patron_data,
18+
get_sierra_patron_data_from_barcodes,
19+
)
2120

2221
_DTYPE_MAP = {
2322
"patron_id": "string",
@@ -109,60 +108,27 @@ def main():
109108
with ThreadPoolExecutor() as executor:
110109
pc_reserve_df["key"] = list(executor.map(obfuscate, pc_reserve_df["key"]))
111110

112-
# Query Sierra for patron info using the patron barcodes
113-
barcodes_str = "'b" + "','b".join(pc_reserve_df["barcode"].unique()) + "'"
114-
sierra_client.connect()
115-
sierra_raw_data = sierra_client.execute_query(build_sierra_query(barcodes_str))
116-
sierra_client.close_connection()
117-
sierra_df = pd.DataFrame(
118-
data=sierra_raw_data,
119-
columns=[
120-
"barcode",
121-
"patron_id",
122-
"ptype_code",
123-
"pcode3",
124-
"patron_home_library_code",
125-
],
111+
# Get patron info from Sierra, set the patron retrieval status, and obfuscate
112+
# the patron_id. The patron_id is either the Sierra id or, if no Sierra id is
113+
# found for the barcode, the barcode prepended with 'barcode '.
114+
sierra_df = get_sierra_patron_data_from_barcodes(
115+
sierra_client, pc_reserve_df["barcode"]
126116
)
127-
128-
# Some barcodes correspond to multiple patron records. For these barcodes, do
129-
# not use patron information from any of the records.
130-
sierra_df = sierra_df[pd.notnull(sierra_df["barcode"])]
131-
sierra_df = sierra_df.drop_duplicates("barcode", keep=False)
132-
sierra_df["patron_id"] = sierra_df["patron_id"].astype("Int64").astype("string")
133-
134-
# Merge the dataframes, set the patron retrieval status, and obfuscate the
135-
# patron_id. The patron_id is either the Sierra id or, if no Sierra id is found
136-
# for the barcode, the barcode prepended with 'barcode '.
137117
pc_reserve_df = pc_reserve_df.merge(sierra_df, how="left", on="barcode")
138118
pc_reserve_df = pc_reserve_df.apply(_set_patron_retrieval_status, axis=1)
139119
with ThreadPoolExecutor() as executor:
140120
pc_reserve_df["patron_id"] = list(
141121
executor.map(obfuscate, pc_reserve_df["patron_id"])
142122
)
143123

144-
# Query Redshift for the zip code and geoid using the obfuscated Sierra ids
145-
sierra_ids = pc_reserve_df.loc[
146-
pc_reserve_df["patron_retrieval_status"] == "found", "patron_id"
147-
]
148-
if not sierra_ids.empty:
149-
ids_str = "'" + "','".join(sierra_ids.unique()) + "'"
150-
redshift_table = "patron_info"
151-
if os.environ["REDSHIFT_DB_NAME"] != "production":
152-
redshift_table += "_" + os.environ["REDSHIFT_DB_NAME"]
153-
redshift_client.connect()
154-
redshift_raw_data = redshift_client.execute_query(
155-
build_redshift_query(redshift_table, ids_str)
156-
)
157-
redshift_client.close_connection()
158-
else:
159-
logger.info("No Sierra ids found to query Redshift with")
160-
redshift_raw_data = []
161-
redshift_df = pd.DataFrame(
162-
data=redshift_raw_data, columns=["patron_id", "postal_code", "geoid"]
124+
# Get additional patron info from Redshift, merge the dataframes, and convert
125+
# field dtypes
126+
redshift_df = get_redshift_patron_data(
127+
redshift_client,
128+
pc_reserve_df.loc[
129+
pc_reserve_df["patron_retrieval_status"] == "found", "patron_id"
130+
],
163131
)
164-
165-
# Merge the dataframes and convert necessary fields to integers
166132
pc_reserve_df = pc_reserve_df.merge(redshift_df, how="left", on="patron_id")
167133
pc_reserve_df = pc_reserve_df.astype(_DTYPE_MAP)
168134

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper]==1.4.0
1+
nypl-py-utils[avro-client,kinesis-client,mysql-client,postgresql-client,redshift-client,s3-client,config-helper,obfuscation-helper,patron-data-helper]==1.6.0
22
pandas

tests/test_main.py

Lines changed: 47 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
import logging
21
import main
32
import os
3+
import pandas as pd
44
import pytest
55

66
from datetime import datetime
7-
from nypl_py_utils.classes.postgresql_client import PostgreSQLClientError
7+
from pandas.testing import assert_series_equal
88

99
_ENVISIONWARE_DATA = [
1010
(10000000, "barcode1", 100, datetime(2023, 1, 1, 1, 0, 0), "branch1", "area1",
@@ -17,15 +17,16 @@
1717
"staff_override4"),
1818
]
1919

20-
_SIERRA_DATA = [
21-
("barcode1", 111111111111, 1, 10, "lib1"),
22-
("barcode3", 333333333333, 3, 30, "lib3"),
23-
("barcode3", 300000000000, 30, 300, "lib30"),
24-
("barcode4", 444444444444, 4, 40, "lib4"),
25-
(None, 444444444444, None, None, None),
26-
]
20+
_SIERRA_DF = pd.DataFrame([
21+
("111111111111", "barcode1", 1, 10, "lib1"),
22+
("444444444444", "barcode4", 4, 40, "lib4")],
23+
columns=["patron_id", "barcode", "ptype_code", "pcode3",
24+
"patron_home_library_code"]).astype({"patron_id": "string"})
2725

28-
_REDSHIFT_DATA = (["obf_id1", "zip1", "geoid1"], ["obf_id4", "zip4", None])
26+
_REDSHIFT_DF = pd.DataFrame(
27+
[["obf_id1", "zip1", "geoid1"],],
28+
columns=["patron_id", "postal_code", "geoid"]
29+
)
2930

3031
_RESULTS = [
3132
{"patron_id": "obf_id1", "ptype_code": 1, "patron_home_library_code": "lib1",
@@ -44,7 +45,7 @@
4445
"area": "area3", "staff_override": "staff_override3",
4546
"patron_retrieval_status": "missing"},
4647
{"patron_id": "obf_id4", "ptype_code": 4, "patron_home_library_code": "lib4",
47-
"pcode3": 40, "postal_code": "zip4", "geoid": None, "key": "obf_key4",
48+
"pcode3": 40, "postal_code": None, "geoid": None, "key": "obf_key4",
4849
"minutes_used": 400, "transaction_et": "2023-04-04", "branch": "branch4",
4950
"area": "area4", "staff_override": "staff_override4",
5051
"patron_retrieval_status": "found"},
@@ -69,6 +70,10 @@ def test_main_one_iteration(self, mock_helpers, mocker):
6970
mock_obfuscate = mocker.patch("main.obfuscate", side_effect=[
7071
"obf_key1", "obf_key2", "obf_key3", "obf_key4",
7172
"obf_id1", "obf_id2", "obf_id3", "obf_id4"])
73+
mock_get_sierra_patron_data_from_barcodes = mocker.patch(
74+
"main.get_sierra_patron_data_from_barcodes", return_value=_SIERRA_DF)
75+
mock_get_redshift_patron_data = mocker.patch(
76+
"main.get_redshift_patron_data", return_value=_REDSHIFT_DF)
7277

7378
mock_kinesis_client = self._set_up_mock("main.KinesisClient", mocker)
7479

@@ -86,18 +91,8 @@ def test_main_one_iteration(self, mock_helpers, mocker):
8691
)
8792
mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
8893
mock_envisionware_client.execute_query.return_value = _ENVISIONWARE_DATA
89-
90-
mock_sierra_query = mocker.patch(
91-
"main.build_sierra_query", return_value="SIERRA QUERY"
92-
)
9394
mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
94-
mock_sierra_client.execute_query.return_value = _SIERRA_DATA
95-
96-
mock_redshift_query = mocker.patch(
97-
"main.build_redshift_query", return_value="REDSHIFT QUERY"
98-
)
9995
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
100-
mock_redshift_client.execute_query.return_value = _REDSHIFT_DATA
10196

10297
main.main()
10398

@@ -110,18 +105,20 @@ def test_main_one_iteration(self, mock_helpers, mocker):
110105
)
111106
mock_envisionware_client.close_connection.assert_called_once()
112107

113-
mock_sierra_client.connect.assert_called_once()
114-
mock_sierra_query.assert_called_once_with(
115-
"'bbarcode1','b25555000000000','bbarcode3','bbarcode4'"
116-
)
117-
mock_sierra_client.close_connection.assert_called_once()
118-
119-
mock_redshift_client.connect.assert_called_once()
120-
mock_redshift_query.assert_called_once_with(
121-
"patron_info_test_redshift_name", "'obf_id1','obf_id4'"
122-
)
123-
mock_redshift_client.execute_query.assert_called_once_with("REDSHIFT QUERY")
124-
mock_redshift_client.close_connection.assert_called_once()
108+
mock_get_sierra_patron_data_from_barcodes.assert_called_once()
109+
sierra_args = mock_get_sierra_patron_data_from_barcodes.call_args[0]
110+
assert sierra_args[0] == mock_sierra_client
111+
assert_series_equal(
112+
sierra_args[1],
113+
pd.Series(["barcode1", "25555000000000", "barcode3", "barcode4"],
114+
name="barcode", dtype="string"))
115+
116+
mock_get_redshift_patron_data.assert_called_once()
117+
redshift_args = mock_get_redshift_patron_data.call_args[0]
118+
assert redshift_args[0] == mock_redshift_client
119+
assert_series_equal(
120+
redshift_args[1],
121+
pd.Series(["obf_id1", "obf_id4"], name="patron_id", index=[0, 3]))
125122

126123
mock_obfuscate.assert_has_calls(
127124
[
@@ -152,8 +149,6 @@ def test_main_multiple_iterations(self, mock_helpers, mocker):
152149
"staff_override{}".format(i))
153150
for i in range(1, 7)]
154151
mocker.patch("main.obfuscate")
155-
mocker.patch("main.build_sierra_query")
156-
mocker.patch("main.build_redshift_query")
157152
mocker.patch("main.AvroEncoder")
158153
mocker.patch("main.KinesisClient")
159154
mocker.patch("main.PostgreSQLClient")
@@ -194,14 +189,15 @@ def test_main_no_envisionware_results(self, mock_helpers, mocker):
194189
del os.environ["MAX_BATCHES"]
195190
mocker.patch("main.obfuscate")
196191
mocker.patch("main.build_envisionware_query")
197-
mocker.patch("main.build_sierra_query")
198-
mocker.patch("main.build_redshift_query")
192+
mocker.patch("main.PostgreSQLClient")
193+
mocker.patch("main.RedshiftClient")
194+
mock_get_sierra_patron_data_from_barcodes = mocker.patch(
195+
"main.get_sierra_patron_data_from_barcodes")
196+
mock_get_redshift_patron_data = mocker.patch("main.get_redshift_patron_data")
199197

200198
mock_s3_client = self._set_up_mock("main.S3Client", mocker)
201199
mock_kinesis_client = self._set_up_mock("main.KinesisClient", mocker)
202200
mock_avro_encoder = self._set_up_mock("main.AvroEncoder", mocker)
203-
mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
204-
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
205201

206202
mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
207203
mock_envisionware_client.execute_query.return_value = []
@@ -213,13 +209,13 @@ def test_main_no_envisionware_results(self, mock_helpers, mocker):
213209
mock_s3_client.close.assert_called_once()
214210
mock_kinesis_client.close.assert_called_once()
215211

216-
mock_sierra_client.connect.assert_not_called()
217-
mock_redshift_client.connect.assert_not_called()
212+
mock_get_sierra_patron_data_from_barcodes.assert_not_called()
213+
mock_get_redshift_patron_data.assert_not_called()
218214
mock_avro_encoder.encode_batch.assert_not_called()
219215
mock_kinesis_client.send_records.assert_not_called()
220216
mock_s3_client.set_cache.assert_not_called()
221217

222-
def test_main_no_sierra_results(self, mock_helpers, mocker):
218+
def test_main_no_sierra_redshift_results(self, mock_helpers, mocker):
223219
_TEST_ENVISIONWARE_DATA = [
224220
(i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
225221
"branch{}".format(i), "area{}".format(i),
@@ -236,24 +232,26 @@ def test_main_no_sierra_results(self, mock_helpers, mocker):
236232
for i in range(1, 5)]
237233

238234
mocker.patch("main.build_envisionware_query")
239-
mocker.patch("main.build_sierra_query")
240-
mocker.patch("main.build_redshift_query")
241235
mocker.patch("main.KinesisClient")
242236
mocker.patch("main.S3Client")
243237

244238
mock_obfuscate = mocker.patch("main.obfuscate", return_value="obfuscated")
239+
mocker.patch(
240+
"main.get_sierra_patron_data_from_barcodes",
241+
return_value=pd.DataFrame(
242+
[], columns=["patron_id", "barcode", "ptype_code", "pcode3",
243+
"patron_home_library_code"]))
244+
mocker.patch(
245+
"main.get_redshift_patron_data",
246+
return_value=pd.DataFrame(
247+
[], columns=["patron_id", "postal_code", "geoid"]))
245248
mock_avro_encoder = self._set_up_mock("main.AvroEncoder", mocker)
246-
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
247249

248250
mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
249251
mock_envisionware_client.execute_query.return_value = _TEST_ENVISIONWARE_DATA
250252

251-
mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
252-
mock_sierra_client.execute_query.return_value = []
253-
254253
main.main()
255254

256-
mock_redshift_client.connect.assert_not_called()
257255
mock_avro_encoder.encode_batch.assert_called_once_with(_RESULTS)
258256
mock_obfuscate.assert_has_calls(
259257
[
@@ -267,57 +265,3 @@ def test_main_no_sierra_results(self, mock_helpers, mocker):
267265
mocker.call("barcode barcode4"),
268266
]
269267
)
270-
271-
def test_main_no_redshift_results(self, mock_helpers, mocker):
272-
_TEST_ENVISIONWARE_DATA = [
273-
(i, "barcode{}".format(i), i, datetime(2023, i, i, i, 0, 0),
274-
"branch{}".format(i), "area{}".format(i),
275-
"staff_override{}".format(i))
276-
for i in range(1, 5)]
277-
_TEST_SIERRA_DATA = [
278-
("barcode{}".format(i), i+1, i, i, "lib{}".format(i))
279-
for i in range(1, 5)]
280-
_RESULTS = [
281-
{"patron_id": "obfuscated", "ptype_code": i,
282-
"patron_home_library_code": "lib{}".format(i), "pcode3": i,
283-
"postal_code": None, "geoid": None, "key": "obfuscated",
284-
"minutes_used": i, "transaction_et": "2023-0{}-0{}".format(i, i),
285-
"branch": "branch{}".format(i), "area": "area{}".format(i),
286-
"staff_override": "staff_override{}".format(i),
287-
"patron_retrieval_status": "found"}
288-
for i in range(1, 5)]
289-
290-
mocker.patch("main.build_envisionware_query")
291-
mocker.patch("main.build_sierra_query")
292-
mocker.patch("main.build_redshift_query")
293-
mocker.patch("main.KinesisClient")
294-
mocker.patch("main.S3Client")
295-
296-
mock_obfuscate = mocker.patch("main.obfuscate", return_value="obfuscated")
297-
mock_avro_encoder = self._set_up_mock("main.AvroEncoder", mocker)
298-
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
299-
300-
mock_envisionware_client = self._set_up_mock("main.MySQLClient", mocker)
301-
mock_envisionware_client.execute_query.return_value = _TEST_ENVISIONWARE_DATA
302-
303-
mock_sierra_client = self._set_up_mock("main.PostgreSQLClient", mocker)
304-
mock_sierra_client.execute_query.return_value = _TEST_SIERRA_DATA
305-
306-
mock_redshift_client = self._set_up_mock("main.RedshiftClient", mocker)
307-
mock_redshift_client.execute_query.return_value = []
308-
309-
main.main()
310-
311-
mock_avro_encoder.encode_batch.assert_called_once_with(_RESULTS)
312-
mock_obfuscate.assert_has_calls(
313-
[
314-
mocker.call("1"),
315-
mocker.call("2"),
316-
mocker.call("3"),
317-
mocker.call("4"),
318-
mocker.call("2"),
319-
mocker.call("3"),
320-
mocker.call("4"),
321-
mocker.call("5"),
322-
]
323-
)

0 commit comments

Comments
 (0)