Skip to content

Commit 01ca4a0

Browse files
author
James Robinson
authored
Use new bulk upload POST route to send introspection results up to gate. (#74)
1 parent c511f6a commit 01ca4a0

File tree

2 files changed

+165
-92
lines changed

2 files changed

+165
-92
lines changed

noteable_magics/sql/meta_commands.py

Lines changed: 90 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -519,9 +519,6 @@ class IntrospectAndStoreDatabaseCommand(MetaCommand):
519519
# new datasource type supported.
520520
AVOID_SCHEMAS = set(('information_schema', 'pg_catalog', 'crdb_internal'))
521521

522-
# As expected in real kernels in Notable. Overridden in test suite.
523-
JWT_PATHNAME = "/vault/secrets/jwt"
524-
525522
def run(self, invoked_as: str, args: List[str]) -> None:
526523
"""Drive introspecting whole database, POSTing results back to Gate for storage.
527524
@@ -542,10 +539,6 @@ def run(self, invoked_as: str, args: List[str]) -> None:
542539

543540
inspector = self.get_inspector()
544541

545-
# On successful completion, will tell Gate to delete any now-orphaned
546-
# relations from prior introspections older than this timestamp.
547-
introspection_started_at = datetime.utcnow()
548-
549542
# This and delta() just for development timing figures. Could become yet another
550543
# timer context manager implementation.
551544
start = time.monotonic()
@@ -565,14 +558,11 @@ def delta() -> float:
565558
relations_and_kinds = self.all_table_and_views(inspector)
566559
print(f'Discovered {len(relations_and_kinds)} relations in {delta()}')
567560

568-
auth_header = self.get_auth_header()
569-
570-
# JSON-able schema descriptions to tell Gate about.
571-
message_queue: List[RelationStructureDescription] = []
572-
573561
# Introspect each relation concurrently.
574562
# TODO: Take minimum concurrency as a param?
575-
with ThreadPoolExecutor(max_workers=self.MAX_INTROSPECTION_THREADS) as executor:
563+
with ThreadPoolExecutor(
564+
max_workers=self.MAX_INTROSPECTION_THREADS
565+
) as executor, RelationStructureMessager(ds_id) as messenger:
576566

577567
future_to_relation = {
578568
executor.submit(
@@ -584,40 +574,15 @@ def delta() -> float:
584574
for future in as_completed(future_to_relation):
585575
schema_name, relation_name, kind = future_to_relation[future]
586576
try:
587-
message_queue.append(future.result())
577+
messenger.queue_for_delivery(future.result())
588578
except Exception as exc:
589579
print(f'Exception for {schema_name}.{relation_name}: {exc}')
590580

591581
table_introspection_delta = delta()
592582
print(
593-
f'Done introspecting in {table_introspection_delta}, amortized {table_introspection_delta / len(relations_and_kinds)}s per relation'
583+
f'Done introspecting and messaging gate in {table_introspection_delta}, amortized {table_introspection_delta / len(relations_and_kinds)}s per relation'
594584
)
595585

596-
# Now report each result back to gate, currently one at a time. Done completely
597-
# after just for relative timing measurements.
598-
599-
# TODO: Could do the messaging back to gate either one-at-a-time in the worker threads,
600-
# or in bulkier chunks either in the main thread when Gate offers a bulk API,
601-
# or could use httpx and an event loop, ...
602-
603-
# But right now doing the messaging back to gate back here in the main thread just one
604-
# request at a time.
605-
606-
session = requests.Session()
607-
session.headers.update(auth_header)
608-
609-
if message_queue:
610-
for message in message_queue:
611-
self.inform_gate_relation(session, ds_id, message)
612-
613-
# Clear out any prior known relations which may not exist anymore in this datasource.
614-
#
615-
# We do this at the tail end of things, and not the beginning, so as to not eagerly delete
616-
# prior known data if we happen to croak due to some unforseen exception while introspecting.
617-
self.inform_gate_completed(session, ds_id, introspection_started_at)
618-
619-
print(f'Done storing discovered table and view structures in {delta()}')
620-
621586
# run() contract: return what to bind to the SQL cell variable name, and if display() needs
622587
# to be called on it. Nothing and nope!
623588
return (None, False)
@@ -873,11 +838,93 @@ def get_datasource_id(self) -> UUID:
873838
handle = self.conn.name
874839
return UUID(handle[1:])
875840

876-
def get_auth_header(self):
877-
"""Set up the auth header dict for making requests directly to Gate"""
841+
842+
class RelationStructureMessager:
843+
"""Context manager that collects the single-relation descriptions discovered
844+
within IntrospectAndStoreDatabaseCommand, buffers up to CAPACITY at a time, then
845+
POSTs the current collected group up to Gate in a single bulk POST (which accepts
846+
at most 10).
847+
848+
Upon context exit, be sure to POST any partial remainder, then tell gate
849+
we're all done introspecting, allowing it to delete any old now no longer
850+
existing structures from prior introspections.
851+
852+
Helper class simplifying IntrospectAndStoreDatabaseCommand.run().
853+
"""
854+
855+
# Overridden in test suite.
856+
CAPACITY = 10
857+
# As expected in real kernels in Notable. Overridden in test suite.
858+
JWT_PATHNAME = "/vault/secrets/jwt"
859+
860+
_relations: List[RelationStructureDescription]
861+
_session: requests.Session
862+
_datasource_id: UUID
863+
_partition_counter: int
864+
_started_at: datetime
865+
866+
def __init__(self, datasource_id: UUID):
867+
self._datasource_id = datasource_id
868+
869+
# Set up HTTP session to message Gate about what is discovered.
870+
self._session = requests.Session()
878871
jwt = pathlib.Path(self.JWT_PATHNAME).read_text()
872+
self._session.headers.update({"Authorization": f"Bearer {jwt}"})
873+
874+
def __enter__(self):
875+
self._relations = []
876+
self._started_at = datetime.utcnow()
877+
self._partition_counter = 1
878+
879+
return self
880+
881+
def queue_for_delivery(self, relation: RelationStructureDescription):
882+
"""Stash this discovered relation. If we have enough already
883+
to send up to Gate, then do so.
884+
"""
885+
self._relations.append(relation)
886+
887+
# Time for an intermediate flush?
888+
if len(self._relations) == self.CAPACITY:
889+
self._message_gate()
890+
891+
def __exit__(self, exc_type, exc_value, exc_traceback):
892+
self._message_gate(completed_introspection=True)
893+
894+
def _message_gate(self, completed_introspection: bool = False):
895+
896+
base_url = f"http://gate.default/api/v1/datasources/{self._datasource_id}/schema/relations"
897+
898+
if self._relations:
899+
# Assemble buffered relation descriptions into a single bulk upload payload
900+
jsonable_message = [
901+
relation_description.dict() for relation_description in self._relations
902+
]
903+
904+
# Upload in a single message.
905+
resp = self._session.post(
906+
base_url,
907+
json=jsonable_message,
908+
)
909+
910+
if resp.status_code == 204:
911+
for relation_description in self._relations:
912+
print(
913+
f'Stored structure of {relation_description.schema_name}.{relation_description.relation_name} in partition {self._partition_counter}'
914+
)
915+
else:
916+
print(
917+
f'Failed storing partition {self._partition_counter}: {resp.status_code}, {resp.text}'
918+
)
919+
920+
# Prepare for next partition.
921+
self._partition_counter += 1
922+
self._relations = []
879923

880-
return {"Authorization": f"Bearer {jwt}"}
924+
if completed_introspection:
925+
# Message indicating all done through asking to clear out any stored relation structures
926+
# older than when we started.
927+
self._session.delete(f"{base_url}?older_than={self._started_at.isoformat()}")
881928

882929

883930
def constraints_dataframe(

tests/test_sql_magic_meta_commands.py

Lines changed: 75 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from noteable_magics.sql.connection import Connection
1313
from noteable_magics.sql.gate_messaging_types import RelationStructureDescription
1414
from noteable_magics.sql.meta_commands import (
15-
IntrospectAndStoreDatabaseCommand,
15+
RelationStructureMessager,
1616
SchemaStrippingInspector,
1717
_all_command_classes,
1818
convert_relation_glob_to_regex,
@@ -586,24 +586,31 @@ def test_nonexistent_schema(self, sql_magic, capsys):
586586
@pytest.mark.usefixtures("populated_cockroach_database")
587587
class TestFullIntrospection:
588588
@pytest.fixture()
589-
def patched_jwt(self, tmp_path):
589+
def patched_relation_structure_messager(self, tmp_path):
590590

591-
original_jwt_pathname = IntrospectAndStoreDatabaseCommand.JWT_PATHNAME
591+
original_jwt_pathname = RelationStructureMessager.JWT_PATHNAME
592592
tmp_jwt_path = tmp_path / 'kernel_jwt'
593593
tmp_jwt_path.write_text('kerneljwtcontents')
594594

595-
IntrospectAndStoreDatabaseCommand.JWT_PATHNAME = str(tmp_jwt_path)
595+
RelationStructureMessager.JWT_PATHNAME = str(tmp_jwt_path)
596+
597+
# Also make it only buffer at most 2 relations so we'll get some intermediate flushes
598+
# when introspecting more than 2 relations.
599+
600+
original_capacity = RelationStructureMessager.CAPACITY
601+
602+
RelationStructureMessager.CAPACITY = 2
596603

597604
yield
598605

599-
IntrospectAndStoreDatabaseCommand.JWT_PATHNAME = original_jwt_pathname
606+
RelationStructureMessager.JWT_PATHNAME = original_jwt_pathname
607+
RelationStructureMessager.CAPACITY = original_capacity
600608

601609
@pytest.fixture()
602-
def patched_requests_mock(self, mocker, patched_jwt, requests_mock):
603-
# IntrospectAndStoreDatabaseCommand.inform_gate_relation() gonna try to do a POST
604-
# to this URL.
610+
def patched_requests_mock(self, mocker, patched_relation_structure_messager, requests_mock):
611+
# RelationStructureMessager will do POSTs to this URL.
605612
requests_mock.post(
606-
f"http://gate.default/api/v1/datasources/{COCKROACH_UUID}/schema/relation",
613+
f"http://gate.default/api/v1/datasources/{COCKROACH_UUID}/schema/relations",
607614
status_code=204,
608615
)
609616

@@ -617,27 +624,32 @@ def patched_requests_mock(self, mocker, patched_jwt, requests_mock):
617624

618625
def test_full_introspection(self, sql_magic, capsys, patched_requests_mock):
619626

627+
# Create one more table, for a total of 5, so that exiting the scope of
628+
# the RelationStructureMessager will have a dreg to POST.
629+
# (see RelationStructureMessager.__exit__())
630+
631+
sql_magic.execute(
632+
rf'{COCKROACH_HANDLE} create table dregs_table (id int primary key not null, name text)'
633+
)
634+
635+
# Now introspect the whole DB.
620636
introspection_start = datetime.utcnow()
621637
sql_magic.execute(rf'{COCKROACH_HANDLE} \introspect')
622638
introspection_end = datetime.utcnow()
623639

624640
out, err = capsys.readouterr()
625641

626642
assert 'Exception' not in out
627-
assert out.startswith('Discovered 4 relations')
643+
644+
# Expect the core tables plus our dregs_table.
645+
# (The 'Done.\n' comes from the 'create table' execution creating dregs_table.)
646+
assert out.startswith(f'Done.\nDiscovered {len(KNOWN_TABLES) + 1} relations')
628647

629648
for name, kind in KNOWN_TABLES_AND_KINDS:
630649
assert f'Introspected {kind} public.{name}' in out
631650
assert f'Stored structure of public.{name}' in out
632651

633-
assert 'Done storing discovered table and view structures' in out
634-
635-
# One POST per relation discovered, and then a single DELETE (as currently implemented)
636-
assert len(patched_requests_mock.request_history) == len(KNOWN_TABLES_AND_KINDS) + 1
637-
# Remember, booleans are also integers because filthy historical C.
638-
assert sum(req.method == 'POST' for req in patched_requests_mock.request_history) == len(
639-
KNOWN_TABLES_AND_KINDS
640-
)
652+
assert 'Done introspecting and messaging gate' in out
641653

642654
# The delete should be the last request, and should specify a since_when timestamp param
643655
# which should be between when introspection started and ended (in UTC).
@@ -659,38 +671,52 @@ def test_full_introspection(self, sql_magic, capsys, patched_requests_mock):
659671
had_check_constraints = False
660672
had_foreign_keys = False
661673

674+
post_call_count = 0
662675
for req in patched_requests_mock.request_history:
663-
if req.method == 'POST' and req.url.endswith('/schema/relation'):
664-
665-
# POST body should correspond to RelationStructureDescription describing a single relation.
666-
from_json = RelationStructureDescription(**req.json())
667-
described_relation_names.add(from_json.relation_name)
668-
669-
if from_json.indexes:
670-
had_indexes = True
671-
672-
if from_json.primary_key_name and from_json.primary_key_columns:
673-
had_primary_key_columns = True
674-
else:
675-
#
676-
assert (
677-
from_json.primary_key_name is None and from_json.primary_key_columns == []
678-
)
679-
680-
if from_json.columns:
681-
had_columns = True
682-
683-
if from_json.unique_constraints:
684-
had_unique_constraints = True
685-
686-
if from_json.check_constraints:
687-
had_check_constraints = True
688-
689-
if from_json.foreign_keys:
690-
had_foreign_keys = True
691-
692-
assert described_relation_names == set(KNOWN_TABLES)
693-
676+
if req.method == 'POST' and req.url.endswith('/schema/relations'):
677+
678+
post_call_count += 1
679+
# POST body should correspond to a list of RelationStructureDescription each describing a single relation.
680+
dict_list = req.json()
681+
for member_dict in dict_list:
682+
from_json = RelationStructureDescription(**member_dict)
683+
described_relation_names.add(from_json.relation_name)
684+
685+
if from_json.indexes:
686+
had_indexes = True
687+
688+
if from_json.primary_key_name and from_json.primary_key_columns:
689+
had_primary_key_columns = True
690+
else:
691+
#
692+
assert (
693+
from_json.primary_key_name is None
694+
and from_json.primary_key_columns == []
695+
)
696+
697+
if from_json.columns:
698+
had_columns = True
699+
700+
if from_json.unique_constraints:
701+
had_unique_constraints = True
702+
703+
if from_json.check_constraints:
704+
had_check_constraints = True
705+
706+
if from_json.foreign_keys:
707+
had_foreign_keys = True
708+
709+
# Expected 3 POST calls for the 5 tables, since we fixture
710+
# tuned RelationStructureMessager.CAPACITY down to 2. Two calls of two
711+
# relations each, then a trailing dreg call.
712+
assert post_call_count == 3
713+
714+
# Every expected relation should have been described.
715+
expected_relation_names = set(KNOWN_TABLES) # The default table set
716+
expected_relation_names.add('dregs_table') # plus our per-test extra.
717+
assert described_relation_names == expected_relation_names
718+
719+
# Every substructure variation should have been covered across all these tables/view.
694720
assert had_columns
695721
assert had_primary_key_columns
696722
assert had_indexes

0 commit comments

Comments
 (0)