Skip to content

Commit 611c3e8

Browse files
shirshankaclaudetreff7es
authored
feat(cli): add datahub init --sso for browser-based SSO login (#16715)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Tamas Nemeth <treff7es@gmail.com>
1 parent 81e31d6 commit 611c3e8

File tree

6 files changed

+669
-13
lines changed

6 files changed

+669
-13
lines changed

metadata-ingestion/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,6 +1232,7 @@
12321232
]
12331233
)
12341234
),
1235+
"sso": list(framework_common | {"playwright>=1.40.0,<2.0.0"}),
12351236
"cloud": ["acryl-datahub-cloud"],
12361237
"dev": list(dev_requirements),
12371238
"docs": list(

metadata-ingestion/src/datahub/cli/resources/INIT_AGENT_CONTEXT.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,39 @@ export DATAHUB_GMS_TOKEN=<your-token>
4848
datahub init
4949
```
5050

51+
## SSO browser login
52+
53+
For DataHub instances using SSO (OIDC/SAML), use `--sso` to authenticate via browser:
54+
55+
```bash
56+
# Opens browser — complete SSO, CLI captures session and generates token
57+
datahub init --sso --host https://your-instance.example.com/gms
58+
59+
# Custom token duration
60+
datahub init --sso --host https://your-instance.example.com/gms --token-duration ONE_MONTH
61+
```
62+
63+
**Prerequisites** (one-time setup):
64+
65+
```bash
66+
pip install 'acryl-datahub[sso]' # or: uv pip install 'acryl-datahub[sso]'
67+
playwright install chromium
68+
```
69+
70+
`--sso` is mutually exclusive with `--token`, `--username`, and `--password`.
71+
If Playwright is not installed, the command prints step-by-step install instructions and exits.
72+
73+
### Support login (DataHub Cloud)
74+
75+
For the support team debugging customer instances, add `--support` to use the
76+
`/support/authenticate` login path:
77+
78+
```bash
79+
datahub init --sso --support --host https://customer.acryl.io/gms
80+
```
81+
82+
`--support` requires `--sso`.
83+
5184
## Environment variables
5285

5386
| Variable | CLI equivalent |
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
import logging
2+
import urllib.parse
3+
from datetime import datetime
4+
from typing import Tuple
5+
6+
import click
7+
import requests
8+
9+
logger = logging.getLogger(__name__)
10+
11+
CLI_TOKEN_PREFIX = "cli token "
12+
13+
_INSTALL_HELP = """\
14+
The --sso flag requires Playwright and a Chromium browser.
15+
16+
Step 1 — Install the Python package (pick your package manager):
17+
pip install 'acryl-datahub[sso]'
18+
uv pip install 'acryl-datahub[sso]'
19+
pip install 'playwright>=1.40.0'
20+
21+
Step 2 — Download the Chromium browser binary:
22+
playwright install chromium\
23+
"""
24+
25+
26+
def _check_playwright_ready() -> None:
    """Fail fast when the Playwright Python package cannot be imported.

    Only the import is probed here. A missing Chromium browser binary is not
    detected by this function; Playwright itself reports that at launch time
    and tells the user to run `playwright install`.

    Raises:
        click.UsageError: If `playwright.sync_api` cannot be imported. The
            message carries the step-by-step install instructions.
    """
    try:
        # Imported purely as an availability probe; the name is not used here.
        from playwright.sync_api import sync_playwright  # noqa: F401
    except ImportError as import_error:
        message = f"Playwright is not installed.\n\n{_INSTALL_HELP}"
        raise click.UsageError(message) from import_error
39+
40+
41+
def _warn_about_existing_cli_tokens(
    session: requests.Session,
    frontend_url: str,
    actor_urn: str,
) -> None:
    """Best-effort warning about existing CLI tokens for the current user.

    Asks the frontend GraphQL endpoint for up to 100 access tokens owned by
    ``actor_urn`` and counts those whose name starts with ``CLI_TOKEN_PREFIX``.
    If any are found, a one-line warning with a link to the token settings
    page is printed.

    This check is advisory only: any failure (network error, HTTP error,
    unexpected response shape) is logged at debug level and swallowed so that
    it can never block the login flow.

    Args:
        session: Session used to issue the GraphQL request.
        frontend_url: Base URL of the DataHub frontend.
        actor_urn: URN of the user whose tokens are listed.
    """
    list_tokens_query = """query listAccessTokens($input: ListAccessTokenInput!) {
        listAccessTokens(input: $input) {
            total
            tokens { name }
        }
    }"""
    try:
        response = session.post(
            f"{frontend_url}/api/v2/graphql",
            json={
                "query": list_tokens_query,
                "variables": {
                    "input": {
                        "start": 0,
                        "count": 100,
                        "filters": [
                            {
                                "field": "ownerUrn",
                                "values": [actor_urn],
                            }
                        ],
                    }
                },
            },
            # Bounded wait: an advisory check must not hang the login.
            # Same value as the createAccessToken request in this module.
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
        # `or {}` / `or []` also cover explicit JSON nulls, which a plain
        # `.get(key, default)` would pass through as None.
        listing = (data.get("data") or {}).get("listAccessTokens") or {}
        tokens = listing.get("tokens") or []
        cli_token_count = sum(
            1 for t in tokens if (t.get("name") or "").startswith(CLI_TOKEN_PREFIX)
        )
        if cli_token_count > 0:
            click.echo(
                f"⚠ You have {cli_token_count} existing CLI token(s). "
                f"Manage them at {frontend_url}/settings/tokens"
            )
    except Exception:
        # Deliberately broad: this is a courtesy warning, never a hard failure.
        logger.debug("Failed to check existing CLI tokens", exc_info=True)
84+
85+
86+
def browser_sso_login(
    frontend_url: str,
    token_duration: str,
    timeout_ms: int = 120_000,
    support: bool = False,
) -> Tuple[str, str]:
    """Open browser for SSO login, extract session, generate access token.

    Flow:
      1. Launch a visible Chromium window at the frontend's authenticate path.
      2. Wait until an ``actor`` cookie becomes visible to the page.
      3. Copy the browser context's cookies into a ``requests.Session``.
      4. Print a best-effort warning about existing CLI tokens.
      5. Call the ``createAccessToken`` GraphQL mutation with that session.

    Args:
        frontend_url: The DataHub frontend URL (e.g. http://localhost:9002).
        token_duration: Token validity duration (e.g. ONE_HOUR).
        timeout_ms: How long to wait for SSO login to complete, in milliseconds.
        support: If True, use /support/authenticate path for DataHub Cloud
            support team access to customer instances.

    Returns:
        Tuple of (token_name, access_token).

    Raises:
        click.UsageError: If Playwright is not installed.
        click.ClickException: If the login wait times out or otherwise fails,
            if no actor cookie is found, if the server reports GraphQL
            errors, or if it returns an empty access token.
        requests.RequestException: Transport or HTTP errors from the
            createAccessToken request are not wrapped and propagate as-is.
    """
    _check_playwright_ready()

    from playwright.sync_api import (
        TimeoutError as PlaywrightTimeoutError,
        sync_playwright,
    )

    auth_path = "/support/authenticate" if support else "/authenticate"
    if support:
        click.echo("Opening browser for support SSO login...")
    else:
        click.echo("Opening browser for SSO login...")
    click.echo("Complete the login in your browser.\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            page = context.new_page()

            page.goto(f"{frontend_url}{auth_path}")

            # Wait for the actor cookie, which signals successful SSO login.
            try:
                page.wait_for_function(
                    """() => document.cookie.split('; ').some(c => c.startsWith('actor='))""",
                    timeout=timeout_ms,
                )
            except PlaywrightTimeoutError as e:
                raise click.ClickException(
                    f"SSO login timed out after {timeout_ms // 1000} seconds. "
                    "Please try again."
                ) from e
            except Exception as e:
                # Any other failure (e.g. the browser window was closed) is
                # not a timeout, so it must not be reported as one.
                raise click.ClickException(
                    f"SSO login did not complete: {e}. Please try again."
                ) from e

            # Extract cookies from the browser context
            cookies = context.cookies()
        finally:
            # Always close the browser, including on the error paths above.
            browser.close()

    # Build a requests.Session with the extracted cookies
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(
            cookie["name"],
            cookie["value"],
            domain=cookie.get("domain", ""),
            path=cookie.get("path", "/"),
        )

    # Extract actor URN from the (URL-encoded) actor cookie
    actor_urn = next(
        (
            urllib.parse.unquote(cookie["value"])
            for cookie in cookies
            if cookie["name"] == "actor"
        ),
        None,
    )
    if not actor_urn:
        raise click.ClickException(
            "SSO login succeeded but no actor cookie found. "
            "This may indicate an incompatible DataHub version."
        )

    click.echo(f"✓ Logged in as {actor_urn}")

    _warn_about_existing_cli_tokens(session, frontend_url, actor_urn)

    # Generate an access token via the frontend GraphQL API.
    # The name is built from CLI_TOKEN_PREFIX so that tokens created here are
    # the ones the existing-token warning recognizes.
    timestamp = datetime.now().astimezone().isoformat()
    token_name = f"{CLI_TOKEN_PREFIX}{timestamp}"

    json_payload = {
        "query": """mutation createAccessToken($input: CreateAccessTokenInput!) {
            createAccessToken(input: $input) {
                accessToken
                metadata {
                    id
                    actorUrn
                    ownerUrn
                    name
                    description
                }
            }
        }""",
        "variables": {
            "input": {
                "type": "PERSONAL",
                "actorUrn": actor_urn,
                "duration": token_duration,
                "name": token_name,
            }
        },
    }

    response = session.post(
        f"{frontend_url}/api/v2/graphql", json=json_payload, timeout=30
    )
    response.raise_for_status()

    data = response.json()
    if data.get("errors"):
        error_msg = data["errors"][0].get("message", str(data["errors"]))
        raise click.ClickException(
            f"Failed to create access token: {error_msg}\n"
            "Check that personal access tokens are enabled and your account has permission."
        )
    # `or {}` covers explicit JSON nulls for "data" / "createAccessToken", so
    # a null payload reaches the ClickException below instead of raising
    # AttributeError.
    create_result = (data.get("data") or {}).get("createAccessToken") or {}
    access_token = create_result.get("accessToken")
    if not access_token:
        raise click.ClickException(
            "Server returned empty access token. Contact your DataHub administrator."
        )

    return token_name, access_token

0 commit comments

Comments
 (0)