Skip to content

Commit 20ae435

Browse files
shirshanka and claude
authored and committed
feat(cli): add datahub init --sso for browser-based SSO login
Users authenticating via SSO (OIDC/SAML) can now configure the CLI without manual token copy-paste. The `--sso` flag opens a Chromium browser via Playwright, lets the user complete SSO, then automatically extracts the session and generates a personal access token. Flow: browser opens → user completes SSO → CLI captures actor cookie → generates token via GraphQL → writes ~/.datahubenv. No server-side changes required. Playwright is an optional dependency behind the `sso` extra (`pip install 'acryl-datahub[sso]'`). Clear install instructions are shown if it's missing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ba9ef83 commit 20ae435

File tree

6 files changed

+483
-13
lines changed

6 files changed

+483
-13
lines changed

metadata-ingestion/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1226,6 +1226,7 @@
12261226
]
12271227
)
12281228
),
1229+
"sso": list(framework_common | {"playwright>=1.40.0"}),
12291230
"cloud": ["acryl-datahub-cloud"],
12301231
"dev": list(dev_requirements),
12311232
"docs": list(

metadata-ingestion/src/datahub/cli/resources/INIT_AGENT_CONTEXT.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,27 @@ export DATAHUB_GMS_TOKEN=<your-token>
4848
datahub init
4949
```
5050

51+
## SSO browser login
52+
53+
For DataHub instances using SSO (OIDC/SAML), use `--sso` to authenticate via browser:
54+
55+
```bash
56+
# Opens browser — complete SSO, CLI captures session and generates token
57+
datahub init --sso --host https://your-instance.example.com/gms
58+
59+
# Custom token duration
60+
datahub init --sso --host https://your-instance.example.com/gms --token-duration ONE_MONTH
61+
```
62+
63+
**Prerequisites** (one-time setup):
64+
```bash
65+
pip install 'acryl-datahub[sso]' # or: uv pip install 'acryl-datahub[sso]'
66+
playwright install chromium
67+
```
68+
69+
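`--sso` is mutually exclusive with `--token`, `--username`, and `--password`. This also applies when a token is supplied through the `DATAHUB_GMS_TOKEN` environment variable: unset it before running `datahub init --sso`.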
70+
If Playwright is not installed, the command prints step-by-step install instructions and exits.
71+
5172
## Environment variables
5273

5374
| Variable | CLI equivalent |
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import urllib.parse
2+
from datetime import datetime
3+
from typing import Tuple
4+
5+
import click
6+
import requests
7+
8+
_INSTALL_HELP = """\
9+
The --sso flag requires Playwright and a Chromium browser.
10+
11+
Step 1 — Install the Python package (pick your package manager):
12+
pip install 'acryl-datahub[sso]'
13+
uv pip install 'acryl-datahub[sso]'
14+
pip install 'playwright>=1.40.0'
15+
16+
Step 2 — Download the Chromium browser binary:
17+
playwright install chromium\
18+
"""
19+
20+
21+
def _check_playwright_ready() -> None:
    """Fail fast when the optional Playwright dependency cannot be imported.

    Only the Python package is probed here. A missing Chromium binary is not
    detected by this check; that surfaces later, when the browser is launched.

    Raises:
        click.UsageError: If ``playwright.sync_api`` cannot be imported. The
            message carries the step-by-step install instructions.
    """
    try:
        from playwright.sync_api import sync_playwright  # noqa: F401
    except ImportError as import_error:
        message = f"Playwright is not installed.\n\n{_INSTALL_HELP}"
        raise click.UsageError(message) from import_error
34+
35+
36+
def _actor_urn_from_cookies(cookies: list) -> str:
    """Return the URL-decoded actor URN stored in the ``actor`` cookie.

    Args:
        cookies: Cookie dicts as returned by Playwright's ``context.cookies()``.

    Raises:
        click.ClickException: If no ``actor`` cookie exists or its value is empty.
    """
    for cookie in cookies:
        if cookie["name"] == "actor":
            actor_urn = urllib.parse.unquote(cookie["value"])
            if actor_urn:
                return actor_urn
            break

    raise click.ClickException(
        "SSO login succeeded but no actor cookie found. "
        "This may indicate an incompatible DataHub version."
    )


def _access_token_from_response(data: object) -> str:
    """Pull the access token out of a ``createAccessToken`` GraphQL response.

    GraphQL servers report failures with ``"data": null`` (or a null
    ``createAccessToken`` field) next to an ``errors`` list, so every level is
    treated as possibly missing or null rather than chained with ``.get``.

    Raises:
        click.ClickException: If the response carries no access token. The
            first GraphQL error message is included when one is available.
    """
    body = data if isinstance(data, dict) else {}
    result = (body.get("data") or {}).get("createAccessToken") or {}
    access_token = result.get("accessToken")
    if access_token:
        return access_token

    errors = body.get("errors") or []
    first_error = errors[0] if errors else None
    error_msg = "Unknown error"
    if isinstance(first_error, dict) and first_error.get("message"):
        error_msg = first_error["message"]
    raise click.ClickException(f"Failed to generate access token: {error_msg}")


def browser_sso_login(
    frontend_url: str, token_duration: str, timeout_ms: int = 120_000
) -> Tuple[str, str]:
    """Open a browser for SSO login, capture the session, and mint an access token.

    Flow: launch a visible Chromium window on ``<frontend_url>/authenticate``,
    wait until the ``actor`` cookie appears, copy the browser cookies into a
    ``requests`` session, then call the frontend GraphQL ``createAccessToken``
    mutation with that session.

    Args:
        frontend_url: The DataHub frontend URL (e.g. http://localhost:9002).
            A trailing slash is tolerated.
        token_duration: Token validity duration (e.g. ONE_HOUR).
        timeout_ms: How long to wait for SSO login to complete, in milliseconds.

    Returns:
        Tuple of (token_name, access_token).

    Raises:
        click.UsageError: If Playwright is not installed.
        click.ClickException: On login timeout, if the browser is closed or
            fails before login completes, on a missing actor cookie, or when
            the GraphQL response contains no access token.
        requests.RequestException: If the token request fails at the HTTP
            level (connection error, timeout, or non-2xx status).
    """
    _check_playwright_ready()

    from playwright.sync_api import Error as PlaywrightError
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
    from playwright.sync_api import sync_playwright

    # Avoid "//authenticate" and "//api/v2/graphql" when the URL ends in "/".
    base_url = frontend_url.rstrip("/")

    click.echo("Opening browser for SSO login...")
    click.echo("Complete the login in your browser.\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            page = context.new_page()

            page.goto(f"{base_url}/authenticate")

            # Wait for the actor cookie, which signals successful SSO login.
            try:
                page.wait_for_function(
                    """() => document.cookie.split('; ').some(c => c.startsWith('actor='))""",
                    timeout=timeout_ms,
                )
            except PlaywrightTimeoutError as e:
                raise click.ClickException(
                    f"SSO login timed out after {timeout_ms // 1000} seconds. "
                    "Please try again."
                ) from e
            except PlaywrightError as e:
                # Not a timeout: e.g. the user closed the window or the page
                # crashed. Report it as such instead of as a timeout.
                raise click.ClickException(
                    "SSO login did not complete: the browser was closed or the "
                    f"page failed ({e}). Please try again."
                ) from e

            # Extract cookies from the browser context
            cookies = context.cookies()
        finally:
            # Always release the browser, including when goto/cookies raise.
            browser.close()

    actor_urn = _actor_urn_from_cookies(cookies)
    click.echo(f"✓ Logged in as {actor_urn}")

    # Generate an access token via the frontend GraphQL API
    timestamp = datetime.now().astimezone().isoformat()
    token_name = f"cli token {timestamp}"

    json_payload = {
        "query": """mutation createAccessToken($input: CreateAccessTokenInput!) {
createAccessToken(input: $input) {
accessToken
metadata {
id
actorUrn
ownerUrn
name
description
}
}
}""",
        "variables": {
            "input": {
                "type": "PERSONAL",
                "actorUrn": actor_urn,
                "duration": token_duration,
                "name": token_name,
            }
        },
    }

    # Replay the browser session through requests; the context manager closes
    # the pooled connections once the single call is done.
    with requests.Session() as session:
        for cookie in cookies:
            session.cookies.set(
                cookie["name"],
                cookie["value"],
                domain=cookie.get("domain", ""),
                path=cookie.get("path", "/"),
            )

        # requests never times out by default; bound the call so the CLI
        # cannot hang forever on an unresponsive frontend.
        response = session.post(
            f"{base_url}/api/v2/graphql", json=json_payload, timeout=30
        )
        response.raise_for_status()
        data = response.json()

    access_token = _access_token_from_response(data)
    return token_name, access_token

metadata-ingestion/src/datahub/entrypoints.py

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def _validate_init_inputs(
154154
username: Optional[str],
155155
password: Optional[str],
156156
token_duration: Optional[str],
157+
sso: bool = False,
157158
) -> None:
158159
"""Validate init command inputs for consistency.
159160
@@ -163,10 +164,25 @@ def _validate_init_inputs(
163164
username: Username value (if provided)
164165
password: Password value (if provided)
165166
token_duration: Token expiration duration (if provided)
167+
sso: Whether SSO browser login is requested
166168
167169
Raises:
168170
click.UsageError: If inputs are invalid or inconsistent
169171
"""
172+
# SSO is mutually exclusive with other auth methods
173+
if sso:
174+
if token or os.environ.get("DATAHUB_GMS_TOKEN"):
175+
raise click.UsageError(
176+
"--sso cannot be used with --token. "
177+
"Use --sso alone to authenticate via browser SSO."
178+
)
179+
if username or password or get_username() or get_password():
180+
raise click.UsageError(
181+
"--sso cannot be used with --username/--password. "
182+
"Use --sso alone to authenticate via browser SSO."
183+
)
184+
return
185+
170186
# Check if credentials will come from CLI args or env vars
171187
username_provided = username or get_username()
172188
password_provided = password or get_password()
@@ -258,6 +274,12 @@ def _validate_init_inputs(
258274
default=False,
259275
help="Overwrite existing config without confirmation",
260276
)
277+
@click.option(
278+
"--sso",
279+
is_flag=True,
280+
default=False,
281+
help="Open browser for SSO login (requires: pip install 'acryl-datahub[sso]' && playwright install chromium)",
282+
)
261283
@click.option(
262284
"--agent-context",
263285
is_flag=True,
@@ -272,6 +294,7 @@ def init(
272294
password: Optional[str] = None,
273295
token_duration: Optional[str] = None,
274296
force: bool = False,
297+
sso: bool = False,
275298
agent_context: bool = False,
276299
) -> None:
277300
"""Configure which DataHub instance to connect to.
@@ -290,19 +313,22 @@ def init(
290313
291314
\b
292315
Mode 1: Auto-Generate Token from Username/Password
293-
# Default duration: ONE_MONTH for localhost, ONE_HOUR for remote instances
294316
datahub init --username alice --password secret
295-
296-
# Custom duration (for long-running jobs)
297-
datahub init --username alice --password secret --token-duration ONE_MONTH
298-
299-
# Non-expiring token (for CI/CD)
300-
datahub init --username alice --password secret --token-duration NO_EXPIRY
317+
datahub init --username alice --password secret \\
318+
--token-duration ONE_MONTH
319+
datahub init --username alice --password secret \\
320+
--token-duration NO_EXPIRY
301321
302322
\b
303323
Mode 2: Use Existing Token
304324
datahub init --token <your-existing-token>
305325
326+
\b
327+
Mode 3: SSO Browser Login (OIDC/SAML)
328+
datahub init --sso --host https://example.com/gms
329+
datahub init --sso --host https://example.com/gms \\
330+
--token-duration ONE_MONTH
331+
306332
\b
307333
Environment Variables (for automation):
308334
export DATAHUB_GMS_URL=http://localhost:8080
@@ -318,7 +344,8 @@ def init(
318344
319345
\b
320346
DataHub Cloud (Acryl-hosted instances):
321-
datahub init --host https://your-instance.acryl.io/gms --token <your-token>
347+
datahub init --token <token> \\
348+
--host https://your-instance.acryl.io/gms
322349
"""
323350
if agent_context:
324351
text: str = (
@@ -338,7 +365,9 @@ def init(
338365
)
339366

340367
# Validate input combinations
341-
_validate_init_inputs(use_password, token, username, password, token_duration)
368+
_validate_init_inputs(
369+
use_password, token, username, password, token_duration, sso=sso
370+
)
342371

343372
# Handle overwrite confirmation: prompt only on interactive TTYs.
344373
# Non-TTY environments (agents, CI) silently overwrite — same as --force.
@@ -348,9 +377,11 @@ def init(
348377
# Get host (CLI arg > Env var > silent default if credentials provided > prompt)
349378
# When credentials are supplied non-interactively, skip the prompt and default to
350379
# localhost:8080 — users connecting to a different host will pass --host explicitly.
351-
_credentials_non_interactive = bool(
352-
(username or get_username()) and (password or get_password())
353-
) or bool(token or os.environ.get("DATAHUB_GMS_TOKEN"))
380+
_credentials_non_interactive = (
381+
bool((username or get_username()) and (password or get_password()))
382+
or bool(token or os.environ.get("DATAHUB_GMS_TOKEN"))
383+
or sso
384+
)
354385
if (
355386
host is None
356387
and not os.environ.get("DATAHUB_GMS_URL")
@@ -380,7 +411,15 @@ def init(
380411
password_provided = password or get_password()
381412
should_generate_token = bool(username_provided and password_provided)
382413

383-
if should_generate_token or use_password:
414+
if sso:
415+
# SSO browser login flow
416+
from datahub.cli.cli_utils import guess_frontend_url_from_gms_url
417+
from datahub.cli.sso_cli import browser_sso_login
418+
419+
frontend_url = guess_frontend_url_from_gms_url(host_value)
420+
_, token_value = browser_sso_login(frontend_url, effective_duration)
421+
click.echo(f"✓ Generated token (expires: {effective_duration})")
422+
elif should_generate_token or use_password:
384423
# Generate token from credentials
385424
username_value = get_init_config_value(
386425
arg_value=username,

0 commit comments

Comments
 (0)