diff --git a/docs-website/src/components/Feedback/index.js b/docs-website/src/components/Feedback/index.js
index ecabca445bd48c..990df7ce1c444f 100644
--- a/docs-website/src/components/Feedback/index.js
+++ b/docs-website/src/components/Feedback/index.js
@@ -2,7 +2,11 @@ import React, { useState, useMemo } from "react";
import clsx from "clsx";
import { supabase } from "./supabase";
import styles from "./styles.module.scss";
-import { LikeOutlined, DislikeOutlined, CheckCircleOutlined } from "@ant-design/icons";
+import {
+ LikeOutlined,
+ DislikeOutlined,
+ CheckCircleOutlined,
+} from "@ant-design/icons";
import { v4 as uuidv4 } from "uuid";
const Feedback = () => {
@@ -67,10 +71,22 @@ const Feedback = () => {
Is this page helpful?
-          <LikeOutlined onClick={() => handleReaction("positive")}>
+          <LikeOutlined
+            onClick={() => handleReaction("positive")}
+          >
-          <DislikeOutlined onClick={() => handleReaction("negative")}>
+          <DislikeOutlined
+            onClick={() => handleReaction("negative")}
+          >
diff --git a/docs-website/src/components/InteractiveDiagram/index.jsx b/docs-website/src/components/InteractiveDiagram/index.jsx
new file mode 100644
index 00000000000000..dc7bf21972057a
--- /dev/null
+++ b/docs-website/src/components/InteractiveDiagram/index.jsx
@@ -0,0 +1,195 @@
+import React, { useCallback } from 'react';
+import ReactFlow, {
+ MiniMap,
+ Controls,
+ Background,
+ useNodesState,
+ useEdgesState,
+ addEdge,
+} from 'reactflow';
+import 'reactflow/dist/style.css';
+import styles from './styles.module.css';
+
+const InteractiveDiagram = ({
+ nodes: initialNodes = [],
+ edges: initialEdges = [],
+ title,
+ height = '400px',
+ showMiniMap = true,
+ showControls = true,
+ showBackground = true,
+ backgroundType = 'dots'
+}) => {
+ const [nodes, setNodes, onNodesChange] = useNodesState(initialNodes);
+ const [edges, setEdges, onEdgesChange] = useEdgesState(initialEdges);
+
+ const onConnect = useCallback(
+ (params) => setEdges((eds) => addEdge(params, eds)),
+ [setEdges],
+ );
+
+  return (
+    <div className={styles.diagramContainer}>
+      {title && <h3 className={styles.diagramTitle}>{title}</h3>}
+      <div className={styles.reactFlowWrapper} style={{ height }}>
+        <ReactFlow
+          nodes={nodes}
+          edges={edges}
+          onNodesChange={onNodesChange}
+          onEdgesChange={onEdgesChange}
+          onConnect={onConnect}
+          fitView
+          className={styles.reactFlow}
+        >
+          {showControls && <Controls className={styles.controls} />}
+          {showMiniMap && (
+            <MiniMap className={styles.miniMap} />
+          )}
+          {showBackground && (
+            <Background variant={backgroundType} className={styles.background} />
+          )}
+        </ReactFlow>
+      </div>
+    </div>
+  );
+};
+
+// Pre-defined diagram configurations for common DataHub workflows
+export const DataHubWorkflows = {
+ ingestionFlow: {
+ nodes: [
+ {
+ id: '1',
+ type: 'input',
+ data: { label: '🗄️ Data Sources\n(Kafka, Hive, HDFS)' },
+ position: { x: 0, y: 0 },
+ className: 'source-node',
+ },
+ {
+ id: '2',
+ data: { label: '⚙️ DataHub Ingestion\nExtract Metadata' },
+ position: { x: 200, y: 0 },
+ className: 'process-node',
+ },
+ {
+ id: '3',
+ data: { label: '📊 Metadata Storage\nElasticsearch + MySQL' },
+ position: { x: 400, y: 0 },
+ className: 'storage-node',
+ },
+ {
+ id: '4',
+ type: 'output',
+ data: { label: '🔍 DataHub UI\nDiscovery & Lineage' },
+ position: { x: 600, y: 0 },
+ className: 'output-node',
+ },
+ ],
+ edges: [
+ { id: 'e1-2', source: '1', target: '2', animated: true, label: 'metadata' },
+ { id: 'e2-3', source: '2', target: '3', animated: true, label: 'store' },
+ { id: 'e3-4', source: '3', target: '4', animated: true, label: 'serve' },
+ ],
+ },
+
+ discoveryFlow: {
+ nodes: [
+ {
+ id: '1',
+ type: 'input',
+ data: { label: '👤 Data Analyst\nNeeds user metrics' },
+ position: { x: 0, y: 100 },
+ className: 'user-node',
+ },
+ {
+ id: '2',
+ data: { label: '🔍 Search DataHub\n"user created deleted"' },
+ position: { x: 200, y: 0 },
+ className: 'search-node',
+ },
+ {
+ id: '3',
+ data: { label: '📋 Browse Results\nfct_users_created' },
+ position: { x: 200, y: 100 },
+ className: 'browse-node',
+ },
+ {
+ id: '4',
+ data: { label: '📊 Examine Schema\nColumns & Types' },
+ position: { x: 200, y: 200 },
+ className: 'schema-node',
+ },
+ {
+ id: '5',
+ type: 'output',
+ data: { label: '✅ Found Data\nReady for Analysis' },
+ position: { x: 400, y: 100 },
+ className: 'success-node',
+ },
+ ],
+ edges: [
+ { id: 'e1-2', source: '1', target: '2', label: 'search' },
+ { id: 'e1-3', source: '1', target: '3', label: 'browse' },
+ { id: 'e1-4', source: '1', target: '4', label: 'explore' },
+ { id: 'e2-5', source: '2', target: '5' },
+ { id: 'e3-5', source: '3', target: '5' },
+ { id: 'e4-5', source: '4', target: '5' },
+ ],
+ },
+
+ lineageFlow: {
+ nodes: [
+ {
+ id: '1',
+ data: { label: '📥 Raw Events\nKafka Stream' },
+ position: { x: 0, y: 0 },
+ className: 'source-node',
+ },
+ {
+ id: '2',
+ data: { label: '⚙️ ETL Process\nSpark Job' },
+ position: { x: 200, y: 0 },
+ className: 'process-node',
+ },
+ {
+ id: '3',
+ data: { label: '🗄️ Analytics Table\nfct_users_created' },
+ position: { x: 400, y: 0 },
+ className: 'table-node',
+ },
+ {
+ id: '4',
+ data: { label: '📊 Dashboard\nUser Metrics' },
+ position: { x: 600, y: 0 },
+ className: 'output-node',
+ },
+ {
+ id: '5',
+ data: { label: '🔧 Data Quality\nValidation Rules' },
+ position: { x: 200, y: 100 },
+ className: 'quality-node',
+ },
+ ],
+ edges: [
+ { id: 'e1-2', source: '1', target: '2', animated: true, label: 'raw data' },
+ { id: 'e2-3', source: '2', target: '3', animated: true, label: 'processed' },
+ { id: 'e3-4', source: '3', target: '4', animated: true, label: 'visualize' },
+ { id: 'e2-5', source: '2', target: '5', label: 'validate' },
+ { id: 'e5-3', source: '5', target: '3', label: 'quality check' },
+ ],
+ },
+};
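+
+// Example usage in MDX (illustrative; the @site import alias is the standard Docusaurus alias):
+//   import InteractiveDiagram, { DataHubWorkflows } from '@site/src/components/InteractiveDiagram';
+//   <InteractiveDiagram title="Metadata Ingestion" {...DataHubWorkflows.ingestionFlow} />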
+
+export default InteractiveDiagram;
diff --git a/docs-website/src/components/InteractiveDiagram/styles.module.css b/docs-website/src/components/InteractiveDiagram/styles.module.css
new file mode 100644
index 00000000000000..6ce55ad010f023
--- /dev/null
+++ b/docs-website/src/components/InteractiveDiagram/styles.module.css
@@ -0,0 +1,222 @@
+/* Interactive Diagram Styling for DataHub */
+
+.diagramContainer {
+ margin: 24px 0;
+ border-radius: 12px;
+ overflow: hidden;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+ border: 1px solid var(--ifm-color-emphasis-200);
+ background: var(--ifm-background-color);
+}
+
+.diagramTitle {
+ margin: 0;
+ padding: 16px 20px;
+ background: var(--ifm-color-primary-lightest);
+ color: var(--ifm-color-primary-darkest);
+ font-weight: 600;
+ font-size: 16px;
+ border-bottom: 1px solid var(--ifm-color-emphasis-200);
+}
+
+.reactFlowWrapper {
+ position: relative;
+ background: var(--ifm-background-color);
+}
+
+.reactFlow {
+ background: var(--ifm-background-color);
+}
+
+/* Node Styling */
+.reactFlow :global(.react-flow__node) {
+ font-family: var(--ifm-font-family-base);
+ font-size: 12px;
+ font-weight: 500;
+ border-radius: 8px;
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+ border: 2px solid var(--ifm-color-emphasis-300);
+ background: var(--ifm-background-color);
+ color: var(--ifm-color-content);
+ padding: 8px 12px;
+ min-width: 120px;
+ text-align: center;
+ transition: all 0.2s ease;
+}
+
+.reactFlow :global(.react-flow__node:hover) {
+ transform: translateY(-2px);
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15);
+}
+
+.reactFlow :global(.react-flow__node.selected) {
+ border-color: var(--ifm-color-primary);
+ box-shadow: 0 0 0 2px var(--ifm-color-primary-lightest);
+}
+
+/* Specific Node Types */
+.reactFlow :global(.source-node) {
+ background: var(--ifm-color-success-lightest);
+ border-color: var(--ifm-color-success);
+ color: var(--ifm-color-success-darkest);
+}
+
+.reactFlow :global(.process-node) {
+ background: var(--ifm-color-info-lightest);
+ border-color: var(--ifm-color-info);
+ color: var(--ifm-color-info-darkest);
+}
+
+.reactFlow :global(.storage-node) {
+ background: var(--ifm-color-warning-lightest);
+ border-color: var(--ifm-color-warning);
+ color: var(--ifm-color-warning-darkest);
+}
+
+.reactFlow :global(.output-node) {
+ background: var(--ifm-color-primary-lightest);
+ border-color: var(--ifm-color-primary);
+ color: var(--ifm-color-primary-darkest);
+}
+
+.reactFlow :global(.user-node) {
+ background: #f0f9ff;
+ border-color: #0ea5e9;
+ color: #0c4a6e;
+}
+
+.reactFlow :global(.search-node) {
+ background: #fef3c7;
+ border-color: #f59e0b;
+ color: #92400e;
+}
+
+.reactFlow :global(.browse-node) {
+ background: #ecfdf5;
+ border-color: #10b981;
+ color: #065f46;
+}
+
+.reactFlow :global(.schema-node) {
+ background: #f3e8ff;
+ border-color: #8b5cf6;
+ color: #581c87;
+}
+
+.reactFlow :global(.success-node) {
+ background: var(--ifm-color-success-lightest);
+ border-color: var(--ifm-color-success);
+ color: var(--ifm-color-success-darkest);
+}
+
+.reactFlow :global(.table-node) {
+ background: #fdf2f8;
+ border-color: #ec4899;
+ color: #9d174d;
+}
+
+.reactFlow :global(.quality-node) {
+ background: #fff7ed;
+ border-color: #f97316;
+ color: #9a3412;
+}
+
+/* Edge Styling */
+.reactFlow :global(.react-flow__edge-path) {
+ stroke: var(--ifm-color-primary);
+ stroke-width: 2px;
+}
+
+.reactFlow :global(.react-flow__edge.animated path) {
+ stroke-dasharray: 5;
+ animation: dashdraw 0.5s linear infinite;
+}
+
+.reactFlow :global(.react-flow__edge-text) {
+ font-family: var(--ifm-font-family-base);
+ font-size: 11px;
+ font-weight: 500;
+ fill: var(--ifm-color-content);
+ background: var(--ifm-background-color);
+ padding: 2px 4px;
+ border-radius: 4px;
+}
+
+/* Controls Styling */
+.controls {
+ background: var(--ifm-background-color);
+ border: 1px solid var(--ifm-color-emphasis-200);
+ border-radius: 8px;
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+}
+
+.controls :global(.react-flow__controls-button) {
+ background: var(--ifm-background-color);
+ border-color: var(--ifm-color-emphasis-200);
+ color: var(--ifm-color-content);
+ transition: all 0.2s ease;
+}
+
+.controls :global(.react-flow__controls-button:hover) {
+ background: var(--ifm-color-emphasis-100);
+ border-color: var(--ifm-color-primary);
+}
+
+/* MiniMap Styling */
+.miniMap {
+ background: var(--ifm-color-emphasis-100);
+ border: 1px solid var(--ifm-color-emphasis-200);
+ border-radius: 8px;
+ overflow: hidden;
+}
+
+/* Background Styling */
+.background :global(.react-flow__background) {
+ background-color: var(--ifm-background-color);
+}
+
+/* Dark Mode Adjustments */
+[data-theme="dark"] .diagramContainer {
+ border-color: var(--ifm-color-emphasis-300);
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
+}
+
+[data-theme="dark"] .diagramTitle {
+ background: var(--ifm-color-emphasis-200);
+ color: var(--ifm-color-content);
+}
+
+[data-theme="dark"] .reactFlow :global(.react-flow__node) {
+ background: var(--ifm-color-emphasis-100);
+ border-color: var(--ifm-color-emphasis-400);
+ color: var(--ifm-color-content);
+}
+
+[data-theme="dark"] .reactFlow :global(.react-flow__edge-text) {
+ fill: var(--ifm-color-content);
+}
+
+/* Animation */
+@keyframes dashdraw {
+ to {
+ stroke-dashoffset: -10;
+ }
+}
+
+/* Responsive Design */
+@media (max-width: 768px) {
+ .diagramContainer {
+ margin: 16px 0;
+ }
+
+ .diagramTitle {
+ padding: 12px 16px;
+ font-size: 14px;
+ }
+
+ .reactFlow :global(.react-flow__node) {
+ font-size: 11px;
+ padding: 6px 8px;
+ min-width: 100px;
+ }
+}
diff --git a/docs-website/src/components/LineageLayoutGrid/index.jsx b/docs-website/src/components/LineageLayoutGrid/index.jsx
new file mode 100644
index 00000000000000..9d833821b9999c
--- /dev/null
+++ b/docs-website/src/components/LineageLayoutGrid/index.jsx
@@ -0,0 +1,436 @@
+import React, { useRef, useEffect, useState } from "react";
+import styles from "./styles.module.css";
+import DataHubLineageNode from "../DataHubLineageNode";
+
+const LineageLayoutGrid = ({
+ title,
+ layers = [],
+ showConnections = true,
+ allExpanded = false,
+ onToggleExpand = () => {},
+ connectionColors = {},
+ defaultColors = ["#533FD1", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6"],
+}) => {
+ const containerRef = useRef(null);
+ const [connections, setConnections] = useState([]);
+
+ // Build a map of all nodes with their positions (supports nested sub-layers)
+ const buildNodeMap = () => {
+ const nodeMap = new Map();
+
+ const addNodesFromLayer = (layer, layerIndex) => {
+ // Add standalone nodes if present
+ if (Array.isArray(layer.nodes)) {
+ layer.nodes.forEach((node, nodeIndex) => {
+ const element = containerRef.current.querySelector(
+ `[data-node-id="${node.name}"]`,
+ );
+ if (element) {
+ const containerRect = containerRef.current.getBoundingClientRect();
+ const nodeElement = element.firstElementChild || element;
+ const nodeRect = nodeElement.getBoundingClientRect();
+ const containerScrollLeft = containerRef.current.scrollLeft;
+ const containerScrollTop = containerRef.current.scrollTop;
+
+ nodeMap.set(node.name, {
+ node,
+ layerIndex,
+ nodeIndex,
+ x: nodeRect.left - containerRect.left + containerScrollLeft,
+ y: nodeRect.top - containerRect.top + containerScrollTop,
+ width: nodeRect.width,
+ height: nodeRect.height,
+ centerX:
+ nodeRect.left -
+ containerRect.left +
+ nodeRect.width / 2 +
+ containerScrollLeft,
+ centerY:
+ nodeRect.top -
+ containerRect.top +
+ nodeRect.height / 2 +
+ containerScrollTop,
+ rightEdge:
+ nodeRect.right - containerRect.left + containerScrollLeft,
+ leftEdge:
+ nodeRect.left - containerRect.left + containerScrollLeft,
+ });
+ }
+ });
+ }
+
+ // Recurse into sub-layers if present
+ if (Array.isArray(layer.subLayers)) {
+ layer.subLayers.forEach((subLayer) =>
+ addNodesFromLayer(subLayer, layerIndex),
+ );
+ }
+ };
+
+ layers.forEach((layer, layerIndex) => addNodesFromLayer(layer, layerIndex));
+
+ return nodeMap;
+ };
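+
+  // Note: coordinates stored above are relative to the scroll container
+  // (container rect offset plus scroll position), so the absolutely positioned
+  // SVG overlay can consume them directly without further translation.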
+
+ // Calculate connections with proper routing around nodes
+ const calculateConnections = () => {
+ if (!containerRef.current || !showConnections) return;
+
+ const nodeMap = buildNodeMap();
+ const newConnections = [];
+
+ // Find all connections across all layers (including nested sub-layers)
+ const processNodes = (nodes) => {
+ nodes.forEach((sourceNode, sourceIndex) => {
+ if (!sourceNode.downstreamConnections) return;
+
+ sourceNode.downstreamConnections.forEach((targetNodeName) => {
+ const sourceNodeData = nodeMap.get(sourceNode.name);
+ const targetNodeData = nodeMap.get(targetNodeName);
+
+ if (sourceNodeData && targetNodeData) {
+ const connectionColor =
+ connectionColors[sourceNode.name] ||
+ defaultColors[sourceIndex % defaultColors.length];
+ const path = calculateRoutingPath(
+ sourceNodeData,
+ targetNodeData,
+ nodeMap,
+ );
+
+ const arrowMarkerWidth = 10; // keep in sync with marker path and viewBox size
+ const backoffPx = 10; // small gap to avoid overlapping the node border
+ newConnections.push({
+ id: `${sourceNode.name}-${targetNodeName}`,
+ sourceX: sourceNodeData.rightEdge,
+ sourceY: sourceNodeData.centerY,
+ // End the path at the center of the back of the arrowhead, with a slight gap before the node.
+ targetX: targetNodeData.leftEdge - (arrowMarkerWidth + backoffPx),
+ targetY: targetNodeData.centerY,
+ color: connectionColor,
+ path: path,
+ layerIndex: sourceNodeData.layerIndex,
+ sourceIndex,
+ });
+ }
+ });
+ });
+ };
+
+ const traverseForConnections = (layer) => {
+ if (Array.isArray(layer.nodes)) processNodes(layer.nodes);
+ if (Array.isArray(layer.subLayers))
+ layer.subLayers.forEach(traverseForConnections);
+ };
+
+ layers.forEach(traverseForConnections);
+
+ setConnections(newConnections);
+ };
+
+ // Calculate routing path that avoids nodes with proper collision detection
+ const calculateRoutingPath = (sourceData, targetData, nodeMap) => {
+ const sourceX = sourceData.rightEdge;
+ const sourceY = sourceData.centerY;
+ const targetX = targetData.leftEdge - 16;
+ const targetY = targetData.centerY;
+
+ // Check if there are nodes between source and target that we need to route around
+ const intermediateNodes = Array.from(nodeMap.values()).filter(
+ (nodeData) => {
+ // Add buffer zones around nodes to ensure we don't clip them
+ const nodeBuffer = 25; // Extra space around nodes
+ const nodeLeft = nodeData.x - nodeBuffer;
+ const nodeRight = nodeData.x + nodeData.width + nodeBuffer;
+ const nodeTop = nodeData.y - nodeBuffer;
+ const nodeBottom = nodeData.y + nodeData.height + nodeBuffer;
+
+ // Check if this node is in the horizontal path between source and target
+ const isInHorizontalPath = nodeLeft < targetX && nodeRight > sourceX;
+ const isNotSourceOrTarget =
+ nodeData.node.name !== sourceData.node.name &&
+ nodeData.node.name !== targetData.node.name;
+
+ // Also check if the direct line from source to target would intersect this node
+ const directLineIntersectsNode =
+ // Line passes through the node's Y range
+ (sourceY <= nodeBottom && sourceY >= nodeTop) ||
+ (targetY <= nodeBottom && targetY >= nodeTop) ||
+ (sourceY <= nodeTop && targetY >= nodeBottom) ||
+ (sourceY >= nodeBottom && targetY <= nodeTop);
+
+ return (
+ isInHorizontalPath && isNotSourceOrTarget && directLineIntersectsNode
+ );
+ },
+ );
+
+ if (intermediateNodes.length === 0) {
+ // Direct path if no obstacles
+ return null; // Will use default curve
+ }
+
+ // Calculate routing paths that avoid all intermediate nodes
+ const nodeObstacles = intermediateNodes.map((nodeData) => ({
+ top: nodeData.y - 25, // Buffer above node
+ bottom: nodeData.y + nodeData.height + 25, // Buffer below node
+ left: nodeData.x - 25,
+ right: nodeData.x + nodeData.width + 25,
+ centerY: nodeData.centerY,
+ name: nodeData.node.name,
+ }));
+
+ // Find the best routing level (above or below obstacles)
+ const allTops = nodeObstacles.map((n) => n.top);
+ const allBottoms = nodeObstacles.map((n) => n.bottom);
+
+ const highestTop = Math.min(...allTops);
+ const lowestBottom = Math.max(...allBottoms);
+
+ // Calculate routing options with more clearance
+ const routingOffset = 60; // Larger offset to more clearly bend around nodes
+ const routeAbove = highestTop - routingOffset;
+ const routeBelow = lowestBottom + routingOffset;
+
+ // Choose the route that's closer to the average of source and target Y positions
+ const avgY = (sourceY + targetY) / 2;
+ const routingY =
+ Math.abs(routeAbove - avgY) < Math.abs(routeBelow - avgY)
+ ? routeAbove
+ : routeBelow;
+
+ return {
+ type: "routed",
+ routingY,
+ obstacles: nodeObstacles,
+ };
+ };
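+
+  // Worked example (illustrative): with a single obstacle spanning y = 180..240
+  // after the 25px buffer, routeAbove = 180 - 60 = 120 and routeBelow = 240 + 60 = 300;
+  // for a source at y = 140 and a target at y = 160 the average is 150, so the
+  // path bends above the obstacle (|120 - 150| < |300 - 150|).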
+
+ // Recalculate connections when layout changes
+ useEffect(() => {
+ const timer = setTimeout(calculateConnections, 100); // Small delay for DOM updates
+ return () => clearTimeout(timer);
+ }, [layers, allExpanded]);
+
+ // Recalculate on window resize and scroll
+ useEffect(() => {
+ const handleResize = () => calculateConnections();
+ const handleScroll = () => calculateConnections();
+
+ window.addEventListener("resize", handleResize);
+
+ // Add scroll listener to the container
+ if (containerRef.current) {
+ containerRef.current.addEventListener("scroll", handleScroll);
+ }
+
+ return () => {
+ window.removeEventListener("resize", handleResize);
+ if (containerRef.current) {
+ containerRef.current.removeEventListener("scroll", handleScroll);
+ }
+ };
+ }, []);
+
+ // Generate path that routes around nodes when needed
+ const generatePath = (connection) => {
+ const { sourceX, sourceY, targetX, targetY, path } = connection;
+
+ if (!path || path.type !== "routed") {
+ // Simple Bezier curve for direct connections
+ const horizontalDistance = targetX - sourceX;
+ const cp1X = sourceX + horizontalDistance * 0.5;
+ const cp1Y = sourceY;
+ const cp2X = sourceX + horizontalDistance * 0.5;
+ const cp2Y = targetY;
+
+ return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${targetX} ${targetY}`;
+ }
+
+ // Routed path using two cubic segments that pass through a safe routing level
+ const { routingY } = path;
+ const midX = sourceX + (targetX - sourceX) * 0.5;
+ const bend = 40; // Horizontal control point offset for smooth bends
+
+ // First curve: from source to mid point at routingY
+ const cp1X = sourceX + bend;
+ const cp1Y = sourceY;
+ const cp2X = midX - bend;
+ const cp2Y = routingY;
+
+ // Second curve: from mid point at routingY to target
+ const cp3X = midX + bend;
+ const cp3Y = routingY;
+ const cp4X = targetX - bend;
+ const cp4Y = targetY;
+
+ return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${midX} ${routingY} C ${cp3X} ${cp3Y}, ${cp4X} ${cp4Y}, ${targetX} ${targetY}`;
+ };
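+
+  // Example (illustrative): a direct connection from (0, 40) to (300, 120)
+  // produces "M 0 40 C 150 40, 150 120, 300 120": a single cubic Bezier whose
+  // control points sit at the horizontal midpoint, giving the familiar S-curve.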
+
+ // Recursive renderer for layers and sublayers
+ const renderLayerContent = (layer) => {
+ const hasSubLayers =
+ Array.isArray(layer.subLayers) && layer.subLayers.length > 0;
+
+ if (!hasSubLayers) {
+ // Render standalone nodes
+      return (
+        <div className={styles.layerNodes}>
+          {(layer.nodes || []).map((node, nodeIndex) => (
+            <div
+              key={node.name || nodeIndex}
+              className={styles.nodeWrapper}
+              data-node-id={node.name}
+            >
+              <DataHubLineageNode {...node} />
+            </div>
+          ))}
+        </div>
+      );
+ }
+
+ // Render sublayers: support horizontal columns or vertical stacks at each level
+ if (layer.subLayersLayout === "columns") {
+      return (
+        <div className={styles.subLayersRowContainer}>
+          {/* Leftmost column for standalone nodes at this level */}
+          {Array.isArray(layer.nodes) && layer.nodes.length > 0 && (
+            <div className={styles.subLayerColumn}>
+              <div className={styles.layerNodesLeft}>
+                {layer.nodes.map((node, nodeIndex) => (
+                  <div
+                    key={node.name || nodeIndex}
+                    className={styles.nodeWrapper}
+                    data-node-id={node.name}
+                  >
+                    <DataHubLineageNode {...node} />
+                  </div>
+                ))}
+              </div>
+            </div>
+          )}
+          {layer.subLayers.map((subLayer, subLayerIndex) => (
+            <div key={subLayerIndex} className={styles.subLayerColumn}>
+              {subLayer.title && (
+                <div className={styles.subLayerTitle}>{subLayer.title}</div>
+              )}
+              {renderLayerContent(subLayer)}
+            </div>
+          ))}
+        </div>
+      );
+ }
+
+ // Default vertical stack
+    return (
+      <div className={styles.subLayersContainer}>
+        {layer.subLayers.map((subLayer, subLayerIndex) => (
+          <div key={subLayerIndex} className={styles.subLayer}>
+            {subLayer.title && (
+              <div className={styles.subLayerTitle}>{subLayer.title}</div>
+            )}
+            {renderLayerContent(subLayer)}
+          </div>
+        ))}
+      </div>
+    );
+ };
+
+  return (
+    <div ref={containerRef} className={styles.lineageContainer}>
+      {title && <h3 className={styles.title}>{title}</h3>}
+
+      <div className={styles.layersGrid}>
+        {layers.map((layer, layerIndex) => (
+          <div key={layerIndex} className={styles.layer}>
+            {layer.title && (
+              <div className={styles.layerTitle}>{layer.title}</div>
+            )}
+            {renderLayerContent(layer)}
+          </div>
+        ))}
+      </div>
+
+      {/* SVG overlay for connections */}
+      {showConnections && connections.length > 0 && (
+        <svg className={styles.connectionsOverlay}>
+          <defs>
+            {connections.map((connection, index) => (
+              <marker
+                key={`arrow-${connection.id}`}
+                id={`arrow-${connection.id}`}
+                viewBox="0 0 10 10"
+                refX="10"
+                refY="5"
+                markerWidth="10"
+                markerHeight="10"
+                orient="auto"
+              >
+                <path d="M 0 0 L 10 5 L 0 10 z" fill={connection.color} />
+              </marker>
+            ))}
+          </defs>
+          {connections.map((connection) => (
+            <path
+              key={connection.id}
+              className={styles.connectionPath}
+              d={generatePath(connection)}
+              stroke={connection.color}
+              strokeWidth="2"
+              fill="none"
+              markerEnd={`url(#arrow-${connection.id})`}
+            />
+          ))}
+        </svg>
+      )}
+    </div>
+  );
+};
+
+export default LineageLayoutGrid;
+export { LineageLayoutGrid };
diff --git a/docs-website/src/components/LineageLayoutGrid/styles.module.css b/docs-website/src/components/LineageLayoutGrid/styles.module.css
new file mode 100644
index 00000000000000..b189d70f66bc3a
--- /dev/null
+++ b/docs-website/src/components/LineageLayoutGrid/styles.module.css
@@ -0,0 +1,204 @@
+/* LineageLayoutGrid Component Styles */
+.lineageContainer {
+ position: relative;
+ width: 100%;
+ max-width: 100%;
+ overflow-x: auto;
+ padding: 20px 40px; /* More horizontal padding to prevent cutoff */
+ background: var(--ifm-background-color);
+ border-radius: 8px;
+ border: 1px solid var(--ifm-color-emphasis-300);
+}
+
+.title {
+ position: sticky;
+ top: 0; /* stick to top of scroll container */
+ left: 50%; /* horizontally center within visible area */
+ transform: translateX(-50%);
+ z-index: 2; /* above connections overlay */
+ text-align: center;
+ margin: 0 0 24px 0;
+ font-size: 18px;
+ font-weight: 600;
+ color: var(--ifm-color-content);
+ background: var(
+ --ifm-background-color
+ ); /* readable over content while scrolling */
+ padding: 8px 12px;
+ border-radius: 12px;
+}
+
+/* CSS Grid Layout for Layers */
+.layersGrid {
+ display: grid;
+ grid-auto-flow: column;
+ grid-auto-columns: minmax(320px, 1fr);
+ gap: 100px; /* More space for connections */
+ align-items: start;
+ justify-content: start; /* Align to start to prevent cutoff */
+ min-height: 400px;
+ padding: 20px 0;
+ min-width: fit-content; /* Ensure grid doesn't shrink below content */
+}
+
+/* Individual Layer Styling */
+.layer {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ justify-content: flex-start;
+ min-width: 320px;
+ width: 100%; /* Take full grid column width */
+ height: 100%;
+}
+
+.layerTitle {
+ font-size: 14px;
+ font-weight: 600;
+ color: var(--ifm-color-emphasis-700);
+ text-align: center;
+ margin-bottom: 24px;
+ padding: 8px 16px;
+ background: var(--ifm-background-color);
+ border: 1px solid var(--ifm-color-emphasis-300);
+ border-radius: 20px;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ white-space: nowrap;
+}
+
+/* Sub-layers Container */
+.subLayersContainer {
+ display: flex;
+ flex-direction: column;
+ gap: 32px;
+ width: 100%;
+ align-items: center;
+}
+
+/* Horizontal sub-layer columns within a single layer */
+.subLayersRowContainer {
+ display: grid;
+ grid-auto-flow: column;
+ grid-auto-columns: minmax(320px, 1fr);
+ gap: 24px;
+ width: 100%;
+ align-items: start;
+ justify-content: start; /* ensure first subcolumn starts at left edge of layer */
+ justify-items: start; /* ensure subcolumn content aligns to left */
+}
+
+.subLayerColumn {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ width: 100%;
+}
+
+.subLayer {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ width: 100%;
+}
+
+.subLayerTitle {
+ font-size: 12px;
+ font-weight: 500;
+ color: var(--ifm-color-emphasis-600);
+ text-align: center;
+ margin-bottom: 16px;
+ padding: 6px 12px;
+ background: var(--ifm-color-emphasis-100);
+ border-radius: 16px;
+ letter-spacing: 0.3px;
+ white-space: nowrap;
+}
+
+/* Nodes within each layer */
+.layerNodes {
+ display: flex;
+ flex-direction: column;
+ gap: 20px;
+ align-items: center;
+ width: 100%;
+ flex: 1;
+ justify-content: center;
+}
+
+/* Left-aligned variant for sublayer columns */
+.layerNodesLeft {
+ display: flex;
+ flex-direction: column;
+ gap: 20px;
+ align-items: flex-start;
+ width: 100%;
+ flex: 1;
+ justify-content: flex-start;
+}
+
+.nodeWrapper {
+ width: 100%;
+ max-width: 300px;
+ position: relative;
+ display: flex;
+ justify-content: center; /* Center nodes within their wrapper */
+}
+
+/* SVG Overlay for Connections */
+.connectionsOverlay {
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ pointer-events: none;
+ z-index: 1;
+ overflow: visible;
+}
+
+/* Connection Path Styling */
+.connectionPath {
+ transition: all 0.3s ease;
+ filter: drop-shadow(0 1px 2px rgba(0, 0, 0, 0.1));
+}
+
+.connectionPath:hover {
+ stroke-width: 3;
+ opacity: 1;
+ filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.2));
+}
+
+/* Responsive Design */
+@media (max-width: 768px) {
+ .layersGrid {
+ grid-auto-columns: minmax(280px, 1fr);
+ gap: 60px;
+ }
+
+ .nodeWrapper {
+ max-width: 260px;
+ }
+
+ .subLayersRowContainer {
+ grid-auto-columns: minmax(260px, 1fr);
+ gap: 16px;
+ }
+}
+
+/* Dark mode support */
+[data-theme="dark"] .lineageContainer {
+ background: var(--ifm-background-color);
+ border-color: var(--ifm-color-emphasis-300);
+}
+
+[data-theme="dark"] .layerTitle {
+ background: var(--ifm-background-color);
+ border-color: var(--ifm-color-emphasis-300);
+ color: var(--ifm-color-emphasis-800);
+}
+
+[data-theme="dark"] .subLayerTitle {
+ background: var(--ifm-color-emphasis-200);
+ color: var(--ifm-color-emphasis-700);
+}
diff --git a/docs-website/src/components/NextStepButton/index.jsx b/docs-website/src/components/NextStepButton/index.jsx
new file mode 100644
index 00000000000000..b62e31e9b18a76
--- /dev/null
+++ b/docs-website/src/components/NextStepButton/index.jsx
@@ -0,0 +1,47 @@
+import React from "react";
+import Link from "@docusaurus/Link";
+import styles from "./styles.module.css";
+
+const NextStepButton = ({
+ to,
+ children,
+ tutorialId,
+ currentStep,
+ variant = "primary",
+ icon = "→",
+}) => {
+ const handleClick = () => {
+ if (tutorialId && currentStep !== undefined) {
+ const storageKey = `datahub-tutorial-${tutorialId}`;
+ const savedProgress = localStorage.getItem(storageKey);
+ let completedSteps = new Set();
+
+ if (savedProgress) {
+ try {
+ completedSteps = new Set(JSON.parse(savedProgress));
+ } catch (e) {
+ console.warn("Failed to parse tutorial progress:", e);
+ }
+ }
+
+ // Mark current step as completed
+ completedSteps.add(`step-${currentStep}`);
+ localStorage.setItem(storageKey, JSON.stringify([...completedSteps]));
+ }
+ };
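+
+  // Progress is persisted per tutorial under the key "datahub-tutorial-<tutorialId>"
+  // as a JSON array of completed step ids, e.g. ["step-1", "step-2"].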
+
+  return (
+    <Link
+      to={to}
+      className={`${styles.nextStepButton} ${styles[variant]}`}
+      onClick={handleClick}
+    >
+      <span className={styles.content}>
+        {children}
+        <span className={styles.icon}>{icon}</span>
+      </span>
+    </Link>
+  );
+};
+
+export default NextStepButton;
diff --git a/docs-website/src/components/NextStepButton/styles.module.css b/docs-website/src/components/NextStepButton/styles.module.css
new file mode 100644
index 00000000000000..e24bb0dd6b2fdc
--- /dev/null
+++ b/docs-website/src/components/NextStepButton/styles.module.css
@@ -0,0 +1,65 @@
+.nextStepButton {
+ display: inline-flex;
+ align-items: center;
+ padding: 12px 24px;
+ border-radius: 8px;
+ text-decoration: none;
+ font-weight: 600;
+ font-size: 16px;
+ transition: all 0.2s ease;
+ border: 2px solid transparent;
+ margin: 16px 0;
+}
+
+.nextStepButton:hover {
+ text-decoration: none;
+ transform: translateY(-1px);
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
+}
+
+.primary {
+ background: var(--ifm-color-primary);
+ color: white;
+}
+
+.primary:hover {
+ background: var(--ifm-color-primary-dark);
+ color: white;
+}
+
+.secondary {
+ background: transparent;
+ color: var(--ifm-color-primary);
+ border-color: var(--ifm-color-primary);
+}
+
+.secondary:hover {
+ background: var(--ifm-color-primary);
+ color: white;
+}
+
+.content {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+}
+
+.icon {
+ font-size: 18px;
+ transition: transform 0.2s ease;
+}
+
+.nextStepButton:hover .icon {
+ transform: translateX(2px);
+}
+
+/* Dark mode support */
+[data-theme="dark"] .secondary {
+ border-color: var(--ifm-color-primary-light);
+ color: var(--ifm-color-primary-light);
+}
+
+[data-theme="dark"] .secondary:hover {
+ background: var(--ifm-color-primary-light);
+ color: var(--ifm-color-primary-darkest);
+}
diff --git a/docs-website/src/components/OSDetectionTabs/index.jsx b/docs-website/src/components/OSDetectionTabs/index.jsx
new file mode 100644
index 00000000000000..75b948c7081eb4
--- /dev/null
+++ b/docs-website/src/components/OSDetectionTabs/index.jsx
@@ -0,0 +1,107 @@
+import React, { useState, useEffect } from "react";
+import Tabs from "@theme/Tabs";
+import TabItem from "@theme/TabItem";
+import styles from "./styles.module.css";
+
+const OSDetectionTabs = ({ children, defaultOS = null }) => {
+ // Detect OS immediately during initialization
+ const detectOS = () => {
+ if (typeof window === "undefined") return "linux"; // SSR fallback
+
+ const userAgent = window.navigator.userAgent;
+ const platform = window.navigator.platform;
+
+ console.log("Detecting OS - UserAgent:", userAgent, "Platform:", platform);
+
+ // More specific macOS detection
+ if (
+ platform.indexOf("Mac") !== -1 ||
+ userAgent.indexOf("Mac") !== -1 ||
+ userAgent.indexOf("macOS") !== -1 ||
+ platform === "MacIntel" ||
+ platform === "MacPPC"
+ ) {
+ return "macos";
+ } else if (
+ userAgent.indexOf("Win") !== -1 ||
+ platform.indexOf("Win") !== -1
+ ) {
+ return "windows";
+ } else if (
+ userAgent.indexOf("Linux") !== -1 ||
+ platform.indexOf("Linux") !== -1
+ ) {
+ return "linux";
+ } else {
+ return "linux"; // Default fallback
+ }
+ };
+
+ const [detectedOS, setDetectedOS] = useState(() => detectOS());
+ const [defaultValue, setDefaultValue] = useState(
+ () => defaultOS || detectOS(),
+ );
+
+ useEffect(() => {
+ // Re-detect OS on client side to handle SSR
+ const os = detectOS();
+ console.log("Detected OS:", os);
+ setDetectedOS(os);
+
+ // Set default tab to detected OS if no explicit default provided
+ if (!defaultOS) {
+ setDefaultValue(os);
+ }
+ }, [defaultOS]);
+
+ // Get OS icon
+ const getOSIcon = (osValue) => {
+ switch (osValue) {
+ case "windows":
+ return "🪟";
+ case "macos":
+ return "🍎";
+ case "linux":
+ return "🐧";
+ default:
+ return "";
+ }
+ };
+
+ // Add OS detection info to child components
+ const enhancedChildren = React.Children.map(children, (child) => {
+ if (React.isValidElement(child) && child.type === TabItem) {
+ const isDetected = child.props.value === detectedOS;
+ const icon = getOSIcon(child.props.value);
+ const label = isDetected
+ ? `${icon} ${child.props.label} (Your OS)`
+ : `${icon} ${child.props.label}`;
+
+ return React.cloneElement(child, {
+ ...child.props,
+ label,
+ className: isDetected ? styles.detectedTab : "",
+ });
+ }
+ return child;
+ });
+
+ console.log(
+ "Rendering OSDetectionTabs with defaultValue:",
+ defaultValue,
+ "detectedOS:",
+ detectedOS,
+ );
+
+  return (
+    <Tabs className={styles.osDetectionTabs} defaultValue={defaultValue}>
+      {enhancedChildren}
+    </Tabs>
+  );
+};
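+
+// Example usage (illustrative): wrap Docusaurus TabItem children whose `value`
+// matches the detected keys ("macos", "windows", "linux"):
+//   <OSDetectionTabs>
+//     <TabItem value="macos" label="macOS">...</TabItem>
+//     <TabItem value="linux" label="Linux">...</TabItem>
+//   </OSDetectionTabs>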
+
+export default OSDetectionTabs;
diff --git a/docs-website/src/components/OSDetectionTabs/styles.module.css b/docs-website/src/components/OSDetectionTabs/styles.module.css
new file mode 100644
index 00000000000000..89f09f2b0d9a2a
--- /dev/null
+++ b/docs-website/src/components/OSDetectionTabs/styles.module.css
@@ -0,0 +1,77 @@
+/* OS Detection Tabs Styling */
+.osDetectionTabs {
+ margin: 1rem 0;
+}
+
+.detectedLabel {
+ color: var(--ifm-color-primary);
+ font-weight: 600;
+}
+
+.osIcon {
+ margin-right: 0.5rem;
+ font-size: 1.1em;
+}
+
+/* OS-specific icons */
+.windowsIcon::before {
+ content: "🪟";
+}
+
+.macosIcon::before {
+ content: "🍎";
+}
+
+.linuxIcon::before {
+ content: "🐧";
+}
+
+/* Enhanced tab styling for detected OS */
+.detectedTab {
+ background-color: var(--ifm-color-primary-lightest);
+ border-color: var(--ifm-color-primary);
+}
+
+/* Code block enhancements for different shells */
+.windowsCode {
+ background-color: #1e1e1e;
+ color: #d4d4d4;
+}
+
+.macosCode {
+ background-color: #2d2d2d;
+ color: #f8f8f2;
+}
+
+.linuxCode {
+ background-color: #300a24;
+ color: #ffffff;
+}
+
+/* Troubleshooting sections */
+.troubleshooting {
+ background-color: var(--ifm-color-warning-lightest);
+ border-left: 4px solid var(--ifm-color-warning);
+ padding: 1rem;
+ margin: 1rem 0;
+ border-radius: 0 4px 4px 0;
+}
+
+.troubleshooting h4 {
+ color: var(--ifm-color-warning-dark);
+ margin-bottom: 0.5rem;
+}
+
+/* System requirements styling */
+.systemRequirements {
+ background-color: var(--ifm-color-info-lightest);
+ border: 1px solid var(--ifm-color-info-light);
+ border-radius: 4px;
+ padding: 1rem;
+ margin: 1rem 0;
+}
+
+.systemRequirements h4 {
+ color: var(--ifm-color-info-dark);
+ margin-bottom: 0.5rem;
+}
diff --git a/docs-website/src/components/Pills/GlossaryTermPill.jsx b/docs-website/src/components/Pills/GlossaryTermPill.jsx
new file mode 100644
index 00000000000000..4ec71be7b14e38
--- /dev/null
+++ b/docs-website/src/components/Pills/GlossaryTermPill.jsx
@@ -0,0 +1,35 @@
+import React from "react";
+import styles from "./styles.module.css";
+
+const generateTermColor = (termName) => {
+ const colors = [
+ "#1890ff",
+ "#52c41a",
+ "#faad14",
+ "#f5222d",
+ "#722ed1",
+ "#fa541c",
+ "#13c2c2",
+ "#eb2f96",
+ "#a0d911",
+ "#fadb14",
+ ];
+ let hash = 0;
+ for (let i = 0; i < termName.length; i++) {
+ hash = (hash << 5) - hash + termName.charCodeAt(i);
+ }
+ return colors[Math.abs(hash) % colors.length];
+};
+
+export const GlossaryTermPill = ({ term }) => (
+  <div
+    className={styles.termPill}
+    style={{ "--pill-color": generateTermColor(term) }}
+  >
+    <div className={styles.termRibbon} />
+    <span className={styles.termText}>{term}</span>
+  </div>
+);
+
+export default GlossaryTermPill;
diff --git a/docs-website/src/components/Pills/TagPill.jsx b/docs-website/src/components/Pills/TagPill.jsx
new file mode 100644
index 00000000000000..e9a963decc6b37
--- /dev/null
+++ b/docs-website/src/components/Pills/TagPill.jsx
@@ -0,0 +1,25 @@
+import React from "react";
+import styles from "./styles.module.css";
+
+const generateTagColor = (tagName) => {
+ let hash = 0;
+ for (let i = 0; i < tagName.length; i++) {
+ const char = tagName.charCodeAt(i);
+ hash = (hash << 5) - hash + char;
+ hash = hash & hash;
+ }
+ const hue = Math.abs(hash) % 360;
+ return `hsl(${hue}, 70%, 45%)`;
+};
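+
+// The hash is deterministic, so a given tag name always maps to the same hue;
+// e.g. generateTagColor("pii") returns an identical hsl() string on every render.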
+
+export const TagPill = ({ tag }) => (
+  <span className={styles.tagPill}>
+    <span
+      className={styles.tagColorDot}
+      style={{ backgroundColor: generateTagColor(tag) }}
+    />
+    <span className={styles.tagText}>{tag}</span>
+  </span>
+);
+
+export default TagPill;
diff --git a/docs-website/src/components/Pills/styles.module.css b/docs-website/src/components/Pills/styles.module.css
new file mode 100644
index 00000000000000..dd4f289e4983f7
--- /dev/null
+++ b/docs-website/src/components/Pills/styles.module.css
@@ -0,0 +1,63 @@
+.tagPill {
+ display: flex;
+ align-items: center;
+ background: var(--ifm-background-color, #fff);
+ border: 1px solid var(--ifm-color-emphasis-300, #e9eaee);
+ border-radius: 4px;
+ padding: 2px 6px;
+ font-size: 11px;
+ color: var(--ifm-color-content, #374066);
+ max-width: 140px;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ white-space: nowrap;
+}
+
+.tagColorDot {
+ width: 6px;
+ height: 6px;
+ border-radius: 50%;
+ margin-right: 4px;
+ flex-shrink: 0;
+}
+
+.tagText {
+ overflow: hidden;
+ text-overflow: ellipsis;
+ white-space: nowrap;
+}
+
+.termPill {
+ position: relative;
+ display: inline-flex;
+ align-items: center;
+ background: #f8f8f8;
+ border: 1px solid #ccd1dd;
+ border-radius: 5px;
+ padding: 3px 8px;
+ font-size: 12px;
+ font-weight: 400;
+ color: #565657;
+ max-width: 200px;
+ overflow: hidden;
+ cursor: pointer;
+ margin-left: 8px; /* Make room for ribbon */
+}
+
+.termRibbon {
+ position: absolute;
+ left: -20px;
+ top: 4px;
+ width: 50px;
+ transform: rotate(-45deg);
+ padding: 4px;
+ opacity: 1;
+ background: var(--pill-color, #1890ff);
+}
+
+.termText {
+ overflow: hidden;
+ text-overflow: ellipsis;
+ white-space: nowrap;
+ margin-left: 8px;
+}
diff --git a/docs-website/src/components/ProcessFlow/index.jsx b/docs-website/src/components/ProcessFlow/index.jsx
new file mode 100644
index 00000000000000..5c7a86068f143f
--- /dev/null
+++ b/docs-website/src/components/ProcessFlow/index.jsx
@@ -0,0 +1,193 @@
+import React from "react";
+import styles from "./styles.module.css";
+
+const ProcessFlow = ({
+ title,
+ steps,
+ type = "horizontal", // 'horizontal', 'vertical', 'circular'
+ showNumbers = true,
+ animated = true,
+}) => {
+  const renderStep = (step, index) => (
+    <div
+      key={index}
+      className={`${styles.step} ${animated ? styles.animated : ""}`}
+    >
+      {showNumbers && <div className={styles.stepNumber}>{index + 1}</div>}
+      <div className={styles.stepContent}>
+        <div className={styles.stepTitle}>{step.title}</div>
+        {step.description && (
+          <div className={styles.stepDescription}>{step.description}</div>
+        )}
+        {step.details && (
+          <div className={styles.stepDetails}>
+            {step.details.map((detail, i) => (
+              <div key={i} className={styles.stepDetail}>
+                • {detail}
+              </div>
+            ))}
+          </div>
+        )}
+      </div>
+    </div>
+  );
+
+  const renderConnector = (index) => (
+    <div key={`connector-${index}`} className={styles.connector}>
+      {type === "horizontal" ? "→" : "↓"}
+    </div>
+  );
+
+ // Detect if we might have overflow (4+ steps in horizontal layout)
+ const hasOverflow = type === "horizontal" && steps.length >= 4;
+
+  return (
+    <div
+      className={`${styles.processFlow} ${styles[type]} ${hasOverflow ? styles.hasOverflow : ""}`}
+    >
+      {title && <div className={styles.flowTitle}>{title}</div>}
+      <div className={styles.flowContainer}>
+        {steps.map((step, index) => (
+          <React.Fragment key={index}>
+            {renderStep(step, index)}
+            {index < steps.length - 1 && renderConnector(index)}
+          </React.Fragment>
+        ))}
+      </div>
+    </div>
+  );
+};
+
+// Predefined workflow configurations
+export const DataHubWorkflows = {
+ discoveryProcess: {
+ title: "Enterprise Data Discovery Process",
+ steps: [
+ {
+ title: "Requirements Analysis",
+ description: "Define business objectives",
+ details: [
+ "Identify data needs",
+ "Set success criteria",
+ "Define scope",
+ ],
+ },
+ {
+ title: "Strategic Search",
+ description: "Apply targeted queries",
+ details: ["Use business terms", "Apply filters", "Refine results"],
+ },
+ {
+ title: "Asset Evaluation",
+ description: "Assess data quality",
+ details: ["Check freshness", "Review schema", "Validate completeness"],
+ },
+ {
+ title: "Access Planning",
+ description: "Understand requirements",
+ details: [
+ "Check permissions",
+ "Review documentation",
+ "Plan integration",
+ ],
+ },
+ ],
+ },
+
+ lineageAnalysis: {
+ title: "5-Hop Lineage Analysis Method",
+ steps: [
+ {
+ title: "Start at Target",
+ description: "Begin with dataset of interest",
+ details: [
+ "Open lineage view",
+ "Identify current dataset",
+ "Note business context",
+ ],
+ },
+ {
+ title: "Trace Upstream",
+ description: "Follow data backwards",
+ details: [
+ "Identify transformations",
+ "Check data sources",
+ "Document dependencies",
+ ],
+ },
+ {
+ title: "Analyze Hops",
+ description: "Examine each connection",
+ details: [
+ "Understand business logic",
+ "Check quality gates",
+ "Note critical points",
+ ],
+ },
+ {
+ title: "Impact Assessment",
+ description: "Evaluate change effects",
+ details: [
+ "Identify affected systems",
+ "Assess risk levels",
+ "Plan mitigation",
+ ],
+ },
+ {
+ title: "Validate Understanding",
+ description: "Confirm analysis",
+ details: [
+ "Review with data owners",
+ "Test assumptions",
+ "Document findings",
+ ],
+ },
+ ],
+ },
+
+ ingestionProcess: {
+ title: "Metadata Ingestion Workflow",
+ steps: [
+ {
+ title: "Connection",
+ description: "Establish secure connections",
+ details: [
+ "Configure credentials",
+ "Test connectivity",
+ "Set up authentication",
+ ],
+ },
+ {
+ title: "Discovery",
+ description: "Scan data structures",
+ details: ["Identify schemas", "Map relationships", "Detect patterns"],
+ },
+ {
+ title: "Extraction",
+ description: "Pull comprehensive metadata",
+ details: ["Schema information", "Statistics", "Lineage data"],
+ },
+ {
+ title: "Transformation",
+ description: "Standardize metadata format",
+ details: [
+ "Apply business rules",
+ "Enrich with context",
+ "Validate quality",
+ ],
+ },
+ {
+ title: "Loading",
+ description: "Store in DataHub",
+ details: [
+ "Update knowledge graph",
+ "Index for search",
+ "Enable discovery",
+ ],
+ },
+ ],
+ },
+};
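+
+// Example usage in MDX (illustrative; the @site import alias is the standard Docusaurus alias):
+//   import ProcessFlow, { DataHubWorkflows } from '@site/src/components/ProcessFlow';
+//   <ProcessFlow {...DataHubWorkflows.discoveryProcess} type="horizontal" />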
+
+export default ProcessFlow;
diff --git a/docs-website/src/components/ProcessFlow/styles.module.css b/docs-website/src/components/ProcessFlow/styles.module.css
new file mode 100644
index 00000000000000..b41b3bbbe60449
--- /dev/null
+++ b/docs-website/src/components/ProcessFlow/styles.module.css
@@ -0,0 +1,285 @@
+/* Process Flow Styles */
+.processFlow {
+ background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
+ border: 1px solid var(--ifm-color-primary-lightest);
+ border-radius: 12px;
+ padding: 24px;
+ margin: 24px 0;
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
+ position: relative;
+ overflow: hidden;
+}
+
+.flowTitle {
+ text-align: center;
+ font-size: 1.3rem;
+ font-weight: 600;
+ color: var(--ifm-color-primary-dark);
+ margin-bottom: 24px;
+ padding-bottom: 12px;
+ border-bottom: 2px solid var(--ifm-color-primary-lightest);
+}
+
+.flowContainer {
+ display: flex;
+ align-items: stretch;
+ gap: 16px;
+ max-width: 100%;
+ overflow-x: auto;
+ padding: 8px 0;
+ scrollbar-width: thin;
+ scrollbar-color: var(--ifm-color-primary-light) transparent;
+}
+
+/* Custom scrollbar for webkit browsers */
+.flowContainer::-webkit-scrollbar {
+ height: 6px;
+}
+
+.flowContainer::-webkit-scrollbar-track {
+ background: var(--ifm-color-emphasis-200);
+ border-radius: 3px;
+}
+
+.flowContainer::-webkit-scrollbar-thumb {
+ background: var(--ifm-color-primary-light);
+ border-radius: 3px;
+}
+
+.flowContainer::-webkit-scrollbar-thumb:hover {
+ background: var(--ifm-color-primary);
+}
+
+/* Horizontal Layout */
+.horizontal .flowContainer {
+ flex-direction: row;
+ justify-content: flex-start;
+ flex-wrap: nowrap;
+}
+
+.horizontal .step {
+ flex: 0 0 auto;
+ min-width: 180px;
+ max-width: 220px;
+}
+
+/* Scroll hint for horizontal flows with many steps */
+.horizontal.hasOverflow::after {
+ content: "← Scroll horizontally to see all steps →";
+ position: absolute;
+ bottom: 8px;
+ left: 50%;
+ transform: translateX(-50%);
+ font-size: 12px;
+ color: var(--ifm-color-emphasis-600);
+ white-space: nowrap;
+ pointer-events: none;
+ background: rgba(255, 255, 255, 0.9);
+ padding: 4px 8px;
+ border-radius: 4px;
+ border: 1px solid var(--ifm-color-emphasis-300);
+}
+
+.horizontal .connector {
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-size: 1.5rem;
+ color: var(--ifm-color-primary);
+ font-weight: bold;
+ flex-shrink: 0;
+ width: 40px;
+}
+
+/* Vertical Layout */
+.vertical .flowContainer {
+ flex-direction: column;
+ align-items: center;
+}
+
+.vertical .step {
+ width: 100%;
+ max-width: 500px;
+}
+
+.vertical .connector {
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-size: 1.5rem;
+ color: var(--ifm-color-primary);
+ font-weight: bold;
+ height: 30px;
+ width: 100%;
+}
+
+/* Step Styles */
+.step {
+ background: white;
+ border: 2px solid var(--ifm-color-primary-light);
+ border-radius: 8px;
+ padding: 16px;
+ position: relative;
+ transition: all 0.3s ease;
+ cursor: pointer;
+}
+
+.step:hover {
+ transform: translateY(-2px);
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.12);
+ border-color: var(--ifm-color-primary);
+}
+
+.stepNumber {
+ position: absolute;
+ top: -12px;
+ left: 16px;
+ background: var(--ifm-color-primary);
+ color: white;
+ width: 24px;
+ height: 24px;
+ border-radius: 50%;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-size: 0.8rem;
+ font-weight: bold;
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
+}
+
+.stepContent {
+ margin-top: 8px;
+}
+
+.stepTitle {
+ font-weight: 600;
+ font-size: 1rem;
+ color: var(--ifm-color-emphasis-800);
+ margin-bottom: 8px;
+}
+
+.stepDescription {
+ font-size: 0.9rem;
+ color: var(--ifm-color-emphasis-600);
+ margin-bottom: 12px;
+ font-style: italic;
+}
+
+.stepDetails {
+ font-size: 0.8rem;
+ color: var(--ifm-color-emphasis-700);
+}
+
+.stepDetail {
+ margin-bottom: 4px;
+ padding-left: 8px;
+}
+
+/* Animation */
+.animated {
+ animation: slideIn 0.6s ease-out forwards;
+ opacity: 0;
+}
+
+.animated:nth-child(1) {
+ animation-delay: 0.1s;
+}
+.animated:nth-child(3) {
+ animation-delay: 0.2s;
+}
+.animated:nth-child(5) {
+ animation-delay: 0.3s;
+}
+.animated:nth-child(7) {
+ animation-delay: 0.4s;
+}
+.animated:nth-child(9) {
+ animation-delay: 0.5s;
+}
+
+@keyframes slideIn {
+ from {
+ opacity: 0;
+ transform: translateY(20px);
+ }
+ to {
+ opacity: 1;
+ transform: translateY(0);
+ }
+}
+
+/* Connector Animation */
+.connector {
+ animation: pulse 2s infinite;
+}
+
+@keyframes pulse {
+ 0%,
+ 100% {
+ opacity: 0.6;
+ }
+ 50% {
+ opacity: 1;
+ }
+}
+
+/* Responsive Design */
+@media (max-width: 768px) {
+ .horizontal .flowContainer {
+ flex-direction: column;
+ align-items: center;
+ }
+
+ .horizontal .step {
+ width: 100%;
+ max-width: 400px;
+ }
+
+ .horizontal .connector {
+ transform: rotate(90deg);
+ width: 100%;
+ height: 30px;
+ }
+
+ .stepNumber {
+ left: 12px;
+ }
+}
+
+/* Dark Mode Support */
+[data-theme="dark"] .processFlow {
+ background: linear-gradient(135deg, #1e293b 0%, #334155 100%);
+ border-color: var(--ifm-color-primary-dark);
+}
+
+[data-theme="dark"] .step {
+ background: var(--ifm-color-emphasis-100);
+ border-color: var(--ifm-color-primary-dark);
+ color: var(--ifm-color-emphasis-800);
+}
+
+[data-theme="dark"] .step:hover {
+ border-color: var(--ifm-color-primary);
+ background: var(--ifm-color-emphasis-200);
+}
+
+/* Step Type Variations */
+.step.start {
+ border-color: #10b981;
+ background: linear-gradient(135deg, #ecfdf5 0%, #d1fae5 100%);
+}
+
+.step.process {
+ border-color: #3b82f6;
+ background: linear-gradient(135deg, #eff6ff 0%, #dbeafe 100%);
+}
+
+.step.decision {
+ border-color: #f59e0b;
+ background: linear-gradient(135deg, #fffbeb 0%, #fef3c7 100%);
+}
+
+.step.end {
+ border-color: #8b5cf6;
+ background: linear-gradient(135deg, #f5f3ff 0%, #ede9fe 100%);
+}
diff --git a/docs-website/src/components/SlackUtm/index.js b/docs-website/src/components/SlackUtm/index.js
index 8cfda600135dad..f50af0498f3fcc 100644
--- a/docs-website/src/components/SlackUtm/index.js
+++ b/docs-website/src/components/SlackUtm/index.js
@@ -1,14 +1,23 @@
import React, { useState, useMemo } from "react";
import styles from "./styles.module.scss";
-import { LikeOutlined, DislikeOutlined, CheckCircleOutlined } from "@ant-design/icons";
+import {
+ LikeOutlined,
+ DislikeOutlined,
+ CheckCircleOutlined,
+} from "@ant-design/icons";
import { v4 as uuidv4 } from "uuid";
const SlackUtm = ({ pageId }) => {
return (
-
- Need more help? Join the conversation in
Slack!
+
+ Need more help? Join the conversation in{" "}
+
+ Slack!
+
);
diff --git a/docs-website/src/components/SlackUtm/styles.module.scss b/docs-website/src/components/SlackUtm/styles.module.scss
index b1e8938dc9d086..d53613d0a5a45f 100644
--- a/docs-website/src/components/SlackUtm/styles.module.scss
+++ b/docs-website/src/components/SlackUtm/styles.module.scss
@@ -1,3 +1,3 @@
.slackUtm {
- padding: 0.5rem 0rem;
-}
\ No newline at end of file
+ padding: 0.5rem 0rem;
+}
diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx b/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx
index bbb38355ec0594..bfa058af2128f8 100644
--- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx
+++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx
@@ -5,22 +5,25 @@
* LICENSE file in the root directory of this source tree.
*/
-import React, {useState, useRef, useEffect} from 'react';
-import clsx from 'clsx';
+import React, { useState, useRef, useEffect } from "react";
+import clsx from "clsx";
import {
isRegexpStringMatch,
useCollapsible,
Collapsible,
-} from '@docusaurus/theme-common';
-import {isSamePath, useLocalPathname} from '@docusaurus/theme-common/internal';
-import NavbarNavLink from '@theme/NavbarItem/NavbarNavLink';
-import NavbarItem, {type LinkLikeNavbarItemProps} from '@theme/NavbarItem';
+} from "@docusaurus/theme-common";
+import {
+ isSamePath,
+ useLocalPathname,
+} from "@docusaurus/theme-common/internal";
+import NavbarNavLink from "@theme/NavbarItem/NavbarNavLink";
+import NavbarItem, { type LinkLikeNavbarItemProps } from "@theme/NavbarItem";
import type {
DesktopOrMobileNavBarItemProps,
Props,
-} from '@theme/NavbarItem/DropdownNavbarItem';
-import styles from './styles.module.css';
-import SolutionsDropdownContent from './SolutionsDropdownContent';
+} from "@theme/NavbarItem/DropdownNavbarItem";
+import styles from "./styles.module.css";
+import SolutionsDropdownContent from "./SolutionsDropdownContent";
function isItemActive(
item: LinkLikeNavbarItemProps,
@@ -53,7 +56,7 @@ function DropdownNavbarItemDesktop({
...props
}: DesktopOrMobileNavBarItemProps) {
const dropdownRef = useRef
(null);
- const [showDropdown, setShowDropdown] = useState(false);
+ const [showDropdown, setShowDropdown] = useState(false);
useEffect(() => {
const handleClickOutside = (
@@ -68,24 +71,25 @@ function DropdownNavbarItemDesktop({
setShowDropdown(false);
};
- document.addEventListener('mousedown', handleClickOutside);
- document.addEventListener('touchstart', handleClickOutside);
- document.addEventListener('focusin', handleClickOutside);
+ document.addEventListener("mousedown", handleClickOutside);
+ document.addEventListener("touchstart", handleClickOutside);
+ document.addEventListener("focusin", handleClickOutside);
return () => {
- document.removeEventListener('mousedown', handleClickOutside);
- document.removeEventListener('touchstart', handleClickOutside);
- document.removeEventListener('focusin', handleClickOutside);
+ document.removeEventListener("mousedown", handleClickOutside);
+ document.removeEventListener("touchstart", handleClickOutside);
+ document.removeEventListener("focusin", handleClickOutside);
};
}, [dropdownRef]);
return (
+ className={clsx("navbar__item", "dropdown", "dropdown--hoverable", {
+ "dropdown--right": position === "right",
+ "dropdown--show": showDropdown,
+ })}
+ >
        // # hash permits to make the <a> tag focusable in case no link target
// See https://github.com/facebook/docusaurus/pull/6003
// There's probably a better solution though...
- href={props.to ? undefined : '#'}
- className={clsx('navbar__link', className)}
+ href={props.to ? undefined : "#"}
+ className={clsx("navbar__link", className)}
{...props}
onClick={props.to ? undefined : (e) => e.preventDefault()}
onKeyDown={(e) => {
- if (e.key === 'Enter') {
+ if (e.key === "Enter") {
e.preventDefault();
setShowDropdown(!showDropdown);
}
- }}>
+ }}
+ >
{props.children ?? props.label}
@@ -132,7 +137,7 @@ function DropdownNavbarItemMobile({
const localPathname = useLocalPathname();
const containsActive = containsActiveItems(items, localPathname);
- const {collapsed, toggleCollapsed, setCollapsed} = useCollapsible({
+ const { collapsed, toggleCollapsed, setCollapsed } = useCollapsible({
initialState: () => !containsActive,
});
@@ -145,21 +150,23 @@ function DropdownNavbarItemMobile({
return (
+ className={clsx("menu__list-item", {
+ "menu__list-item--collapsed": collapsed,
+ })}
+ >
{
e.preventDefault();
toggleCollapsed();
- }}>
+ }}
+ >
{props.children ?? props.label}
@@ -175,4 +182,4 @@ export default function DropdownNavbarItem({
}: Props): JSX.Element {
const Comp = mobile ? DropdownNavbarItemMobile : DropdownNavbarItemDesktop;
return ;
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js
index 79481c52f279e6..44431acdaad22f 100644
--- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js
+++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js
@@ -1,8 +1,8 @@
-import React from 'react';
-import styles from './styles.module.scss';
-import clsx from 'clsx';
-import Link from '@docusaurus/Link';
-import solutionsDropdownContent from './solutionsDropdownContent';
+import React from "react";
+import styles from "./styles.module.scss";
+import clsx from "clsx";
+import Link from "@docusaurus/Link";
+import solutionsDropdownContent from "./solutionsDropdownContent";
function SolutionsDropdownContent() {
const { fullSizeCards, halfSizeCards } = solutionsDropdownContent;
@@ -20,14 +20,16 @@ function SolutionsDropdownContent() {
-
{item.description}
+
+ {item.description}
+
@@ -37,7 +39,10 @@ function SolutionsDropdownContent() {
{/* Half-size cards */}
{halfSizeCards.map((item, index) => (
-
+
-
{item.description}
+
+ {item.description}
+
diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js
index ad7278a438cf81..43ca27e8957c1d 100644
--- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js
+++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js
@@ -1,38 +1,38 @@
const solutionsDropdownContent = {
- fullSizeCards: [
- {
- title: "Data Discovery",
- description: "Search, Browse, Lineage, and more.",
- iconImage: "/img/solutions/icon-dropdown-discovery.png",
- href: "/solutions/discovery",
- },
- {
- title: "Data Observability",
- description: "Detect and Resolve Data Quality issues",
- iconImage: "/img/solutions/icon-dropdown-observe.png",
- href: "/solutions/observability",
- },
- {
- title: "Data Governance",
- description: "Automate Classifying and Governing data.",
- iconImage: "/img/solutions/icon-dropdown-governance.png",
- href: "/solutions/governance",
- },
- ],
- halfSizeCards: [
- {
- title: "DataHub Core",
- description: "Get started with the Open Source platform.",
- iconImage: "/img/solutions/icon-dropdown-core.png",
- href: "/docs/quickstart",
- },
- {
- title: "Cloud vs Core",
- description: "Understand the differences.",
- iconImage: "/img/solutions/icon-dropdown-cloud.png",
- href: "/cloud",
- },
- ],
+ fullSizeCards: [
+ {
+ title: "Data Discovery",
+ description: "Search, Browse, Lineage, and more.",
+ iconImage: "/img/solutions/icon-dropdown-discovery.png",
+ href: "/solutions/discovery",
+ },
+ {
+ title: "Data Observability",
+ description: "Detect and Resolve Data Quality issues",
+ iconImage: "/img/solutions/icon-dropdown-observe.png",
+ href: "/solutions/observability",
+ },
+ {
+ title: "Data Governance",
+ description: "Automate Classifying and Governing data.",
+ iconImage: "/img/solutions/icon-dropdown-governance.png",
+ href: "/solutions/governance",
+ },
+ ],
+ halfSizeCards: [
+ {
+ title: "DataHub Core",
+ description: "Get started with the Open Source platform.",
+ iconImage: "/img/solutions/icon-dropdown-core.png",
+ href: "/docs/quickstart",
+ },
+ {
+ title: "Cloud vs Core",
+ description: "Understand the differences.",
+ iconImage: "/img/solutions/icon-dropdown-cloud.png",
+ href: "/cloud",
+ },
+ ],
};
-
-export default solutionsDropdownContent
\ No newline at end of file
+
+export default solutionsDropdownContent;
diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss
index b156c3342f24ab..d32ef9a5359f10 100644
--- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss
+++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss
@@ -1,137 +1,137 @@
.container {
- display: flex;
+ display: flex;
+}
+
+.row {
+ display: flex;
+ gap: 1rem;
+}
+
+.card {
+ display: flex;
+ width: 12.4375rem;
+ height: 12.5rem;
+ padding: 0;
+ flex-direction: column;
+ justify-content: center;
+ align-items: center;
+ flex-shrink: 0;
+ border-radius: 0.72681rem;
+ background: #f7f7f7;
+ text-align: left;
+ text-decoration: none;
+ transition:
+ transform 0.3s ease,
+ box-shadow 0.3s ease;
+}
+
+.header {
+ display: inline-flex;
+}
+
+.title {
+ color: #1e1e1e;
+ font-family: Manrope;
+ font-style: normal;
+ font-weight: 600;
+}
+
+.description {
+ color: #757575;
+ font-family: Manrope;
+ font-style: normal;
+ font-weight: 300;
+}
+
+.fullSizeCard {
+ background-repeat: no-repeat;
+ background-size: contain;
+ background-position: bottom right;
+ height: 100%;
+ padding: 1.4rem;
+
+ .icon {
+ width: 1.7rem;
+ height: 1.7rem;
+ display: block;
}
-
- .row {
- display: flex;
- gap: 1rem;
+
+ .title {
+ font-size: 1.1rem;
+ font-weight: 600;
+ line-height: 150%; /* 1.6875rem */
+ letter-spacing: -0.01238rem;
+ margin-top: 0.5rem;
}
-
- .card {
+
+ .description {
+ font-size: 0.95rem;
+ line-height: 150%; /* 1.5rem */
+ letter-spacing: -0.011rem;
+ }
+}
+
+.halfSizeWrapper {
+ display: flex;
+ flex-direction: column;
+ gap: 0.98rem;
+}
+
+.halfSizeCard {
+ display: flex;
+ height: 5.75rem;
+ padding: 1.4rem;
+ flex-direction: column;
+ align-items: center;
+ flex-shrink: 0;
+ align-self: stretch;
+
+ .icon {
display: flex;
- width: 12.4375rem;
- height: 12.5rem;
- padding: 0;
- flex-direction: column;
+ width: 1.26806rem;
+ height: 1.26806rem;
+ padding: 0.13206rem 0.13725rem 0.13213rem 0.13213rem;
justify-content: center;
align-items: center;
flex-shrink: 0;
- border-radius: 0.72681rem;
- background: #F7F7F7;
- text-align: left;
- text-decoration: none;
- transition: transform 0.3s ease, box-shadow 0.3s ease;
- }
-
- .header {
- display: inline-flex;
+ margin-right: 0.65rem;
}
.title {
- color: #1E1E1E;
+ color: #1e1e1e;
font-family: Manrope;
+ font-size: 0.95rem;
font-style: normal;
font-weight: 600;
+ line-height: 150%; /* 1.5rem */
+ letter-spacing: -0.011rem;
}
.description {
- color: #757575;
- font-family: Manrope;
- font-style: normal;
- font-weight: 300;
+ font-size: 0.75rem;
+ line-height: 150%; /* 1.125rem */
+ letter-spacing: -0.00825rem;
+ margin-left: 2rem;
}
+}
- .fullSizeCard {
- background-repeat: no-repeat;
- background-size: contain;
- background-position: bottom right;
- height: 100%;
- padding: 1.4rem;
-
- .icon {
- width: 1.7rem;
- height: 1.7rem;
- display: block;
- }
-
- .title {
- font-size: 1.1rem;
- font-weight: 600;
- line-height: 150%; /* 1.6875rem */
- letter-spacing: -0.01238rem;
- margin-top: 0.5rem;
- }
-
- .description {
- font-size: 0.95rem;
- line-height: 150%; /* 1.5rem */
- letter-spacing: -0.011rem;
- };
- }
-
- .halfSizeWrapper {
- display: flex;
- flex-direction: column;
- gap: 0.98rem;
- }
+.card:hover {
+ transform: translateY(-5px);
+ box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
+ text-decoration: none;
+ color: inherit;
+}
- .halfSizeCard {
- display: flex;
- height: 5.75rem;
- padding: 1.4rem;
- flex-direction: column;
- align-items: center;
- flex-shrink: 0;
- align-self: stretch;
-
- .icon {
- display: flex;
- width: 1.26806rem;
- height: 1.26806rem;
- padding: 0.13206rem 0.13725rem 0.13213rem 0.13213rem;
- justify-content: center;
- align-items: center;
- flex-shrink: 0;
- margin-right: 0.65rem;
- }
-
- .title {
- color: #1E1E1E;
- font-family: Manrope;
- font-size: 0.95rem;
- font-style: normal;
- font-weight: 600;
- line-height: 150%; /* 1.5rem */
- letter-spacing: -0.011rem;
- }
-
- .description {
- font-size: 0.75rem;
- line-height: 150%; /* 1.125rem */
- letter-spacing: -0.00825rem;
- margin-left: 2rem;
- }
+@media (max-width: 768px) {
+ .col {
+ flex: 1 1 48%;
+ max-width: 48%;
}
-
- .card:hover {
- transform: translateY(-5px);
- box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
- text-decoration: none;
- color: inherit;
- }
-
-
- @media (max-width: 768px) {
- .col {
- flex: 1 1 48%;
- max-width: 48%;
- }
- }
-
- @media (max-width: 480px) {
- .col {
- flex: 1 1 100%;
- max-width: 100%;
- }
+}
+
+@media (max-width: 480px) {
+ .col {
+ flex: 1 1 100%;
+ max-width: 100%;
}
-
\ No newline at end of file
+}
diff --git a/docs-website/src/components/SolutionsDropdown/styles.module.css b/docs-website/src/components/SolutionsDropdown/styles.module.css
index 09c71edf0b1850..01e6f6373c3c4a 100644
--- a/docs-website/src/components/SolutionsDropdown/styles.module.css
+++ b/docs-website/src/components/SolutionsDropdown/styles.module.css
@@ -5,7 +5,7 @@
* LICENSE file in the root directory of this source tree.
*/
- .dropdownNavbarItemMobile {
+.dropdownNavbarItemMobile {
cursor: pointer;
}
@@ -17,6 +17,6 @@
align-items: flex-start;
gap: 0.98219rem;
border-radius: var(--number-scales-2s-20, 1.25rem);
- background: #FFF;
+ background: #fff;
box-shadow: 0px 16px 16px 0px rgba(0, 0, 0, 0.25);
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/components/StepCompletion/index.jsx b/docs-website/src/components/StepCompletion/index.jsx
new file mode 100644
index 00000000000000..bdb8abb4b2457a
--- /dev/null
+++ b/docs-website/src/components/StepCompletion/index.jsx
@@ -0,0 +1,52 @@
+import React, { useState, useEffect } from "react";
+import styles from "./styles.module.css";
+
+const StepCompletion = ({
+ stepId,
+ children,
+ completionText = "✅ Completed!",
+}) => {
+ const [isCompleted, setIsCompleted] = useState(false);
+ const storageKey = `datahub-step-${stepId}`;
+
+ // Load completion status from localStorage
+ useEffect(() => {
+ const saved = localStorage.getItem(storageKey);
+ if (saved === "true") {
+ setIsCompleted(true);
+ }
+ }, [storageKey]);
+
+ // Save completion status to localStorage
+ useEffect(() => {
+ localStorage.setItem(storageKey, isCompleted.toString());
+ }, [isCompleted, storageKey]);
+
+ const toggleCompletion = () => {
+ setIsCompleted(!isCompleted);
+ };
+
+  return (
+    <div
+      className={`${styles.stepCompletion} ${isCompleted ? styles.completed : ""}`}
+    >
+      <div className={styles.content}>{children}</div>
+
+      <div className={styles.completionControl}>
+        <label className={styles.completionLabel}>
+          <input
+            type="checkbox"
+            checked={isCompleted}
+            onChange={toggleCompletion}
+            className={styles.checkbox}
+          />
+          <span className={styles.checkmark}>
+            {isCompleted ? "✅" : "⬜"}
+          </span>
+          <span className={styles.completionText}>
+            {isCompleted ? completionText : "Mark as complete"}
+          </span>
+        </label>
+      </div>
+    </div>
+  );
+};
+
+export default StepCompletion;
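+
+// Illustrative usage from an MDX tutorial page (a sketch only; the stepId
+// value and the child prose are hypothetical, not part of this change):
+//
+//   import StepCompletion from '@site/src/components/StepCompletion';
+//
+//   <StepCompletion stepId="quickstart-ingest-sample-data">
+//     Run the ingestion command and confirm the sample dataset appears in the UI.
+//   </StepCompletion>
+//
+// Completion state is persisted per step in localStorage as `datahub-step-<stepId>`.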
diff --git a/docs-website/src/components/StepCompletion/styles.module.css b/docs-website/src/components/StepCompletion/styles.module.css
new file mode 100644
index 00000000000000..b9f1b1c97e26f6
--- /dev/null
+++ b/docs-website/src/components/StepCompletion/styles.module.css
@@ -0,0 +1,76 @@
+.stepCompletion {
+ background: #f8f9fa;
+ border: 2px solid #e9ecef;
+ border-radius: 8px;
+ padding: 16px;
+ margin: 16px 0;
+ transition: all 0.3s ease;
+}
+
+.stepCompletion.completed {
+ background: #d4edda;
+ border-color: #28a745;
+}
+
+.content {
+ margin-bottom: 12px;
+}
+
+.completionControl {
+ border-top: 1px solid #e9ecef;
+ padding-top: 12px;
+}
+
+.completionLabel {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ cursor: pointer;
+ margin: 0;
+ font-weight: 500;
+}
+
+.checkbox {
+ display: none;
+}
+
+.checkmark {
+ font-size: 16px;
+ transition: transform 0.1s ease;
+}
+
+.completionLabel:hover .checkmark {
+ transform: scale(1.1);
+}
+
+.completionText {
+ color: #495057;
+ font-size: 14px;
+}
+
+.completed .completionText {
+ color: #155724;
+}
+
+/* Dark mode support */
+[data-theme="dark"] .stepCompletion {
+ background: #2d2d2d;
+ border-color: #444;
+}
+
+[data-theme="dark"] .stepCompletion.completed {
+ background: #1e3a1e;
+ border-color: #28a745;
+}
+
+[data-theme="dark"] .completionControl {
+ border-top-color: #444;
+}
+
+[data-theme="dark"] .completionText {
+ color: #e9ecef;
+}
+
+[data-theme="dark"] .completed .completionText {
+ color: #90ee90;
+}
diff --git a/docs-website/src/components/TutorialExercise/index.jsx b/docs-website/src/components/TutorialExercise/index.jsx
new file mode 100644
index 00000000000000..6500f9a3d0bb20
--- /dev/null
+++ b/docs-website/src/components/TutorialExercise/index.jsx
@@ -0,0 +1,127 @@
+import React from "react";
+import styles from "./styles.module.css";
+
+const TutorialExercise = ({
+ title,
+ type = "search",
+ icon,
+ children,
+ difficulty = "beginner",
+ timeEstimate,
+ platform = "DataHub",
+}) => {
+ const getTypeIcon = () => {
+ switch (type) {
+ case "search":
+ return "🔍";
+ case "hands-on":
+ return "💻";
+ case "analysis":
+ return "📊";
+ case "exercise":
+ return "🎯";
+ default:
+ return "📝";
+ }
+ };
+
+ const getDifficultyColor = () => {
+ switch (difficulty) {
+ case "beginner":
+ return "var(--datahub-success)";
+ case "intermediate":
+ return "var(--datahub-warning)";
+ case "advanced":
+ return "var(--datahub-error)";
+ default:
+ return "var(--datahub-primary)";
+ }
+ };
+
+  return (
+    <div className={styles.exerciseContainer}>
+      <div className={styles.exerciseHeader}>
+        <div className={styles.headerLeft}>
+          <div className={styles.typeIcon}>{icon || getTypeIcon()}</div>
+          <div className={styles.titleSection}>
+            <h4 className={styles.exerciseTitle}>{title}</h4>
+            <div className={styles.metadata}>
+              <span
+                className={styles.difficulty}
+                style={{ backgroundColor: getDifficultyColor() }}
+              >
+                {difficulty}
+              </span>
+              {timeEstimate && (
+                <span className={styles.timeEstimate}>⏱️ {timeEstimate}</span>
+              )}
+              <span className={styles.platform}>{platform}</span>
+            </div>
+          </div>
+        </div>
+      </div>
+
+      <div className={styles.exerciseContent}>{children}</div>
+    </div>
+  );
+};
+
+export const SearchExercise = ({ title, searches, children, ...props }) => (
+  <TutorialExercise title={title} type="search" {...props}>
+    {searches && (
+      <div className={styles.searchList}>
+        {searches.map((search, index) => (
+          <div key={index} className={styles.searchItem}>
+            <div className={styles.searchQuery}>
+              <code>{search.query}</code>
+            </div>
+            {search.description && (
+              <div className={styles.searchDescription}>
+                {search.description}
+              </div>
+            )}
+            {search.expected && (
+              <div className={styles.searchExpected}>
+                <strong>Expected:</strong> {search.expected}
+              </div>
+            )}
+          </div>
+        ))}
+      </div>
+    )}
+    {children}
+  </TutorialExercise>
+);
+
+export const HandsOnExercise = ({ title, steps, children, ...props }) => (
+  <TutorialExercise title={title} type="hands-on" {...props}>
+    {steps && (
+      <div className={styles.stepsList}>
+        {steps.map((step, index) => (
+          <div key={index} className={styles.stepItem}>
+            <div className={styles.stepNumber}>{index + 1}</div>
+            <div className={styles.stepContent}>
+              <div className={styles.stepTitle}>{step.title}</div>
+              {step.description && (
+                <div className={styles.stepDescription}>
+                  {step.description}
+                </div>
+              )}
+              {step.code && (
+                <div className={styles.stepCode}>
+                  {step.code}
+                </div>
+              )}
+            </div>
+          </div>
+        ))}
+      </div>
+    )}
+    {children}
+  </TutorialExercise>
+);
+
+export const InteractiveDemo = ({ title, children, ...props }) => (
+  <TutorialExercise title={title} type="exercise" {...props}>
+    <div className={styles.interactiveContent}>{children}</div>
+  </TutorialExercise>
+);
+
+export default TutorialExercise;
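+
+// Illustrative usage (a sketch only; the titles, queries, and steps below are
+// hypothetical examples, not part of this change):
+//
+//   import {
+//     SearchExercise,
+//     HandsOnExercise,
+//   } from '@site/src/components/TutorialExercise';
+//
+//   <SearchExercise
+//     title="Find the users table"
+//     difficulty="beginner"
+//     timeEstimate="5 min"
+//     searches={[
+//       {
+//         query: "fct_users_created",
+//         description: "Search by exact table name",
+//         expected: "A single dataset in the results",
+//       },
+//     ]}
+//   />
+//
+//   <HandsOnExercise
+//     title="Inspect the schema"
+//     steps={[
+//       { title: "Open the dataset page", description: "Click the first result" },
+//       { title: "Review columns", code: "Schema tab → expand nested fields" },
+//     ]}
+//   />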
diff --git a/docs-website/src/components/TutorialExercise/styles.module.css b/docs-website/src/components/TutorialExercise/styles.module.css
new file mode 100644
index 00000000000000..583b244c56cf3f
--- /dev/null
+++ b/docs-website/src/components/TutorialExercise/styles.module.css
@@ -0,0 +1,297 @@
+.exerciseContainer {
+ background: var(--ifm-background-color);
+ border: 1px solid var(--ifm-color-emphasis-300);
+ border-radius: var(--ifm-border-radius);
+ box-shadow: var(--ifm-shadow-md);
+ margin: 24px 0;
+ overflow: hidden;
+ transition: all 0.2s ease;
+}
+
+.exerciseContainer:hover {
+ box-shadow: var(--ifm-shadow-lg);
+ border-color: var(--ifm-color-primary-light);
+}
+
+.exerciseHeader {
+ background: linear-gradient(
+ 135deg,
+ var(--ifm-color-emphasis-100) 0%,
+ var(--ifm-background-color) 100%
+ );
+ border-bottom: 1px solid var(--ifm-color-emphasis-300);
+ padding: 16px 20px;
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+}
+
+.headerLeft {
+ display: flex;
+ align-items: center;
+ gap: 12px;
+}
+
+.typeIcon {
+ width: 40px;
+ height: 40px;
+ background: var(--ifm-color-primary);
+ color: white;
+ border-radius: 8px;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-size: 18px;
+ flex-shrink: 0;
+}
+
+.titleSection {
+ display: flex;
+ flex-direction: column;
+ gap: 4px;
+}
+
+.exerciseTitle {
+ margin: 0;
+ font-size: 16px;
+ font-weight: 600;
+ color: var(--ifm-font-color-base);
+ line-height: 1.3;
+}
+
+.metadata {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ flex-wrap: wrap;
+}
+
+.difficulty {
+ padding: 2px 8px;
+ border-radius: 12px;
+ font-size: 11px;
+ font-weight: 500;
+ color: white;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+}
+
+.timeEstimate {
+ font-size: 12px;
+ color: var(--ifm-color-emphasis-700);
+ background: var(--ifm-color-emphasis-200);
+ padding: 2px 6px;
+ border-radius: 4px;
+}
+
+.platform {
+ font-size: 12px;
+ color: var(--ifm-color-primary);
+ background: var(--ifm-color-primary-lightest);
+ padding: 2px 6px;
+ border-radius: 4px;
+ font-weight: 500;
+}
+
+.exerciseContent {
+ padding: 20px;
+}
+
+/* Search Exercise Styles */
+.searchList {
+ display: flex;
+ flex-direction: column;
+ gap: 16px;
+}
+
+.searchItem {
+ background: var(--ifm-color-emphasis-100);
+ border: 1px solid var(--ifm-color-emphasis-200);
+ border-radius: 8px;
+ padding: 16px;
+ transition: all 0.2s ease;
+}
+
+.searchItem:hover {
+ background: var(--ifm-background-color);
+ border-color: var(--ifm-color-primary-light);
+}
+
+.searchQuery {
+ margin-bottom: 8px;
+}
+
+.searchQuery code {
+ background: var(--ifm-color-primary-dark);
+ color: white;
+ padding: 8px 12px;
+ border-radius: 6px;
+ font-family: var(--ifm-font-family-monospace);
+ font-size: 14px;
+ font-weight: 500;
+ display: inline-block;
+ min-width: 200px;
+}
+
+.searchDescription {
+ color: var(--ifm-color-emphasis-700);
+ font-size: 14px;
+ margin-bottom: 6px;
+ line-height: 1.4;
+}
+
+.searchExpected {
+ color: var(--ifm-font-color-base);
+ font-size: 13px;
+ background: var(--ifm-color-success-lightest);
+ padding: 6px 10px;
+ border-radius: 4px;
+ border-left: 3px solid var(--ifm-color-success);
+}
+
+/* Hands-On Exercise Styles */
+.stepsList {
+ display: flex;
+ flex-direction: column;
+ gap: 16px;
+}
+
+.stepItem {
+ display: flex;
+ gap: 12px;
+ align-items: flex-start;
+}
+
+.stepNumber {
+ width: 28px;
+ height: 28px;
+ background: var(--ifm-color-primary);
+ color: white;
+ border-radius: 50%;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-size: 14px;
+ font-weight: 600;
+ flex-shrink: 0;
+ margin-top: 2px;
+}
+
+.stepContent {
+ flex: 1;
+ display: flex;
+ flex-direction: column;
+ gap: 6px;
+}
+
+.stepTitle {
+ font-weight: 600;
+ color: var(--ifm-font-color-base);
+ font-size: 15px;
+ line-height: 1.4;
+}
+
+.stepDescription {
+ color: var(--ifm-color-emphasis-700);
+ font-size: 14px;
+ line-height: 1.5;
+}
+
+.stepCode {
+ background: var(--ifm-color-emphasis-900);
+ color: white;
+ padding: 10px 12px;
+ border-radius: 6px;
+ font-family: var(--ifm-font-family-monospace);
+ font-size: 13px;
+ margin-top: 4px;
+}
+
+/* Interactive Demo Styles */
+.interactiveContent {
+ background: linear-gradient(
+ 135deg,
+ var(--ifm-color-primary-lightest) 0%,
+ var(--ifm-background-color) 100%
+ );
+ border: 1px solid var(--ifm-color-primary-light);
+ border-radius: 8px;
+ padding: 20px;
+}
+
+/* Responsive Design */
+@media (max-width: 768px) {
+ .exerciseHeader {
+ padding: 12px 16px;
+ }
+
+ .headerLeft {
+ gap: 8px;
+ }
+
+ .typeIcon {
+ width: 32px;
+ height: 32px;
+ font-size: 16px;
+ }
+
+ .exerciseTitle {
+ font-size: 14px;
+ }
+
+ .exerciseContent {
+ padding: 16px;
+ }
+
+ .metadata {
+ gap: 6px;
+ }
+
+ .searchQuery code {
+ min-width: auto;
+ font-size: 13px;
+ padding: 6px 10px;
+ }
+
+ .stepItem {
+ gap: 8px;
+ }
+
+ .stepNumber {
+ width: 24px;
+ height: 24px;
+ font-size: 12px;
+ }
+}
+
+/* Dark mode support */
+[data-theme="dark"] .exerciseContainer {
+ background: var(--ifm-background-surface-color);
+ border-color: var(--ifm-color-emphasis-300);
+}
+
+[data-theme="dark"] .exerciseHeader {
+ background: linear-gradient(
+ 135deg,
+ var(--ifm-color-emphasis-200) 0%,
+ var(--ifm-background-surface-color) 100%
+ );
+ border-bottom-color: var(--ifm-color-emphasis-300);
+}
+
+[data-theme="dark"] .searchItem {
+ background: var(--ifm-color-emphasis-200);
+ border-color: var(--ifm-color-emphasis-300);
+}
+
+[data-theme="dark"] .searchItem:hover {
+ background: var(--ifm-color-emphasis-300);
+}
+
+[data-theme="dark"] .interactiveContent {
+ background: linear-gradient(
+ 135deg,
+ var(--ifm-color-primary-dark) 0%,
+ var(--ifm-background-surface-color) 100%
+ );
+}
diff --git a/docs-website/src/components/TutorialProgress/index.jsx b/docs-website/src/components/TutorialProgress/index.jsx
new file mode 100644
index 00000000000000..4e4c70568f29fd
--- /dev/null
+++ b/docs-website/src/components/TutorialProgress/index.jsx
@@ -0,0 +1,226 @@
+import React, { useState, useEffect } from "react";
+import { useHistory, useLocation } from "@docusaurus/router";
+import styles from "./styles.module.css";
+
+const TutorialProgress = ({
+ tutorialId,
+ steps,
+ currentStep,
+ compact = false,
+}) => {
+ const [completedSteps, setCompletedSteps] = useState(new Set());
+ const [isMinimized, setIsMinimized] = useState(false);
+ const [isScrolled, setIsScrolled] = useState(false);
+
+ // Handle both old and new formats
+ const actualTutorialId = tutorialId || "tutorial";
+ const actualCurrentStep =
+ typeof currentStep === "string" ? currentStep : `step-${currentStep}`;
+ const storageKey = `datahub-tutorial-${actualTutorialId}`;
+
+ // Load progress from localStorage on component mount
+ useEffect(() => {
+ const savedProgress = localStorage.getItem(storageKey);
+ if (savedProgress) {
+ try {
+ const parsed = JSON.parse(savedProgress);
+ setCompletedSteps(new Set(parsed));
+ } catch (e) {
+ console.warn("Failed to parse tutorial progress:", e);
+ }
+ }
+ }, [storageKey]);
+
+ // Save progress to localStorage whenever completedSteps changes
+ useEffect(() => {
+ localStorage.setItem(storageKey, JSON.stringify([...completedSteps]));
+ }, [completedSteps, storageKey]);
+
+ const toggleStep = (stepId) => {
+ setCompletedSteps((prev) => {
+ const newSet = new Set(prev);
+ if (newSet.has(stepId)) {
+ newSet.delete(stepId);
+ } else {
+ newSet.add(stepId);
+ // Auto-mark previous steps as completed
+ const stepIndex = parseInt(stepId.split("-")[1]);
+ for (let i = 0; i < stepIndex; i++) {
+ newSet.add(`step-${i}`);
+ }
+ }
+ return newSet;
+ });
+ };
+
+ const resetProgress = () => {
+ setCompletedSteps(new Set());
+ localStorage.removeItem(storageKey);
+ };
+
+ // Auto-mark current step as completed when user navigates
+ useEffect(() => {
+ if (currentStep !== undefined) {
+ setCompletedSteps((prev) => {
+ const newSet = new Set(prev);
+ newSet.add(actualCurrentStep);
+ return newSet;
+ });
+ }
+ }, [actualCurrentStep]);
+
+ // Handle scroll behavior for auto-minimizing
+ useEffect(() => {
+ const handleScroll = () => {
+ const scrollTop =
+ window.pageYOffset || document.documentElement.scrollTop;
+ setIsScrolled(scrollTop > 100); // Auto-minimize after scrolling 100px
+ };
+
+ window.addEventListener("scroll", handleScroll);
+ return () => window.removeEventListener("scroll", handleScroll);
+ }, []);
+
+ const toggleMinimized = () => {
+ setIsMinimized(!isMinimized);
+ };
+
+ const completionPercentage = Math.round(
+ (completedSteps.size / steps.length) * 100,
+ );
+
+  if (compact) {
+    return (
+      <div className={`${styles.tutorialProgress} ${styles.compact}`}>
+        <div className={styles.compactHeader}>
+          <span className={styles.compactTitle}>
+            📋 Progress: {completedSteps.size}/{steps.length}
+          </span>
+          <div className={styles.compactBar}>
+            <div
+              className={styles.progressFill}
+              style={{ width: `${completionPercentage}%` }}
+            />
+          </div>
+        </div>
+      </div>
+    );
+  }
+
+ // Determine if we should show minimized version
+ const shouldShowMinimized = isMinimized || isScrolled;
+
+  if (shouldShowMinimized) {
+    return (
+      <div
+        className={`${styles.tutorialProgress} ${styles.minimized} ${
+          isScrolled ? styles.scrolled : ""
+        }`}
+      >
+        <div className={styles.minimizedHeader} onClick={toggleMinimized}>
+          <div className={styles.minimizedContent}>
+            <span className={styles.minimizedTitle}>
+              📋 {completedSteps.size}/{steps.length} completed (
+              {completionPercentage}%)
+            </span>
+            <div className={styles.minimizedBar}>
+              <div
+                className={styles.progressFill}
+                style={{ width: `${completionPercentage}%` }}
+              />
+            </div>
+          </div>
+          <button className={styles.expandButton} title="Expand">
+            ⬇️
+          </button>
+        </div>
+      </div>
+    );
+  }
+
+  return (
+    <div className={styles.tutorialProgress}>
+      <div className={styles.header}>
+        <div className={styles.headerContent}>
+          <h4>📋 Tutorial Progress</h4>
+          <button
+            className={styles.minimizeButton}
+            onClick={toggleMinimized}
+            title="Minimize"
+          >
+            ⬆️
+          </button>
+        </div>
+        <div className={styles.progressBar}>
+          <div
+            className={styles.progressFill}
+            style={{ width: `${completionPercentage}%` }}
+          />
+          <span className={styles.progressText}>
+            {completedSteps.size} of {steps.length} completed (
+            {completionPercentage}%)
+          </span>
+        </div>
+      </div>
+
+      <div className={styles.stepsList}>
+        {steps.map((step, index) => {
+          // Handle both old format (step-${index}) and new format (step.id)
+          const stepId = step.id || `step-${index}`;
+          const isCompleted = completedSteps.has(stepId);
+          const isCurrent = actualCurrentStep === stepId;
+
+          return (
+            <div
+              key={stepId}
+              className={`${styles.step} ${isCurrent ? styles.current : ""}`}
+            >
+              <label className={styles.stepLabel}>
+                <input
+                  type="checkbox"
+                  checked={isCompleted}
+                  onChange={() => toggleStep(stepId)}
+                  className={styles.checkbox}
+                />
+                <span className={styles.checkmark}>
+                  {isCompleted ? "✅" : "⬜"}
+                </span>
+                <span className={styles.stepText}>
+                  <strong>{step.title || step.label}</strong>
+                  {step.time && (
+                    <span className={styles.time}>({step.time})</span>
+                  )}
+                  {isCurrent && (
+                    <span className={styles.currentBadge}>← You are here</span>
+                  )}
+                </span>
+              </label>
+              {step.description && (
+                <div className={styles.stepDescription}>
+                  {step.description}
+                </div>
+              )}
+            </div>
+          );
+        })}
+      </div>
+
+      <div className={styles.actions}>
+        <button className={styles.resetButton} onClick={resetProgress}>
+          🔄 Reset Progress
+        </button>
+        {completedSteps.size === steps.length && (
+          <div className={styles.completionMessage}>
+            🎉 <strong>Tutorial Complete!</strong> Great job finishing all
+            steps!
+          </div>
+        )}
+      </div>
+    </div>
+  );
+};
+
+export default TutorialProgress;
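+
+// Illustrative usage (a sketch only; the tutorialId and step entries are
+// hypothetical, not part of this change):
+//
+//   import TutorialProgress from '@site/src/components/TutorialProgress';
+//
+//   <TutorialProgress
+//     tutorialId="data-discovery-basics"
+//     currentStep="step-1"
+//     steps={[
+//       { id: "step-0", title: "Set up DataHub", time: "10 min" },
+//       { id: "step-1", title: "Search for a dataset", time: "5 min" },
+//       { id: "step-2", title: "Explore lineage", time: "10 min" },
+//     ]}
+//   />
+//
+// Progress is persisted per tutorial in localStorage under
+// `datahub-tutorial-<tutorialId>`; pass `compact` to render the slim inline bar.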
diff --git a/docs-website/src/components/TutorialProgress/styles.module.css b/docs-website/src/components/TutorialProgress/styles.module.css
new file mode 100644
index 00000000000000..b161d2e31301c6
--- /dev/null
+++ b/docs-website/src/components/TutorialProgress/styles.module.css
@@ -0,0 +1,383 @@
+.tutorialProgress {
+ background: var(--ifm-background-color);
+ border: 1px solid var(--ifm-color-emphasis-300);
+ border-radius: 12px;
+ padding: 24px;
+ margin: 32px 0;
+ font-family: var(--ifm-font-family-base);
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+ position: sticky;
+ top: 20px;
+ z-index: 10;
+}
+
+.header {
+ margin-bottom: 16px;
+}
+
+.header h4 {
+ margin: 0 0 16px 0;
+ color: var(--ifm-color-primary);
+ font-size: 18px;
+ font-weight: 700;
+ display: flex;
+ align-items: center;
+ gap: 8px;
+}
+
+.progressBar {
+ position: relative;
+ background: #e9ecef;
+ border-radius: 10px;
+ height: 20px;
+ overflow: hidden;
+}
+
+.progressFill {
+ background: linear-gradient(
+ 90deg,
+ var(--ifm-color-primary) 0%,
+ var(--ifm-color-primary-light) 100%
+ );
+ height: 100%;
+ border-radius: 10px;
+ transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1);
+}
+
+.progressText {
+ position: absolute;
+ top: 50%;
+ left: 50%;
+ transform: translate(-50%, -50%);
+ font-size: 13px;
+ font-weight: 600;
+ color: var(--ifm-color-content);
+ text-shadow: 0 0 3px var(--ifm-background-color);
+}
+
+.stepsList {
+ display: flex;
+ flex-direction: column;
+ gap: 12px;
+}
+
+.step {
+ padding: 12px;
+ border-radius: 6px;
+ transition: all 0.2s ease;
+}
+
+.step:hover {
+ background: rgba(0, 123, 255, 0.05);
+}
+
+.step.current {
+ background: var(--ifm-color-primary-lightest);
+ border-left: 4px solid var(--ifm-color-primary);
+ padding-left: 16px;
+ border-radius: 8px;
+}
+
+.stepLabel {
+ display: flex;
+ align-items: flex-start;
+ gap: 12px;
+ cursor: pointer;
+ margin: 0;
+}
+
+.checkbox {
+ display: none;
+}
+
+.checkmark {
+ font-size: 18px;
+ line-height: 1;
+ user-select: none;
+ transition: transform 0.1s ease;
+}
+
+.stepLabel:hover .checkmark {
+ transform: scale(1.1);
+}
+
+.stepText {
+ flex: 1;
+ line-height: 1.4;
+}
+
+.stepText strong {
+ color: #495057;
+ font-weight: 600;
+}
+
+.time {
+ color: #6c757d;
+ font-size: 14px;
+ margin-left: 8px;
+}
+
+.currentBadge {
+ color: var(--ifm-color-primary);
+ font-weight: 600;
+ font-size: 14px;
+ margin-left: 12px;
+ background: var(--ifm-color-primary-lightest);
+ padding: 2px 8px;
+ border-radius: 12px;
+ border: 1px solid var(--ifm-color-primary-light);
+}
+
+.stepDescription {
+ margin-top: 6px;
+ margin-left: 30px;
+ font-size: 14px;
+ color: #6c757d;
+ line-height: 1.4;
+}
+
+.actions {
+ margin-top: 20px;
+ padding-top: 16px;
+ border-top: 1px solid #e9ecef;
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+}
+
+.resetButton {
+ background: #f8f9fa;
+ border: 1px solid #dee2e6;
+ border-radius: 4px;
+ padding: 6px 12px;
+ font-size: 12px;
+ color: #6c757d;
+ cursor: pointer;
+ transition: all 0.2s ease;
+}
+
+.resetButton:hover {
+ background: #e9ecef;
+ border-color: #adb5bd;
+}
+
+.completionMessage {
+ color: var(--ifm-color-success);
+ font-weight: 600;
+ font-size: 14px;
+ background: var(--ifm-color-success-lightest);
+ padding: 12px 16px;
+ border-radius: 8px;
+ border: 1px solid var(--ifm-color-success-light);
+ text-align: center;
+}
+
+/* Dark mode support */
+[data-theme="dark"] .tutorialProgress {
+ background: #1e1e1e;
+ border-color: #444;
+ color: #e9ecef;
+}
+
+[data-theme="dark"] .header h4 {
+ color: #e9ecef;
+}
+
+[data-theme="dark"] .progressBar {
+ background: #444;
+}
+
+[data-theme="dark"] .progressText {
+ color: #e9ecef;
+ text-shadow: 0 0 3px rgba(0, 0, 0, 0.8);
+}
+
+[data-theme="dark"] .step:hover {
+ background: rgba(0, 123, 255, 0.15);
+}
+
+[data-theme="dark"] .step.current {
+ background: rgba(0, 123, 255, 0.2);
+}
+
+[data-theme="dark"] .stepText strong {
+ color: #e9ecef;
+}
+
+[data-theme="dark"] .actions {
+ border-top-color: #444;
+}
+
+[data-theme="dark"] .resetButton {
+ background: #2d2d2d;
+ border-color: #444;
+ color: #adb5bd;
+}
+
+[data-theme="dark"] .resetButton:hover {
+ background: #3d3d3d;
+ border-color: #555;
+}
+
+/* Compact mode styles */
+.compact {
+ position: relative;
+ top: auto;
+ margin: 16px 0;
+ padding: 16px;
+ background: var(--ifm-color-emphasis-100);
+ border: 1px solid var(--ifm-color-emphasis-200);
+}
+
+.compactHeader {
+ display: flex;
+ align-items: center;
+ gap: 16px;
+}
+
+.compactTitle {
+ font-weight: 600;
+ color: var(--ifm-color-content);
+ font-size: 14px;
+ white-space: nowrap;
+}
+
+.compactBar {
+ flex: 1;
+ height: 8px;
+ background: var(--ifm-color-emphasis-200);
+ border-radius: 4px;
+ overflow: hidden;
+}
+
+.compactBar .progressFill {
+ height: 100%;
+ border-radius: 4px;
+}
+
+[data-theme="dark"] .compact {
+ background: var(--ifm-color-emphasis-200);
+ border-color: var(--ifm-color-emphasis-300);
+}
+
+/* Header content layout */
+.headerContent {
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ margin-bottom: 12px;
+}
+
+/* Minimize/Expand buttons */
+.minimizeButton,
+.expandButton {
+ background: none;
+ border: none;
+ font-size: 16px;
+ cursor: pointer;
+ padding: 4px 8px;
+ border-radius: 4px;
+ transition: all 0.2s ease;
+ opacity: 0.7;
+}
+
+.minimizeButton:hover,
+.expandButton:hover {
+ opacity: 1;
+ background: var(--ifm-color-emphasis-100);
+}
+
+/* Minimized state styles */
+.minimized {
+ position: fixed;
+ top: 80px; /* Below the DataHub header banner */
+ right: 20px;
+ width: 300px;
+ z-index: 100; /* Lower z-index to stay below header */
+ margin: 0;
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15);
+ border-radius: 8px;
+ transition: all 0.3s ease;
+}
+
+.minimized.scrolled {
+ top: 70px; /* Slightly higher when scrolled but still below header */
+ width: 280px;
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
+}
+
+.minimizedHeader {
+ display: flex;
+ align-items: center;
+ justify-content: space-between;
+ padding: 12px 16px;
+ cursor: pointer;
+ transition: all 0.2s ease;
+}
+
+.minimizedHeader:hover {
+ background: var(--ifm-color-emphasis-50);
+}
+
+.minimizedContent {
+ flex: 1;
+ display: flex;
+ flex-direction: column;
+ gap: 8px;
+}
+
+.minimizedTitle {
+ font-weight: 600;
+ font-size: 14px;
+ color: var(--ifm-color-content);
+}
+
+.minimizedBar {
+ height: 6px;
+ background: var(--ifm-color-emphasis-200);
+ border-radius: 3px;
+ overflow: hidden;
+}
+
+.minimizedBar .progressFill {
+ height: 100%;
+ border-radius: 3px;
+}
+
+.expandButton {
+ margin-left: 12px;
+ font-size: 14px;
+}
+
+/* Dark mode adjustments for minimized state */
+[data-theme="dark"] .minimized {
+ background: var(--ifm-color-emphasis-200);
+ border-color: var(--ifm-color-emphasis-400);
+}
+
+[data-theme="dark"] .minimizedHeader:hover {
+ background: var(--ifm-color-emphasis-300);
+}
+
+[data-theme="dark"] .minimizedBar {
+ background: var(--ifm-color-emphasis-400);
+}
+
+/* Responsive design for minimized state */
+@media (max-width: 768px) {
+ .minimized {
+ position: relative;
+ top: auto;
+ right: auto;
+ width: 100%;
+ margin: 16px 0;
+ }
+
+ .minimized.scrolled {
+ position: fixed;
+ top: 60px; /* Account for mobile header */
+ left: 10px;
+ right: 10px;
+ width: auto;
+ }
+}
diff --git a/docs-website/src/css/custom.css b/docs-website/src/css/custom.css
index 0d842f3abdd266..9fb35fe41d9a34 100644
--- a/docs-website/src/css/custom.css
+++ b/docs-website/src/css/custom.css
@@ -58,4 +58,4 @@ body {
100% {
background-position: 0% 50%;
}
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/css/mermaid-custom.css b/docs-website/src/css/mermaid-custom.css
new file mode 100644
index 00000000000000..43a675279e3da2
--- /dev/null
+++ b/docs-website/src/css/mermaid-custom.css
@@ -0,0 +1,166 @@
+/* Enhanced Mermaid Diagram Styling for DataHub */
+
+/* Container styling */
+.mermaid {
+ background: var(--ifm-background-color);
+ border-radius: 12px;
+ padding: 20px;
+ margin: 24px 0;
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+ border: 1px solid var(--ifm-color-emphasis-200);
+ overflow: visible;
+}
+
+/* Dark mode adjustments */
+[data-theme="dark"] .mermaid {
+ background: var(--ifm-color-emphasis-100);
+ border-color: var(--ifm-color-emphasis-300);
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3);
+}
+
+/* Enhanced node styling */
+.mermaid .node rect,
+.mermaid .node circle,
+.mermaid .node ellipse,
+.mermaid .node polygon {
+ stroke-width: 2px;
+ filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.1));
+ transition: all 0.2s ease;
+}
+
+.mermaid .node:hover rect,
+.mermaid .node:hover circle,
+.mermaid .node:hover ellipse,
+.mermaid .node:hover polygon {
+ filter: drop-shadow(0 4px 8px rgba(0, 0, 0, 0.15));
+ transform: translateY(-1px);
+}
+
+/* Enhanced edge/arrow styling */
+.mermaid .edgePath path {
+ stroke-width: 2px;
+ filter: drop-shadow(0 1px 2px rgba(0, 0, 0, 0.1));
+}
+
+.mermaid .arrowheadPath {
+ fill: var(--ifm-color-primary);
+ stroke: var(--ifm-color-primary);
+}
+
+/* Text styling improvements */
+.mermaid .nodeLabel,
+.mermaid .edgeLabel {
+ font-family: var(--ifm-font-family-base);
+ font-weight: 500;
+ text-shadow: 0 1px 2px rgba(255, 255, 255, 0.8);
+}
+
+[data-theme="dark"] .mermaid .nodeLabel,
+[data-theme="dark"] .mermaid .edgeLabel {
+ text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8);
+}
+
+/* Cluster/subgraph styling */
+.mermaid .cluster rect {
+ fill: var(--ifm-color-primary-lightest);
+ stroke: var(--ifm-color-primary-light);
+ stroke-width: 2px;
+ stroke-dasharray: 5, 5;
+ rx: 8px;
+ ry: 8px;
+}
+
+/* Flowchart specific enhancements */
+.mermaid .flowchart-link {
+ stroke: var(--ifm-color-primary);
+ stroke-width: 2px;
+}
+
+/* Sequence diagram enhancements */
+.mermaid .actor {
+ fill: var(--ifm-color-primary-lightest);
+ stroke: var(--ifm-color-primary);
+ stroke-width: 2px;
+}
+
+.mermaid .messageLine0,
+.mermaid .messageLine1 {
+ stroke: var(--ifm-color-primary);
+ stroke-width: 2px;
+}
+
+/* Gantt chart enhancements */
+.mermaid .section0,
+.mermaid .section1,
+.mermaid .section2,
+.mermaid .section3 {
+ fill: var(--ifm-color-primary);
+ opacity: 0.8;
+}
+
+/* Git graph enhancements */
+.mermaid .commit-id,
+.mermaid .commit-msg,
+.mermaid .branch-label {
+ font-family: var(--ifm-font-family-monospace);
+ font-size: 12px;
+}
+
+/* State diagram enhancements */
+.mermaid .state-start circle,
+.mermaid .state-end circle {
+ fill: var(--ifm-color-primary);
+ stroke: var(--ifm-color-primary-dark);
+ stroke-width: 2px;
+}
+
+/* Journey diagram enhancements */
+.mermaid .journey-section {
+ fill: var(--ifm-color-primary-lightest);
+}
+
+/* Responsive design */
+@media (max-width: 768px) {
+ .mermaid {
+ padding: 16px;
+ margin: 16px 0;
+ font-size: 14px;
+ }
+}
+
+/* Animation for diagram loading */
+.mermaid {
+ animation: fadeInUp 0.5s ease-out;
+}
+
+@keyframes fadeInUp {
+ from {
+ opacity: 0;
+ transform: translateY(20px);
+ }
+ to {
+ opacity: 1;
+ transform: translateY(0);
+ }
+}
+
+/* Custom styling for tutorial-specific diagrams */
+.mermaid .tutorial-start {
+ fill: var(--ifm-color-success-lightest) !important;
+ stroke: var(--ifm-color-success) !important;
+}
+
+.mermaid .tutorial-end {
+ fill: var(--ifm-color-primary-lightest) !important;
+ stroke: var(--ifm-color-primary) !important;
+}
+
+.mermaid .tutorial-process {
+ fill: var(--ifm-color-info-lightest) !important;
+ stroke: var(--ifm-color-info) !important;
+}
+
+.mermaid .tutorial-decision {
+ fill: var(--ifm-color-warning-lightest) !important;
+ stroke: var(--ifm-color-warning) !important;
+}
diff --git a/docs-website/src/learn/_components/LearnItemCard/index.jsx b/docs-website/src/learn/_components/LearnItemCard/index.jsx
index 9c6b6cfdc98d87..545557d9e494b7 100644
--- a/docs-website/src/learn/_components/LearnItemCard/index.jsx
+++ b/docs-website/src/learn/_components/LearnItemCard/index.jsx
@@ -6,7 +6,8 @@ import styles from "./styles.module.scss";
export default function LearnItemCard() {
const { metadata } = useBlogPost();
- const { permalink, title, description, formattedDate, frontMatter } = metadata;
+ const { permalink, title, description, formattedDate, frontMatter } =
+ metadata;
return (
@@ -23,8 +24,10 @@ export default function LearnItemCard() {
{description}
- Published on {formattedDate}
+
+ Published on {formattedDate}
+
);
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/learn/_components/LearnItemCard/styles.module.scss b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss
index 2bfaabdc06d498..35a5c93c348c26 100644
--- a/docs-website/src/learn/_components/LearnItemCard/styles.module.scss
+++ b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss
@@ -50,4 +50,4 @@
width: 100%;
height: auto;
}
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/learn/_components/LearnListPage/index.jsx b/docs-website/src/learn/_components/LearnListPage/index.jsx
index 1ceec9afa1e8a3..4fa75be98dc62f 100644
--- a/docs-website/src/learn/_components/LearnListPage/index.jsx
+++ b/docs-website/src/learn/_components/LearnListPage/index.jsx
@@ -2,7 +2,11 @@ import React, { useState } from "react";
import clsx from "clsx";
import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
-import { PageMetadata, HtmlClassNameProvider, ThemeClassNames } from "@docusaurus/theme-common";
+import {
+ PageMetadata,
+ HtmlClassNameProvider,
+ ThemeClassNames,
+} from "@docusaurus/theme-common";
import BlogListPaginator from "@theme/BlogListPaginator";
import SearchMetadata from "@theme/SearchMetadata";
import { BlogPostProvider } from "@docusaurus/theme-common/internal";
@@ -30,10 +34,20 @@ function BlogListPageContent(props) {
const { metadata, items } = props;
const [activeFilters, setActiveFilters] = useState([]);
// These are currently hardcoded, check the frontmatter of the blog posts to see what audiences are available
- const audiences = ["Data Governance Leads", "Data Engineers", "Data Architects", "Data Platform Leads", "Data Analysts"];
+ const audiences = [
+ "Data Governance Leads",
+ "Data Engineers",
+ "Data Architects",
+ "Data Platform Leads",
+ "Data Analysts",
+ ];
const filteredItems = activeFilters?.length
- ? (items || []).filter((post) => activeFilters.some((activeFilter) => post?.content?.frontMatter?.audience?.some((a) => a === activeFilter)))
+ ? (items || []).filter((post) =>
+ activeFilters.some((activeFilter) =>
+ post?.content?.frontMatter?.audience?.some((a) => a === activeFilter),
+ ),
+ )
: items;
const handleFilterToggle = (audience) => {
@@ -51,14 +65,19 @@ function BlogListPageContent(props) {
DataHub Learn
-
Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.
+
+ Learn about the hot topics in the data ecosystem and how DataHub
+ can help you with your data journey.
+
For:
{audiences.map((audience) => (
handleFilterToggle(audience)}
key={audience}
>
@@ -71,7 +90,10 @@ function BlogListPageContent(props) {
{(filteredItems || []).map(({ content: BlogPostContent }) => (
-
+
))}
@@ -84,9 +106,14 @@ function BlogListPageContent(props) {
export default function BlogListPage(props) {
return (
-
+
);
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/learn/business-glossary.md b/docs-website/src/learn/business-glossary.md
index 2c882f6adef7a4..57fbcbd0622b78 100644
--- a/docs-website/src/learn/business-glossary.md
+++ b/docs-website/src/learn/business-glossary.md
@@ -24,17 +24,17 @@ A Business Glossary is like a dictionary for your company. It contains definitio
For example, below are some sales-related glossary terms that can be used in an IT company.
-| Term | Definition | Usage |
-| --- | --- | --- |
-| CRM (Customer Relationship Management) | Software that manages a company's interactions with current and potential customers. | CRMs help streamline processes and improve customer relationships. |
-| Lead | A potential customer who has shown interest in a company's product or service. | Leads are nurtured by the sales team to convert into customers. |
-| Pipeline | The stages through which a sales prospect moves from initial contact to final sale. | Sales pipelines track progress and forecast future sales. |
-| Quota | A sales target set for a salesperson or team for a specific period. | Quotas motivate sales teams and measure performance. |
-| Conversion Rate | The percentage of leads that turn into actual sales. | High conversion rates indicate effective sales strategies. |
-| Upselling | Encouraging customers to purchase a more expensive or upgraded version of a product. | Upselling increases revenue by enhancing the customer purchase. |
-| Churn Rate | The percentage of customers who stop using a product or service over a given period. | Reducing churn rate is crucial for maintaining steady growth. |
-| MQL (Marketing Qualified Lead) | A lead that has been deemed more likely to become a customer based on marketing efforts. | MQLs are passed from the marketing team to the sales team for further nurturing. |
-| ARR (Annual Recurring Revenue) | The amount of revenue that a company expects to receive from its customers on an annual basis for subscriptions. | ARR helps in financial forecasting and performance measurement. |
+| Term | Definition | Usage |
+| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- |
+| CRM (Customer Relationship Management) | Software that manages a company's interactions with current and potential customers. | CRMs help streamline processes and improve customer relationships. |
+| Lead | A potential customer who has shown interest in a company's product or service. | Leads are nurtured by the sales team to convert into customers. |
+| Pipeline | The stages through which a sales prospect moves from initial contact to final sale. | Sales pipelines track progress and forecast future sales. |
+| Quota | A sales target set for a salesperson or team for a specific period. | Quotas motivate sales teams and measure performance. |
+| Conversion Rate | The percentage of leads that turn into actual sales. | High conversion rates indicate effective sales strategies. |
+| Upselling | Encouraging customers to purchase a more expensive or upgraded version of a product. | Upselling increases revenue by enhancing the customer purchase. |
+| Churn Rate | The percentage of customers who stop using a product or service over a given period. | Reducing churn rate is crucial for maintaining steady growth. |
+| MQL (Marketing Qualified Lead) | A lead that has been deemed more likely to become a customer based on marketing efforts. | MQLs are passed from the marketing team to the sales team for further nurturing. |
+| ARR (Annual Recurring Revenue) | The amount of revenue that a company expects to receive from its customers on an annual basis for subscriptions. | ARR helps in financial forecasting and performance measurement. |
## What is Business Glossary Standardization?
@@ -61,17 +61,17 @@ Imagine a financial services company where different teams use varied terminolog
Here's how different teams might interpret CLV and the potential implications:
-| Team | Interpretation of CLV | Focus | Implications |
-| --- | --- | --- | --- |
-| Marketing | Total revenue generated from a customer over their entire relationship with the company | Campaign effectiveness, customer acquisition costs, return on marketing investment | Revenue maximization through frequent promotions, potentially ignoring the cost of service and risk associated with certain customer segments |
-| Sales | Projected future sales from a customer based on past purchasing behavior | Sales targets, customer retention, cross-selling/up-selling opportunities | Aggressive sales tactics to boost short-term sales, potentially leading to customer churn if the value delivered does not meet |
-| Finance | Net present value (NPV), factoring in the time value of money and associated costs over the customer relationship period | Profitability, cost management, financial forecasting | Conservative growth strategies, focusing on high-value, low-risk customers, potentially overlooking opportunities for broader market expansion |
+| Team | Interpretation of CLV | Focus | Implications |
+| --------- | ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| Marketing | Total revenue generated from a customer over their entire relationship with the company | Campaign effectiveness, customer acquisition costs, return on marketing investment | Revenue maximization through frequent promotions, potentially ignoring the cost of service and risk associated with certain customer segments |
+| Sales | Projected future sales from a customer based on past purchasing behavior | Sales targets, customer retention, cross-selling/up-selling opportunities | Aggressive sales tactics to boost short-term sales, potentially leading to customer churn if the value delivered does not meet |
+| Finance | Net present value (NPV), factoring in the time value of money and associated costs over the customer relationship period | Profitability, cost management, financial forecasting | Conservative growth strategies, focusing on high-value, low-risk customers, potentially overlooking opportunities for broader market expansion |
- Different interpretations can lead to conflicting strategies and objectives across teams. For instance, Marketing’s aggressive acquisition strategy may lead to a significant increase in new customers and short-term revenue. However, if Finance’s NPV analysis reveals that these customers are not profitable long-term, the company may face financial strain due to high acquisition costs and low profitability.
+Different interpretations can lead to conflicting strategies and objectives across teams. For instance, Marketing’s aggressive acquisition strategy may lead to a significant increase in new customers and short-term revenue. However, if Finance’s NPV analysis reveals that these customers are not profitable long-term, the company may face financial strain due to high acquisition costs and low profitability.
- The Sales team’s push for upselling may generate short-term sales increases, aligning with their CLV projections. However, if customers feel pressured and perceive the upsells as unnecessary, this could lead to dissatisfaction and higher churn rates, ultimately reducing the actual lifetime value of these customers.
+The Sales team’s push for upselling may generate short-term sales increases, aligning with their CLV projections. However, if customers feel pressured and perceive the upsells as unnecessary, this could lead to dissatisfaction and higher churn rates, ultimately reducing the actual lifetime value of these customers.
- The conflicting strategies can result in misaligned priorities, where Marketing focuses on volume, Sales on immediate revenue, and Finance on long-term profitability. This misalignment can lead to inefficient resource allocation, where Marketing spends heavily on acquisition, Sales focuses on short-term gains, and Finance restricts budgets due to profitability concerns.
+The conflicting strategies can result in misaligned priorities, where Marketing focuses on volume, Sales on immediate revenue, and Finance on long-term profitability. This misalignment can lead to inefficient resource allocation, where Marketing spends heavily on acquisition, Sales focuses on short-term gains, and Finance restricts budgets due to profitability concerns.
### Example Discovery Questions
@@ -101,14 +101,12 @@ DataHub Cloud offers comprehensive features designed to support the authoring of
- **[Centralized Business Glossary](https://docs.datahub.com/docs/glossary/business-glossary):** A repository for all business terms and definitions, ensuring consistency across the organization.
-
Approval Flows
-
- **[Approval Flows](https://docs.datahub.com/docs/managed-datahub/approval-workflows):** Structured workflows for approving changes to the glossary, maintaining quality and consistency through time
- **Automated Data Classification:** Tools to tag critical data assets - tables, columns, dashboards, and pipelines - with terms from the business glossary using automations and custom rules.
@@ -117,4 +115,4 @@ By implementing these solutions, you can ensure that your business terminology i
## Conclusion
-Standardizing your business glossary is essential for maintaining consistency, ensuring compliance, and optimizing data use. By implementing best practices and leveraging advanced tools, you can achieve a more efficient and reliable data management process. This investment will lead to better decision-making, reduced compliance risks, and a more cohesive organizational understanding of data.
\ No newline at end of file
+Standardizing your business glossary is essential for maintaining consistency, ensuring compliance, and optimizing data use. By implementing best practices and leveraging advanced tools, you can achieve a more efficient and reliable data management process. This investment will lead to better decision-making, reduced compliance risks, and a more cohesive organizational understanding of data.
diff --git a/docs-website/src/learn/business-metric.md b/docs-website/src/learn/business-metric.md
index 2e4a83f9181ceb..08bfac437ea119 100644
--- a/docs-website/src/learn/business-metric.md
+++ b/docs-website/src/learn/business-metric.md
@@ -34,7 +34,7 @@ Standardizing business metrics is crucial because these metrics are direct indic
### Real-World Impact
-Consider a scenario where the finance team defines revenue differently from the product team. If these discrepancies are not reconciled, it could lead to conflicting reports and misguided strategies. For instance, a marketing campaign analyzed with inconsistent metrics might appear successful in one report and unsuccessful in another, causing confusion and potentially leading to incorrect strategic decisions. Disagreements about the source-of-truth or accuracy of a given metric are commonplace; perhaps you can recall some examples from your own experience.
+Consider a scenario where the finance team defines revenue differently from the product team. If these discrepancies are not reconciled, it could lead to conflicting reports and misguided strategies. For instance, a marketing campaign analyzed with inconsistent metrics might appear successful in one report and unsuccessful in another, causing confusion and potentially leading to incorrect strategic decisions. Disagreements about the source-of-truth or accuracy of a given metric are commonplace; perhaps you can recall some examples from your own experience.
### Example Discovery Questions and Explanations
@@ -50,7 +50,7 @@ Start by identifying key business metrics that are actively used to power decisi
### Alternatives and Best Practices
-Some companies try to align metric definitions through emails and meetings. While this is a good place to start, it is often impractical at scale. Instead, best practices involve using a centralized system for defining and discovering key business metrics. Implementing approval flows and lineage tracking can ensure that all changes are reviewed and that the physical origins of a metric - e.g. the actual tables and rows that power it - are immediately clear. By making metrics centrally visible, you can begin to establish accountability and audibility around your key metrics, increasing their reliability through time and improving the quality of your decisions.
+Some companies try to align metric definitions through emails and meetings. While this is a good place to start, it is often impractical at scale. Instead, best practices involve using a centralized system for defining and discovering key business metrics. Implementing approval flows and lineage tracking can ensure that all changes are reviewed and that the physical origins of a metric - e.g. the actual tables and rows that power it - are immediately clear. By making metrics centrally visible, you can begin to establish accountability and auditability around your key metrics, increasing their reliability through time and improving the quality of your decisions.
### Our Solution
@@ -62,7 +62,6 @@ DataHub Cloud offers comprehensive features designed to tackle the challenges of
Business Glossary Center
-
- **[Business Glossary](https://docs.datahub.com/docs/glossary/business-glossary):** A centralized repository for all metrics definitions, ensuring consistency across the organization.
@@ -85,4 +84,4 @@ By implementing these solutions, you can ensure that your business metrics are c
### Conclusion
-Defining and standardizing business metrics is essential for ensuring consistent, accurate, and reliable data analysis and decision-making within an organization. By implementing best practices and leveraging advanced tools like our product’s business glossary, approval flows, and lineage tracking, you can achieve a more cohesive and efficient approach to managing business metrics. This investment will lead to better insights, more informed decisions, and ultimately, a more successful data-driven organization.
\ No newline at end of file
+Defining and standardizing business metrics is essential for ensuring consistent, accurate, and reliable data analysis and decision-making within an organization. By implementing best practices and leveraging advanced tools like our product’s business glossary, approval flows, and lineage tracking, you can achieve a more cohesive and efficient approach to managing business metrics. This investment will lead to better insights, more informed decisions, and ultimately, a more successful data-driven organization.
diff --git a/docs-website/src/learn/data-freshness.md b/docs-website/src/learn/data-freshness.md
index b53bd1a1c9a6b5..84e7dba5722a1f 100644
--- a/docs-website/src/learn/data-freshness.md
+++ b/docs-website/src/learn/data-freshness.md
@@ -20,9 +20,9 @@ Have you ever experienced delays in delivering tables that or machine learning (
## What is Data Freshness?
-Data freshness refers to the timeliness and completeness of data used to build tables and ML models. Specifically, freshness can be measured by the difference in time between when some event *actually occurs* vs when that record of that event is reflected in a dataset or used to train an AI model.
+Data freshness refers to the timeliness and completeness of data used to build tables and ML models. Specifically, freshness can be measured by the difference in time between when some event _actually occurs_ vs when that record of that event is reflected in a dataset or used to train an AI model.
-To make things concrete, let’s imagine you run an e-commerce business selling t-shirts. When a user clicks the final “purchase” button to finalize a purchase, this interaction is recorded, eventually winding up in a consolidated “click_events” table on your data warehouse. Data freshness in this case could be measured by comparing when the actual click was performed against when the record of the click landed in the data warehouse. In reality, freshness can be measured against any reference point - e.g. event time, ingestion time, or something else - in relation to when a target table, model, or other data product is updated with new data.
+To make things concrete, let’s imagine you run an e-commerce business selling t-shirts. When a user clicks the final “purchase” button to finalize a purchase, this interaction is recorded, eventually winding up in a consolidated “click_events” table on your data warehouse. Data freshness in this case could be measured by comparing when the actual click was performed against when the record of the click landed in the data warehouse. In reality, freshness can be measured against any reference point - e.g. event time, ingestion time, or something else - in relation to when a target table, model, or other data product is updated with new data.
@@ -30,15 +30,15 @@ To make things concrete, let’s imagine you run an e-commerce business selling
Data Freshness
-Oftentimes, data pipelines are designed in order meet some well-defined availability latency, or data freshness SLA, with the specifics of this type of agreement dictating how and when the data pipeline is triggered to run.
+Oftentimes, data pipelines are designed in order to meet some well-defined availability latency, or data freshness SLA, with the specifics of this type of agreement dictating how and when the data pipeline is triggered to run.
-In the modern data landscape, ensuring that data is up-to-date is vital for building high-quality data products, from reporting dashboards used to drive day-to-day company decisions to personalized and dynamic data- or AI-powered product experiences.
+In the modern data landscape, ensuring that data is up-to-date is vital for building high-quality data products, from reporting dashboards used to drive day-to-day company decisions to personalized and dynamic data- or AI-powered product experiences.
## Why Data Freshness Matters
-For many organizations, fresh data is more than a ‘nice to have’.
+For many organizations, fresh data is more than a ‘nice to have’.
-Mission-critical ML models, like those used for price prediction or fraud detection, depend heavily on fresh data to make accurate predictions. Delays in updating these models can lead to lost revenue and damage to your company's reputation.
+Mission-critical ML models, like those used for price prediction or fraud detection, depend heavily on fresh data to make accurate predictions. Delays in updating these models can lead to lost revenue and damage to your company's reputation.
Customer-facing data products, for example recommendation features, also need timely updates to ensure that customers receive the most recent and relevant information personalized to them. Delays in data freshness can result in customer frustration, user churn, and loss of trust.
@@ -53,7 +53,7 @@ Can you recall examples when your organization faced challenges in maintaining t
Because data is highly interconnected, delays in data freshness can lead to cascading problems, particularly of your organization lacks a robust system for identifying and resolving such problems. How does your organization prioritize and manage such incidents? Processes for quickly identifying and resolving root causes are essential for minimizing negative impacts on revenue and reputation.
**Automated Freshness Monitoring:**
-
+
If data freshness problems often go undetected for long periods of time, there may be opportunities to automate the detection of such problems for core tables and AI models so that your team is first to know when something goes wrong.
## How to Ensure Data Freshness
@@ -76,21 +76,20 @@ Establish clear protocols for incident management to prioritize and resolve data
### Alternatives
-While manual investigation and communication using tools like Slack can help triage issues, they often result in time-consuming, inefficient, and informal processes for addressing data quality issues related to freshness, ultimately leading to lower quality outcomes. Automated freshness incident detection and structured incident management via dedicated data monitoring tools can help improve the situation by providing a single place for detecting, communicating, and coordinating to resolve data freshness issues.
+While manual investigation and communication using tools like Slack can help triage issues, they often result in time-consuming, inefficient, and informal processes for addressing data quality issues related to freshness, ultimately leading to lower quality outcomes. Automated freshness incident detection and structured incident management via dedicated data monitoring tools can help improve the situation by providing a single place for detecting, communicating, and coordinating to resolve data freshness issues.
### How DataHub Can Help
DataHub offers comprehensive features designed to tackle data freshness challenges:
-
**[End-To-End Data Lineage](https://docs.datahub.com/docs/features/feature-guides/lineage) and [Impact Analysis](https://docs.datahub.com/docs/act-on-metadata/impact-analysis):** Easily track the flow of data through your organization to identify, debug, and resolve delays quickly.
+
Data Lineage
-
**Freshness Monitoring & Alerting:** Automatically detect and alert when data freshness issues occur, to ensure timely updates by proactively monitoring key datasets for updates. Check out [Assertions](https://docs.datahub.com/docs/managed-datahub/observe/assertions) and [Freshness Assertions](https://docs.datahub.com/docs/managed-datahub/observe/freshness-assertions), Available in **DataHub Cloud Only.**
@@ -99,23 +98,20 @@ DataHub offers comprehensive features designed to tackle data freshness challeng
Freshness Assertions Results
-
Smart assertions checks for changes on a cadence based on the Table history, by default using the Audit Log.
-
**[Incident Management](https://docs.datahub.com/docs/incidents/incidents)** : Centralize data incident management and begin to effectively triage, prioritize, communicate and resolve data freshness issues to all relevant stakeholders. Check out [subscription & notification](https://docs.datahub.com/docs/managed-datahub/subscription-and-notification) features as well.
-
-By implementing these solutions, you can ensure that your key datasets and models are always up-to-date, maintaining their relevancy, accuracy, and reliability for critical use cases within your organization.
+By implementing these solutions, you can ensure that your key datasets and models are always up-to-date, maintaining their relevancy, accuracy, and reliability for critical use cases within your organization.
## Conclusion
-Ensuring data freshness is essential for the performance and reliability of critical datasets and AI/ML models. By understanding the importance of data freshness and implementing best practices and automated solutions, you can effectively manage and mitigate delays, thereby protecting your revenue and reputation. DataHub is designed to help you achieve this, providing the tools and features necessary to keep your data fresh and your operations running smoothly.
\ No newline at end of file
+Ensuring data freshness is essential for the performance and reliability of critical datasets and AI/ML models. By understanding the importance of data freshness and implementing best practices and automated solutions, you can effectively manage and mitigate delays, thereby protecting your revenue and reputation. DataHub is designed to help you achieve this, providing the tools and features necessary to keep your data fresh and your operations running smoothly.
diff --git a/docs-website/src/learn/data-mesh.md b/docs-website/src/learn/data-mesh.md
index eb3cb23f419548..95182995c3c28e 100644
--- a/docs-website/src/learn/data-mesh.md
+++ b/docs-website/src/learn/data-mesh.md
@@ -1,7 +1,8 @@
---
title: "What is a Data Mesh and How to Implement It in Your Organization"
description: Learn how a data mesh aligns data management with domain expertise, enhancing overall organizational agility.
-tags: ["Data Mesh", "Use Case", "For Data Architects", "For Data Platform Leads"]
+tags:
+ ["Data Mesh", "Use Case", "For Data Architects", "For Data Platform Leads"]
image: /img/learn/use-case-data-mesh.png
hide_table_of_contents: false
audience: ["Data Architects", "Data Platform Leads"]
@@ -20,7 +21,7 @@ Have you faced challenges in managing decentralized data across various business
## What is Data Mesh?
-Data Mesh is a decentralized data architecture that shifts the responsibility of data management from a central team to individual business units, or "domains." Each domain in turn produces “data products”, or consumable data artifacts, ensuring that data management is closely aligned with domain-specific expertise. This approach promotes agility, scalability, and the ability to generate insights more effectively.
+Data Mesh is a decentralized data architecture that shifts the responsibility of data management from a central team to individual business units, or "domains." Each domain in turn produces “data products”, or consumable data artifacts, ensuring that data management is closely aligned with domain-specific expertise. This approach promotes agility, scalability, and the ability to generate insights more effectively.
If you’re familiar with [Service-Oriented Architectures](https://en.wikipedia.org/wiki/Service-oriented_architecture), i.e. micro-services, this might sound familiar. Data Mesh is a somewhat analogous concept, but applied to data!
@@ -30,13 +31,12 @@ If you’re familiar with [Service-Oriented Architectures](https://en.wikipedia.
4 Principles of Data Mesh
-
-| Principle | Explanation |
-| --- | --- |
-| Domain Data Ownership | Organizing data into explicit domains based on the structure of your organization, and then assigning clear accountability to each. This enables you to more easily increase the number of sources of data, variety of use cases, and diversity of access models to the data increases. |
-| Data as a product | Domain data should be highly accessible and highly reliable by default. It should be easy to discover, easy to understand, easy to access securely, and high quality. |
-| Self-Service | Domain teams should be able to independently create, consume, and manage data products on top of a general-purpose platform that can hide the complexity of building, executing and maintaining secure and interoperable data products. |
-| Federated Governance | Consistent standards that are enforced by process and technology around interoperability, compliance, and quality. This makes it easy for data consumers to interact with data products across domains in familiar way and ensures quality is maintained uniformly. |
+| Principle | Explanation |
+| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Domain Data Ownership | Organizing data into explicit domains based on the structure of your organization, and then assigning clear accountability to each. This makes it easier to grow the number of data sources, the variety of use cases, and the diversity of access models to the data. |
+| Data as a product | Domain data should be highly accessible and highly reliable by default. It should be easy to discover, easy to understand, easy to access securely, and high quality. |
+| Self-Service | Domain teams should be able to independently create, consume, and manage data products on top of a general-purpose platform that can hide the complexity of building, executing and maintaining secure and interoperable data products. |
+| Federated Governance | Consistent standards that are enforced by process and technology around interoperability, compliance, and quality. This makes it easy for data consumers to interact with data products across domains in a familiar way and ensures quality is maintained uniformly. |
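To make "data as a product" concrete, here is an illustrative descriptor of what a domain team commits to when it publishes a data product: ownership, a reliability promise, a schema, and quality checks. This is a sketch with placeholder field names, not DataHub's data product spec:

```javascript
// Illustrative only — field names are placeholders, not DataHub's spec.
const ordersDataProduct = {
  domain: "e-commerce",               // Domain Data Ownership: who is accountable
  name: "orders_daily",
  owner: "orders-team@example.com",
  sla: { freshnessHours: 24 },        // reliability promise to consumers
  schema: [
    { name: "order_id", type: "string" },
    { name: "order_total", type: "double" },
  ],
  qualityChecks: ["order_id is not null", "updated within 24h"],
};
```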
@@ -44,15 +44,13 @@ If you’re familiar with [Service-Oriented Architectures](https://en.wikipedia.
Logical architecture of data mesh approach, Image Credit: Zhamak Dehghani
-
-
## Why Implement Data Mesh?
-For data architects and data platform leads, implementing a Data Mesh can resolve various challenges associated with managing decentralized data, particularly as you try to scale up.
+For data architects and data platform leads, implementing a Data Mesh can resolve various challenges associated with managing decentralized data, particularly as you try to scale up.
-Traditional data lakes or warehouses can become central bottlenecks, impairing access, understanding, accountability, and quality of data - ultimately, its usability. These architectures can struggle to meet the diverse needs of different business units, leading to inefficiencies.
+Traditional data lakes or warehouses can become central bottlenecks, impairing access, understanding, accountability, and quality of data - ultimately, its usability. These architectures can struggle to meet the diverse needs of different business units, leading to inefficiencies.
-Data Mesh addresses these issues by formally dividing data into decentralized domains, which are owned by the individual teams who are experts in those domains. This approach allows each business unit or domain to manage its own data, enabling independent creation and consumption of data and increasing the agility, reliability, scalability of an organization’s data practice.
+Data Mesh addresses these issues by formally dividing data into decentralized domains, which are owned by the individual teams who are experts in those domains. This approach allows each business unit or domain to manage its own data, enabling independent creation and consumption of data and increasing the agility, reliability, and scalability of an organization’s data practice.
### Key Considerations for Your Organization
@@ -86,7 +84,7 @@ Adopt a federated governance model to balance autonomy and control. While domain
### Alternatives
-While a centralized data lake or warehouse can simplify data governance by virtue of keeping everything in one place, it can become a bottleneck as your data organization grows. Decentralized Data Mesh can provide a more scalable and agile approach, by distributing day-to-day responsibility for accessing, producing, and validating data while enforcing a centralized set of standards and processes.
+While a centralized data lake or warehouse can simplify data governance by virtue of keeping everything in one place, it can become a bottleneck as your data organization grows. Decentralized Data Mesh can provide a more scalable and agile approach, by distributing day-to-day responsibility for accessing, producing, and validating data while enforcing a centralized set of standards and processes.
### Our Solution
@@ -95,7 +93,6 @@ DataHub Cloud offers a comprehensive set of features designed to support the imp
- **[Data Domains](https://docs.datahub.com/docs/domains)**: Clearly define and manage data products within each business unit.
- **[Data Products](https://docs.datahub.com/docs/dataproducts):** Ensure each domain owns and manages its data products, promoting autonomy and agility.
- **[Data Contracts](https://docs.datahub.com/docs/managed-datahub/observe/data-contract)**: Establish clear agreements between domains to ensure consistency and reliability.
-
@@ -103,19 +100,14 @@ DataHub Cloud offers a comprehensive set of features designed to support the imp
Data Contracts in DataHub Cloud UI
-
-
- **[Assertions](https://docs.datahub.com/docs/managed-datahub/observe/assertions)**: Monitor data quality using freshness, volume, column validity, schema, and custom SQL checks so you get notified first when things go wrong.
-
Assertion Results
-
-
- **[Metadata Tests](https://docs.datahub.com/docs/tests/metadata-tests)**: Monitor and enforce a central set of standards or policies across all of your data assets, e.g. to ensure data documentation, data ownership, and data classification.
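As an illustration of the kind of central policy a metadata test enforces, the sketch below checks that every asset carries an owner, documentation, and a classification. This is a sketch, not DataHub's metadata test DSL:

```javascript
// Illustrative policy check — not DataHub's metadata test DSL.
function checkGovernancePolicy(dataset) {
  const violations = [];
  if (!dataset.owners || dataset.owners.length === 0) violations.push("missing owner");
  if (!dataset.description) violations.push("missing documentation");
  if (!dataset.classification) violations.push("missing data classification");
  return { urn: dataset.urn, passing: violations.length === 0, violations };
}
```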
@@ -128,4 +120,4 @@ By implementing these solutions, you can effectively manage decentralized data,
## Conclusion
-Implementing a Data Mesh can significantly improve your organization's ability to manage and leverage decentralized data. By understanding the benefits of data mesh and following best practices for implementation, you can overcome the limitations of centralized data systems and enhance your agility, scalability, and ability to generate insights. DataHub Cloud was built from the ground up to help you achieve this, providing the tools and features necessary to implement a large-scale Data Mesh successfully.
\ No newline at end of file
+Implementing a Data Mesh can significantly improve your organization's ability to manage and leverage decentralized data. By understanding the benefits of data mesh and following best practices for implementation, you can overcome the limitations of centralized data systems and enhance your agility, scalability, and ability to generate insights. DataHub Cloud was built from the ground up to help you achieve this, providing the tools and features necessary to implement a large-scale Data Mesh successfully.
diff --git a/docs-website/src/learn/data-pipeline.md b/docs-website/src/learn/data-pipeline.md
index 8f7f75f8b16ba9..0307b56884f97f 100644
--- a/docs-website/src/learn/data-pipeline.md
+++ b/docs-website/src/learn/data-pipeline.md
@@ -22,7 +22,6 @@ Have you ever been frustrated by slow and unreliable data pipelines or unexpecte
A data pipeline is a series of processes that move data from one system to another - a key component in the supply chain for data. Think of it like a conveyor belt in a factory, transporting raw materials to different stations where they are processed into the final product. In the context of data, pipelines extract, transform, and load data (ETL) from various sources to destinations like data warehouses, ensuring the data is ready for analysis and use in applications such as machine learning models and business intelligence dashboards.
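For orientation, those stages reduce to something like the sketch below; real pipelines run stages like these on a scheduler and at much larger scale. The `extract`, `transform`, and `load` callbacks are placeholders for your own source, logic, and destination:

```javascript
// Toy extract-transform-load run, just to anchor the terminology.
async function runPipeline({ extract, transform, load }) {
  const raw = await extract();        // pull records from a source system
  const cleaned = raw.map(transform); // reshape / validate each record
  await load(cleaned);                // write to the warehouse or serving layer
  return { rowsProcessed: cleaned.length };
}
```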
-
@@ -33,7 +32,7 @@ A data pipeline is a series of processes that move data from one system to anoth
### The Problem
-Over time, data pipelines can slow down or become unreliable due to new dependencies, application code bugs, and poorly optimized queries, leading to missed data freshness SLAs and increased cloud costs. For data engineers, this means more time spent on manual debugging and justifying costs to your executives.
+Over time, data pipelines can slow down or become unreliable due to new dependencies, application code bugs, and poorly optimized queries, leading to missed data freshness SLAs and increased cloud costs. For data engineers, this means more time spent on manual debugging and justifying costs to your executives.
### Importance
@@ -83,8 +82,6 @@ DataHub Cloud offers comprehensive features designed to optimize data pipelines:
By implementing these solutions, you can ensure that your data pipelines are running efficiently, meeting delivery SLAs, and staying within budget.
-
-
## Conclusion
-Optimizing data pipelines is essential for maintaining data reliability, controlling costs, and ultimately ensuring your business continues to run smoothly. By implementing best practices and leveraging advanced tools like our product’s lineage tracking and automated monitoring features, you can achieve efficient and cost-effective data pipelines. Investing time and resources into optimization will ultimately lead to better performance, lower costs, and more satisfied stakeholders.
\ No newline at end of file
+Optimizing data pipelines is essential for maintaining data reliability, controlling costs, and ultimately ensuring your business continues to run smoothly. By implementing best practices and leveraging advanced tools like our product’s lineage tracking and automated monitoring features, you can achieve efficient and cost-effective data pipelines. Investing time and resources into optimization will ultimately lead to better performance, lower costs, and more satisfied stakeholders.
diff --git a/docs-website/src/pages/champions.js b/docs-website/src/pages/champions.js
index 1be11ca2be41e2..b708e964e5501c 100644
--- a/docs-website/src/pages/champions.js
+++ b/docs-website/src/pages/champions.js
@@ -5,4 +5,4 @@ export default function Home() {
window.location.href = "https://www.datahub.com/champions";
}, []);
return null;
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/pages/datahub-components-demo.md b/docs-website/src/pages/datahub-components-demo.md
new file mode 100644
index 00000000000000..3facb320e160e4
--- /dev/null
+++ b/docs-website/src/pages/datahub-components-demo.md
@@ -0,0 +1,170 @@
+# DataHub UI Components Demo
+
+This page demonstrates the DataHub-style UI components that can be embedded in tutorials to provide an authentic DataHub experience.
+
+import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard';
+import DataHubLineageNode, { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode';
+
+## Entity Cards
+
+These cards mimic the actual DataHub search results and entity previews:
+
+### Sample User Analytics Tables
+
+
+
+
+### Streaming Data Sources
+
+
+
+### Raw Data Storage
+
+
+
+## Lineage Flows
+
+These components show data pipeline relationships using actual DataHub styling:
+
+### User Metrics Pipeline (Basic)
+
+
+
+### User Metrics Pipeline (with Column-Level Lineage)
+
+
+
+### Troubleshooting Flow
+
+
+
+## Individual Lineage Nodes
+
+### Dataset Nodes (Rectangular) - With Tags & Glossary Terms
+
+
+
+
+
+
+
+### Data Job Nodes (Circular)
+
+
+
+
+
+
+
+## Updated Specifications
+
+The components now match DataHub V3 specifications:
+
+### Dataset Nodes (Rectangular)
+
+- **Width**: 320px (matches `LINEAGE_NODE_WIDTH`)
+- **Height**: 90px base + expandable columns section
+- **Border Radius**: 12px (DataHub V3 styling)
+- **Health Icons**: Actual SVG icons (✓ for Good, ⚠ for Warning/Critical)
+- **Expandable Columns**: Click + button to show/hide column details
+- **Column Types**: Color-coded icons (Aa for strings, 123 for numbers, etc.)
+- **Column Lineage**: → indicator shows columns with lineage connections
+- **Column-Level Lineage**: Visual connections between related columns across nodes (when all nodes expanded)
+- **Tags**: Color-coded dots with tag names (e.g., PII, Daily, Streaming)
+- **Glossary Terms**: Colored ribbon indicators with term names (e.g., User Metrics, Fact Table)
+
+### Data Job Nodes (Circular)
+
+- **Size**: 40px × 40px (matches `TRANSFORMATION_NODE_SIZE`)
+- **Border Radius**: 8px (slightly rounded for transformation nodes)
+- **Health Icons**: Positioned as badges in top-right corner
+- **Platform Logos**: 18px icons centered in the node
+- **No Expansion**: Data jobs don't have column-level details
+
+### Entity Cards
+
+- **Colors**: Synced with DataHub Alchemy design system
+- **Primary**: `#533FD1` (DataHub violet[500])
+- **Border**: `#E9EAEE` (DataHub gray[1400])
+- **Text**: `#374066` (DataHub gray[600])
+
+## Benefits of Using Actual DataHub Components
+
+1. **Pixel-Perfect Accuracy**: Matches exact DataHub V3 dimensions and styling
+2. **Auto-Sync**: Colors and design tokens automatically sync with DataHub updates
+3. **Real Platform Logos**: Uses the actual SVG logos from DataHub's platform library
+4. **Consistent Experience**: Users see the exact same UI they'll encounter in DataHub
+5. **Future-Proof**: Automatically stays in sync as DataHub UI evolves
+
+## Technical Implementation
+
+These components are now precisely calibrated to DataHub's actual specifications:
+
+- **DataHubEntityCard**: Based on `DefaultPreviewCard` with exact color tokens
+- **DataHubLineageNode**: Based on `LineageEntityNode` with V3 dimensions (320x90px)
+- **Platform Logos**: Uses the same SVG assets as production DataHub UI
+- **Design Tokens**: Automatically extracted from `datahub-web-react/src/alchemy-components/theme/`
+
+The styling is automatically synchronized at build time, ensuring tutorial components always match the production DataHub interface.
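For reference, embedding these components in a tutorial page looks roughly like the sketch below. The sample keys and spread props are placeholders — the real ones live in the component source under `docs-website/src/components/`:

```jsx
// Rough MDX usage sketch — the key names on SampleEntities / SampleLineageFlows
// are placeholders; check the component exports for the actual ones.
import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard';
import { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode';

<DataHubEntityCard {...SampleEntities.userAnalyticsTable} />
<DataHubLineageFlow {...SampleLineageFlows.userMetricsPipeline} />
```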
diff --git a/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss b/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss
index 69558d986ada9b..50e04c9389e01d 100644
--- a/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss
+++ b/docs-website/src/pages/docs/_components/FeatureCard/featurecard.module.scss
@@ -40,4 +40,3 @@
border-color: var(--ifm-color-primary);
}
}
-
diff --git a/docs-website/src/pages/docs/_components/FeatureCard/index.jsx b/docs-website/src/pages/docs/_components/FeatureCard/index.jsx
index 8fb24493e50e9a..6e10785053e69a 100644
--- a/docs-website/src/pages/docs/_components/FeatureCard/index.jsx
+++ b/docs-website/src/pages/docs/_components/FeatureCard/index.jsx
@@ -4,8 +4,8 @@ import styles from "./featurecard.module.scss";
import useBaseUrl from "@docusaurus/useBaseUrl";
import Link from "@docusaurus/Link";
-const FeatureCard = ({icon, title, description, to}) => {
-return (
+const FeatureCard = ({ icon, title, description, to }) => {
+ return (
diff --git a/docs-website/src/pages/docs/_components/FeatureCardSection/featurecardsection.module.scss b/docs-website/src/pages/docs/_components/FeatureCardSection/featurecardsection.module.scss
index 9e08c789c9068b..4fbbc4583d6629 100644
--- a/docs-website/src/pages/docs/_components/FeatureCardSection/featurecardsection.module.scss
+++ b/docs-website/src/pages/docs/_components/FeatureCardSection/featurecardsection.module.scss
@@ -1,5 +1,3 @@
-
-
.feature {
flex-direction: row;
padding: 0.675rem;
diff --git a/docs-website/src/pages/docs/_components/FeatureCardSection/index.jsx b/docs-website/src/pages/docs/_components/FeatureCardSection/index.jsx
index 0d9b56740c065f..8b79b3384806fe 100644
--- a/docs-website/src/pages/docs/_components/FeatureCardSection/index.jsx
+++ b/docs-website/src/pages/docs/_components/FeatureCardSection/index.jsx
@@ -1,5 +1,5 @@
import React from "react";
-import FeatureCard from '../FeatureCard'
+import FeatureCard from "../FeatureCard";
import {
EyeTwoTone,
HeartTwoTone,
@@ -9,49 +9,54 @@ import {
ProfileTwoTone,
} from "@ant-design/icons";
-const featureCardContent = [
-{
+const featureCardContent = [
+ {
title: "Data Discovery",
- description: "Search your entire data ecosystem, including dashboards, datasets, ML models, and raw files.",
+ description:
+ "Search your entire data ecosystem, including dashboards, datasets, ML models, and raw files.",
to: "docs/how/search",
- icon:
+ icon:
,
},
-{
+ {
title: "Data Governance",
description: "Define ownership and track PII.",
to: "https://medium.com/datahub-project/the-3-must-haves-of-metadata-management-part-2-35a649f2e2fb?utm_source=datahub&utm_medium=referral&utm_content=blog",
- icon:
+ icon:
,
},
-{
+ {
title: "Data Quality Control",
- description: "Improve data quality through metadata tests, assertions, data freshness checks, and data contracts.",
+ description:
+ "Improve data quality through metadata tests, assertions, data freshness checks, and data contracts.",
to: "https://www.acryldata.io/blog/data-contracts-in-datahub-combining-verifiability-with-holistic-data-management?utm_source=datahub&utm_medium=referral&utm_content=blog",
- icon:
+ icon:
,
},
-{
- title: "UI-based Ingestion",
- description: "Easily set up integrations in minutes using DataHub's intuitive UI-based ingestion feature.",
- to: "docs/ui-ingestion",
- icon:
-},
-{
- title: "APIs and SDKs",
- description: "For users who prefer programmatic control, DataHub offers a comprehensive set of APIs and SDKs.",
- to: "docs/api/datahub-apis",
- icon:
-},
-{
- title: "Vibrant Community",
- description: "Our community provides support through office hours, workshops, and a Slack channel.",
- to: "docs/slack",
- icon:
-}
-]
+ {
+ title: "UI-based Ingestion",
+ description:
+ "Easily set up integrations in minutes using DataHub's intuitive UI-based ingestion feature.",
+ to: "docs/ui-ingestion",
+ icon:
,
+ },
+ {
+ title: "APIs and SDKs",
+ description:
+ "For users who prefer programmatic control, DataHub offers a comprehensive set of APIs and SDKs.",
+ to: "docs/api/datahub-apis",
+ icon:
,
+ },
+ {
+ title: "Vibrant Community",
+ description:
+ "Our community provides support through office hours, workshops, and a Slack channel.",
+ to: "docs/slack",
+ icon:
,
+ },
+];
const FeatureCards = () => {
-return (
+ return (
-
+
{featureCardContent.map((props, idx) => (
diff --git a/docs-website/src/pages/docs/_components/FilterBar/index.jsx b/docs-website/src/pages/docs/_components/FilterBar/index.jsx
index 354ff1cfa7d3db..5cff20758fb1b8 100644
--- a/docs-website/src/pages/docs/_components/FilterBar/index.jsx
+++ b/docs-website/src/pages/docs/_components/FilterBar/index.jsx
@@ -155,7 +155,7 @@ function FilterBar({
onClick={removeFilters}
className={clsx(
"DocSearch-Reset-Button",
- styles.resetButton
+ styles.resetButton,
)}
style={{ marginRight: "1rem" }}
>
@@ -166,7 +166,7 @@ function FilterBar({
type="primary"
className={clsx(
"DocSearch-Filter-Button",
- styles.filterButton
+ styles.filterButton,
)}
>
Search
diff --git a/docs-website/src/pages/docs/_components/FilterPage/index.jsx b/docs-website/src/pages/docs/_components/FilterPage/index.jsx
index 7c754adb4ca815..5cb61e0859989f 100644
--- a/docs-website/src/pages/docs/_components/FilterPage/index.jsx
+++ b/docs-website/src/pages/docs/_components/FilterPage/index.jsx
@@ -10,7 +10,7 @@ export function FilterPage(
subtitle,
allowExclusivity = false,
useTags = false,
- useFilters = false
+ useFilters = false,
) {
const [textState, setTextState] = React.useState("");
const [filterState, setFilterState] = React.useState([]);
@@ -79,7 +79,7 @@ export function FilterPage(
item.title.toLowerCase().includes(textState.toLowerCase()) ||
item.description.toLowerCase().includes(textState.toLowerCase())
);
- }
+ },
);
return (
diff --git a/docs-website/src/pages/docs/_components/QuickstartCTA/index.jsx b/docs-website/src/pages/docs/_components/QuickstartCTA/index.jsx
index ef535c0e9d4b49..b866f6db54f29c 100644
--- a/docs-website/src/pages/docs/_components/QuickstartCTA/index.jsx
+++ b/docs-website/src/pages/docs/_components/QuickstartCTA/index.jsx
@@ -8,7 +8,9 @@ const QuickstartCTA = () => {
return (
Get Started Now
-
Run the following command to get started with DataHub.
+
+ Run the following command to get started with DataHub.
+
python3 -m pip install --upgrade pip wheel setuptools
@@ -17,14 +19,20 @@ const QuickstartCTA = () => {
-
+
Quickstart With Open Source
-
+
Learn About DataHub Cloud
-
-
+
+
);
};
diff --git a/docs-website/src/pages/docs/_components/QuickstartCTA/quickstartcta.module.scss b/docs-website/src/pages/docs/_components/QuickstartCTA/quickstartcta.module.scss
index 9a79151d24fcb8..6cf36c146deedf 100644
--- a/docs-website/src/pages/docs/_components/QuickstartCTA/quickstartcta.module.scss
+++ b/docs-website/src/pages/docs/_components/QuickstartCTA/quickstartcta.module.scss
@@ -1,4 +1,3 @@
-
.quickstart__content {
text-align: center;
padding: 2rem 0;
@@ -21,14 +20,10 @@
.quickstart__codeblock {
text-align: left;
padding: 0 20vh;
- }
-
+}
.quickstart__buttons {
gap: 1rem;
display: flex;
justify-content: center;
-
}
-
-
diff --git a/docs-website/src/pages/docs/_components/SearchBar/index.jsx b/docs-website/src/pages/docs/_components/SearchBar/index.jsx
index e3b61fb3cb4764..8a3e211c40dba6 100644
--- a/docs-website/src/pages/docs/_components/SearchBar/index.jsx
+++ b/docs-website/src/pages/docs/_components/SearchBar/index.jsx
@@ -36,8 +36,8 @@ function useDocumentsFoundPlural() {
'Pluralized label for "{count} documents found". Use as much plural forms (separated by "|") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)',
message: "One document found|{count} documents found",
},
- { count }
- )
+ { count },
+ ),
);
}
@@ -53,9 +53,12 @@ function useDocsSearchVersionsHelpers() {
});
// Set the value of a single select menu
- const setSearchVersion = (pluginId, searchVersion) => setSearchVersions((s) => ({ ...s, [pluginId]: searchVersion }));
+ const setSearchVersion = (pluginId, searchVersion) =>
+ setSearchVersions((s) => ({ ...s, [pluginId]: searchVersion }));
- const versioningEnabled = Object.values(allDocsData).some((docsData) => docsData.versions.length > 1);
+ const versioningEnabled = Object.values(allDocsData).some(
+ (docsData) => docsData.versions.length > 1,
+ );
return {
allDocsData,
@@ -67,23 +70,35 @@ function useDocsSearchVersionsHelpers() {
// We want to display one select per versioned docs plugin instance
const SearchVersionSelectList = ({ docsSearchVersionsHelpers }) => {
- const versionedPluginEntries = Object.entries(docsSearchVersionsHelpers.allDocsData)
+ const versionedPluginEntries = Object.entries(
+ docsSearchVersionsHelpers.allDocsData,
+ )
// Do not show a version select for unversioned docs plugin instances
.filter(([, docsData]) => docsData.versions.length > 1);
return (
<>
{versionedPluginEntries.map(([pluginId, docsData]) => {
- const labelPrefix = versionedPluginEntries.length > 1 ? `${pluginId}: ` : "";
+ const labelPrefix =
+ versionedPluginEntries.length > 1 ? `${pluginId}: ` : "";
return (
docsSearchVersionsHelpers.setSearchVersion(pluginId, e.target.value)}
+ onChange={(e) =>
+ docsSearchVersionsHelpers.setSearchVersion(
+ pluginId,
+ e.target.value,
+ )
+ }
defaultValue={docsSearchVersionsHelpers.searchVersions[pluginId]}
className={styles.searchVersionInput}
>
{docsData.versions.map((version, i) => (
-
+
))}
);
@@ -114,37 +129,43 @@ function SearchBar() {
hasMore: null,
loading: null,
};
- const [searchResultState, searchResultStateDispatcher] = useReducer((prevState, { type, value: state }) => {
- switch (type) {
- case "reset": {
- return initialSearchResultState;
- }
- case "loading": {
- return { ...prevState, loading: true };
- }
- case "update": {
- if (searchQuery !== state.query) {
- return prevState;
+ const [searchResultState, searchResultStateDispatcher] = useReducer(
+ (prevState, { type, value: state }) => {
+ switch (type) {
+ case "reset": {
+ return initialSearchResultState;
+ }
+ case "loading": {
+ return { ...prevState, loading: true };
}
+ case "update": {
+ if (searchQuery !== state.query) {
+ return prevState;
+ }
- return {
- ...state,
- items: state.lastPage === 0 ? state.items : prevState.items.concat(state.items),
- };
- }
- case "advance": {
- const hasMore = prevState.totalPages > prevState.lastPage + 1;
-
- return {
- ...prevState,
- lastPage: hasMore ? prevState.lastPage + 1 : prevState.lastPage,
- hasMore,
- };
+ return {
+ ...state,
+ items:
+ state.lastPage === 0
+ ? state.items
+ : prevState.items.concat(state.items),
+ };
+ }
+ case "advance": {
+ const hasMore = prevState.totalPages > prevState.lastPage + 1;
+
+ return {
+ ...prevState,
+ lastPage: hasMore ? prevState.lastPage + 1 : prevState.lastPage,
+ hasMore,
+ };
+ }
+ default:
+ return prevState;
}
- default:
- return prevState;
- }
- }, initialSearchResultState);
+ },
+ initialSearchResultState,
+ );
const algoliaClient = algoliaSearch(appId, apiKey);
const algoliaHelper = algoliaSearchHelper(algoliaClient, indexName, {
@@ -153,43 +174,57 @@ function SearchBar() {
disjunctiveFacets: ["language", "docusaurus_tag"],
});
- algoliaHelper.on("result", ({ results: { query, hits, page, nbHits, nbPages } }) => {
- if (query === "" || !(hits instanceof Array)) {
- searchResultStateDispatcher({ type: "reset" });
- return;
- }
+ algoliaHelper.on(
+ "result",
+ ({ results: { query, hits, page, nbHits, nbPages } }) => {
+ if (query === "" || !(hits instanceof Array)) {
+ searchResultStateDispatcher({ type: "reset" });
+ return;
+ }
- const sanitizeValue = (value) => {
- return value.replace(/algolia-docsearch-suggestion--highlight/g, "search-result-match");
- };
+ const sanitizeValue = (value) => {
+ return value.replace(
+ /algolia-docsearch-suggestion--highlight/g,
+ "search-result-match",
+ );
+ };
- const items = hits.map(({ url, _highlightResult: { hierarchy }, _snippetResult: snippet = {} }) => {
- const { pathname, hash } = new URL(url);
- const titles = Object.keys(hierarchy).map((key) => {
- return sanitizeValue(hierarchy[key].value);
+ const items = hits.map(
+ ({
+ url,
+ _highlightResult: { hierarchy },
+ _snippetResult: snippet = {},
+ }) => {
+ const { pathname, hash } = new URL(url);
+ const titles = Object.keys(hierarchy).map((key) => {
+ return sanitizeValue(hierarchy[key].value);
+ });
+
+ return {
+ title: titles.pop(),
+ url: pathname + hash,
+ summary: snippet.content
+ ? `${sanitizeValue(snippet.content.value)}...`
+ : "",
+ breadcrumbs: titles,
+ };
+ },
+ );
+
+ searchResultStateDispatcher({
+ type: "update",
+ value: {
+ items,
+ query,
+ totalResults: nbHits,
+ totalPages: nbPages,
+ lastPage: page,
+ hasMore: nbPages > page + 1,
+ loading: false,
+ },
});
-
- return {
- title: titles.pop(),
- url: pathname + hash,
- summary: snippet.content ? `${sanitizeValue(snippet.content.value)}...` : "",
- breadcrumbs: titles,
- };
- });
-
- searchResultStateDispatcher({
- type: "update",
- value: {
- items,
- query,
- totalResults: nbHits,
- totalPages: nbPages,
- lastPage: page,
- hasMore: nbPages > page + 1,
- loading: false,
- },
- });
- });
+ },
+ );
const [loaderRef, setLoaderRef] = useState(null);
const prevY = useRef(0);
@@ -208,8 +243,8 @@ function SearchBar() {
prevY.current = currentY;
},
- { threshold: 1 }
- )
+ { threshold: 1 },
+ ),
);
const getTitle = () =>
@@ -222,7 +257,7 @@ function SearchBar() {
},
{
query: searchQuery,
- }
+ },
)
: translate({
id: "theme.SearchPage.emptyResultsTitle",
@@ -234,9 +269,14 @@ function SearchBar() {
algoliaHelper.addDisjunctiveFacetRefinement("docusaurus_tag", "default");
algoliaHelper.addDisjunctiveFacetRefinement("language", currentLocale);
- Object.entries(docsSearchVersionsHelpers.searchVersions).forEach(([pluginId, searchVersion]) => {
- algoliaHelper.addDisjunctiveFacetRefinement("docusaurus_tag", `docs-${pluginId}-${searchVersion}`);
- });
+ Object.entries(docsSearchVersionsHelpers.searchVersions).forEach(
+ ([pluginId, searchVersion]) => {
+ algoliaHelper.addDisjunctiveFacetRefinement(
+ "docusaurus_tag",
+ `docs-${pluginId}-${searchVersion}`,
+ );
+ },
+ );
algoliaHelper.setQuery(searchQuery).setPage(page).search();
});
@@ -275,7 +315,10 @@ function SearchBar() {
-
- {docsSearchVersionsHelpers.versioningEnabled &&
}
+ {docsSearchVersionsHelpers.versioningEnabled && (
+
+ )}
+
+
+ {!!searchResultState.totalResults &&
+ documentsFoundPlural(searchResultState.totalResults)}
-
{!!searchResultState.totalResults && documentsFoundPlural(searchResultState.totalResults)}
{searchResultState.items.length > 0 ? (
- {searchResultState.items.map(({ title, url, summary, breadcrumbs }, i) => (
-
-
-
-
-
- {breadcrumbs.length > 0 && (
-
-
- {breadcrumbs.map((html, index) => (
-
- ))}
-
-
- )}
-
- {summary && (
-
- )}
-
- ))}
+ {searchResultState.items.map(
+ ({ title, url, summary, breadcrumbs }, i) => (
+
+
+
+
+
+ {breadcrumbs.length > 0 && (
+
+
+ {breadcrumbs.map((html, index) => (
+
+ ))}
+
+
+ )}
+
+ {summary && (
+
+ )}
+
+ ),
+ )}
) : (
[
searchQuery && !searchResultState.loading && (
-
+
No results were found
),
- !!searchResultState.loading &&
,
+ !!searchResultState.loading && (
+
+ ),
]
)}
{searchResultState.hasMore && (
-
+
Fetching new results...
diff --git a/docs-website/src/pages/docs/index.js b/docs-website/src/pages/docs/index.js
index 7ffa1a275b5ac1..29042b67d28de9 100644
--- a/docs-website/src/pages/docs/index.js
+++ b/docs-website/src/pages/docs/index.js
@@ -1,8 +1,8 @@
-import React from 'react';
-import { Redirect } from '@docusaurus/router';
+import React from "react";
+import { Redirect } from "@docusaurus/router";
const Home = () => {
return
;
};
-export default Home;
\ No newline at end of file
+export default Home;
diff --git a/docs-website/src/pages/integrations.jsx b/docs-website/src/pages/integrations.jsx
index 569823d0f08267..204ae8aef97a9f 100644
--- a/docs-website/src/pages/integrations.jsx
+++ b/docs-website/src/pages/integrations.jsx
@@ -16,7 +16,7 @@ function DataProviderComponent() {
"DataHub Integrations",
"Services that integrate with DataHub",
false,
- true
+ true,
);
}
diff --git a/docs-website/src/styles/config-table.scss b/docs-website/src/styles/config-table.scss
index dfda3939487815..486fa1f95ce1e1 100644
--- a/docs-website/src/styles/config-table.scss
+++ b/docs-website/src/styles/config-table.scss
@@ -2,48 +2,48 @@
// config tables for our sources.
.config-table {
- a {
- // Forcibly allow links to wrap to avoid a horizontal scrollbar.
- word-wrap: break-word;
- }
-
- .path-line {
- max-width: 330px;
-
- // When the line wraps, make sure that it still looks ok.
- line-height: 1.25;
- }
- .path-prefix {
- font-size: 0.9rem;
- font-style: italic;
- margin-right: 1px;
-
- // This tells the engine to prefer putting line breaks
- // after the path prefix.
- // See https://stackoverflow.com/a/24357132.
- display: inline-block;
- }
- .path-main {
- font-weight: 600;
- }
-
- // .type-name-line {
- // margin-bottom: 5px;
- // }
-
- .type-name {
- font-size: 0.9rem;
- color: var(--ifm-font-color-secondary);
- }
-
- // .default-line {
- // // margin-top: 10px;
- // }
- .default-line-with-docs {
- margin-top: 7px;
- }
-
- .default-value {
- font: var(--ifm-code-font-size) var(--ifm-font-family-monospace);
- }
+ a {
+ // Forcibly allow links to wrap to avoid a horizontal scrollbar.
+ word-wrap: break-word;
+ }
+
+ .path-line {
+ max-width: 330px;
+
+ // When the line wraps, make sure that it still looks ok.
+ line-height: 1.25;
+ }
+ .path-prefix {
+ font-size: 0.9rem;
+ font-style: italic;
+ margin-right: 1px;
+
+ // This tells the engine to prefer putting line breaks
+ // after the path prefix.
+ // See https://stackoverflow.com/a/24357132.
+ display: inline-block;
+ }
+ .path-main {
+ font-weight: 600;
+ }
+
+ // .type-name-line {
+ // margin-bottom: 5px;
+ // }
+
+ .type-name {
+ font-size: 0.9rem;
+ color: var(--ifm-font-color-secondary);
+ }
+
+ // .default-line {
+ // // margin-top: 10px;
+ // }
+ .default-line-with-docs {
+ margin-top: 7px;
+ }
+
+ .default-value {
+ font: var(--ifm-code-font-size) var(--ifm-font-family-monospace);
+ }
}
diff --git a/docs-website/src/styles/global.scss b/docs-website/src/styles/global.scss
index 185b57b0389f9d..2c70d80e85900e 100644
--- a/docs-website/src/styles/global.scss
+++ b/docs-website/src/styles/global.scss
@@ -17,9 +17,12 @@
--ifm-code-font-size: 0.9em;
--ifm-heading-color: #000000;
--ifm-heading-font-family: "Manrope", sans-serif;
- --ifm-font-family-base: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
- "Segoe UI Emoji", "Segoe UI Symbol";
- --ifm-font-family-monospace: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+ --ifm-font-family-base:
+ system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial,
+ sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
+ --ifm-font-family-monospace:
+ SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New",
+ monospace;
/* Buttons */
--ifm-button-border-radius: 1000em;
@@ -83,7 +86,7 @@ main {
}
.markdown,
-main>h1 {
+main > h1 {
margin-top: 1rem;
}
@@ -128,9 +131,9 @@ div[class^="announcementBar"] {
align-items: center;
background: linear-gradient(
75deg,
- #1c1e21 0%, /* Gray 800 */
- #333c48 50%, /* Slightly lighter - blue streak */
- #1c1e21 100% /* Gray 800 */
+ #1c1e21 0%,
+ /* Gray 800 */ #333c48 50%,
+ /* Slightly lighter - blue streak */ #1c1e21 100% /* Gray 800 */
);
background-size: 400% 400%;
animation: shimmer 10s ease-in-out infinite;
@@ -160,9 +163,14 @@ div[class^="announcementBar"] {
font-size: 0.8rem;
}
&::before {
- content: '';
+ content: "";
z-index: 0;
- background: conic-gradient(from var(--electric-wire-angle), transparent, rgba(255, 255, 255, 0.751) 5%, transparent 20%);
+ background: conic-gradient(
+ from var(--electric-wire-angle),
+ transparent,
+ rgba(255, 255, 255, 0.751) 5%,
+ transparent 20%
+ );
position: absolute;
top: 0;
left: 0;
@@ -172,7 +180,7 @@ div[class^="announcementBar"] {
animation: electric-wire 3s linear infinite;
}
&::after {
- content: '';
+ content: "";
z-index: 0;
height: calc(100% - 2px);
width: calc(100% - 2px);
@@ -208,7 +216,7 @@ div[class^="announcementBar"] {
}
@property --electric-wire-angle {
- syntax: '<angle>';
+ syntax: "<angle>";
initial-value: 1deg;
inherits: false;
}
@@ -224,7 +232,9 @@ div[class^="announcementBar"] {
/* Add padding to body to prevent content from hiding under fixed elements */
body {
- padding-top: calc(var(--docusaurus-announcement-bar-height) + var(--ifm-navbar-height));
+ padding-top: calc(
+ var(--docusaurus-announcement-bar-height) + var(--ifm-navbar-height)
+ );
}
/** Navbar */
@@ -250,7 +260,7 @@ body {
border-bottom: 2px solid transparent;
}
- .dropdown>.navbar__link:after {
+ .dropdown > .navbar__link:after {
top: -1px;
border-width: 0.3em 0.3em 0;
margin-left: 0.4em;
@@ -402,21 +412,30 @@ body {
.menu__link {
font-weight: 400;
- padding: calc(var(--ifm-menu-link-padding-vertical) + 0.2rem) calc(var(--ifm-menu-link-padding-horizontal) + 0.2rem) calc(var(--ifm-menu-link-padding-vertical) + 0.2rem) calc(var(--ifm-menu-link-padding-horizontal) + 1rem);
+ padding: calc(var(--ifm-menu-link-padding-vertical) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-horizontal) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-vertical) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-horizontal) + 1rem);
}
.menu__link--active {
font-weight: 400;
- padding: calc(var(--ifm-menu-link-padding-vertical) + 0.2rem) calc(var(--ifm-menu-link-padding-horizontal) + 0.2rem) calc(var(--ifm-menu-link-padding-vertical) + 0.2rem) calc(var(--ifm-menu-link-padding-horizontal) + 1rem);
+ padding: calc(var(--ifm-menu-link-padding-vertical) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-horizontal) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-vertical) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-horizontal) + 1rem);
}
- .theme-doc-sidebar-item-category-level-1>div>a:first-child {
+ .theme-doc-sidebar-item-category-level-1 > div > a:first-child {
font-weight: 400;
color: var(--ifm-menu-color);
- padding: calc(var(--ifm-menu-link-padding-vertical) + 0.2rem) calc(var(--ifm-menu-link-padding-horizontal) + 0.2rem) calc(var(--ifm-menu-link-padding-vertical) + 0.2rem) calc(var(--ifm-menu-link-padding-horizontal) + 1rem);
+ padding: calc(var(--ifm-menu-link-padding-vertical) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-horizontal) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-vertical) + 0.2rem)
+ calc(var(--ifm-menu-link-padding-horizontal) + 1rem);
}
- .theme-doc-sidebar-item-category-level-1>div>a.menu__link--active {
+ .theme-doc-sidebar-item-category-level-1 > div > a.menu__link--active {
color: var(--ifm-menu-color);
font-weight: 400;
}
@@ -468,7 +487,11 @@ body {
--docsearch-hit-background: var(--ifm-color-emphasis-100);
/* Footer */
--docsearch-footer-background: var(--ifm-background-surface-color);
- --docsearch-key-gradient: linear-gradient(-26.5deg, var(--ifm-color-emphasis-200) 0%, var(--ifm-color-emphasis-100) 100%);
+ --docsearch-key-gradient: linear-gradient(
+ -26.5deg,
+ var(--ifm-color-emphasis-200) 0%,
+ var(--ifm-color-emphasis-100) 100%
+ );
}
.comapny__logos {
@@ -494,22 +517,21 @@ body {
flex-grow: 1;
justify-content: space-around;
-
.more_link {
font-size: 1.25rem;
color: #bbb;
font-weight: 600;
text-decoration: none;
position: relative;
- top: -.4rem;
+ top: -0.4rem;
}
}
.company_logo {
max-width: 80px;
max-height: 36px;
filter: brightness(0) grayscale(1);
- opacity: .4;
- transition: opacity .3s ease-in-out;
+ opacity: 0.4;
+ transition: opacity 0.3s ease-in-out;
&:hover {
opacity: 0.6;
}
@@ -519,7 +541,7 @@ body {
@keyframes openModal {
0% {
opacity: 0;
- scale: .8;
+ scale: 0.8;
}
100% {
opacity: 1;
@@ -529,16 +551,17 @@ body {
.tourModal {
position: fixed;
- top: 0; left: 0;
+ top: 0;
+ left: 0;
height: 100%;
width: 100%;
- background-color: #000000CC;
+ background-color: #000000cc;
z-index: 1000;
display: flex;
justify-content: center;
align-items: center;
- animation: openModal .3s ease-out;
+ animation: openModal 0.3s ease-out;
iframe {
border-radius: 24px;
@@ -556,7 +579,7 @@ body {
span {
font-size: 36px;
path {
- color: #CCC;
+ color: #ccc;
}
}
}
@@ -569,11 +592,11 @@ body {
min-width: 0;
.text {
padding-right: 0;
- font-size: .9rem;
+ font-size: 0.9rem;
line-height: 1.2rem;
text-align: center;
margin-bottom: 12px;
- opacity: .75;
+ opacity: 0.75;
br {
display: none;
}
@@ -657,10 +680,10 @@ body {
@keyframes expand-on-scroll {
from {
- transform: scaleY(0);
+ transform: scaleY(0);
}
to {
- transform: scaleY(1);
+ transform: scaleY(1);
}
}
diff --git a/docs-website/src/styles/sphinx.scss b/docs-website/src/styles/sphinx.scss
index 8343581077f043..e7725429f35420 100644
--- a/docs-website/src/styles/sphinx.scss
+++ b/docs-website/src/styles/sphinx.scss
@@ -13,7 +13,9 @@
padding: 0;
margin: 0;
}
- .hash-link { display: none; }
+ .hash-link {
+ display: none;
+ }
}
.h3-block {
@@ -29,12 +31,28 @@
font-size: 1rem;
border-bottom: 1px solid var(--ifm-hr-border-color);
- .class-text { color: #aaa; margin-right: 0.3rem; }
- .class-owner { color: var(--ifm-font-color-secondary); }
- .class-name { color: var(--ifm-color-primary); font-weight: 700; }
- .arg-name { color: var(--ifm-font-color-base); font-weight: 600; }
- .arg-type { color: gray; }
- .arg-default { color: #999; font-weight: 600; }
+ .class-text {
+ color: #aaa;
+ margin-right: 0.3rem;
+ }
+ .class-owner {
+ color: var(--ifm-font-color-secondary);
+ }
+ .class-name {
+ color: var(--ifm-color-primary);
+ font-weight: 700;
+ }
+ .arg-name {
+ color: var(--ifm-font-color-base);
+ font-weight: 600;
+ }
+ .arg-type {
+ color: gray;
+ }
+ .arg-default {
+ color: #999;
+ font-weight: 600;
+ }
}
p {
@@ -42,26 +60,25 @@
}
h4 {
- background: #FAFAFA;
+ background: #fafafa;
font-family: var(--ifm-font-family-monospace);
padding: 1rem 1.5rem;
margin: 0;
font-weight: 600;
border-top: 1px solid var(--ifm-hr-border-color);
border-bottom: 1px solid var(--ifm-hr-border-color);
- color: #444;
-
+ color: #444;
+
em {
color: gray;
- font-style: normal;
+ font-style: normal;
}
}
-
ul {
margin: 0;
padding: 1rem 2rem;
- background: #FDFDFD;
+ background: #fdfdfd;
li {
display: flex;
@@ -78,7 +95,7 @@
font-family: var(--ifm-font-family-monospace);
font-weight: 400;
color: #444;
- background: #F5F5F5;
+ background: #f5f5f5;
padding: 0.15rem 0.4rem;
border-radius: 4px;
margin-left: 0.5rem;
@@ -87,7 +104,8 @@
}
// pre code block (full examples)
- pre code, pre code * {
+ pre code,
+ pre code * {
color: var(--ifm-font-color-secondary);
padding: 1rem;
margin: 0;
@@ -96,7 +114,7 @@
}
code {
- background-color: #F5F5F5;
+ background-color: #f5f5f5;
padding: 0.1rem 0.4rem;
margin: auto 0.2rem;
font-size: 0.9em;
diff --git a/docs-website/src/theme/DocItem/Footer/index.js b/docs-website/src/theme/DocItem/Footer/index.js
index d6bc2c72914d50..9c50f920366ba3 100644
--- a/docs-website/src/theme/DocItem/Footer/index.js
+++ b/docs-website/src/theme/DocItem/Footer/index.js
@@ -11,21 +11,35 @@ import SlackUtm from "../../../components/SlackUtm";
function TagsRow(props) {
return (
-
+
);
}
-function EditMetaRow({ editUrl, lastUpdatedAt, lastUpdatedBy, formattedLastUpdatedAt }) {
+function EditMetaRow({
+ editUrl,
+ lastUpdatedAt,
+ lastUpdatedBy,
+ formattedLastUpdatedAt,
+}) {
return (
{editUrl && }
{(lastUpdatedAt || lastUpdatedBy) && (
-
+
)}
@@ -33,7 +47,14 @@ function EditMetaRow({ editUrl, lastUpdatedAt, lastUpdatedBy, formattedLastUpdat
}
export default function DocItemFooter() {
const { metadata } = useDoc();
- const { editUrl, lastUpdatedAt, formattedLastUpdatedAt, lastUpdatedBy, tags, unversionedId } = metadata;
+ const {
+ editUrl,
+ lastUpdatedAt,
+ formattedLastUpdatedAt,
+ lastUpdatedBy,
+ tags,
+ unversionedId,
+ } = metadata;
const canDisplayTagsRow = tags.length > 0;
const canDisplayEditMetaRow = !!(editUrl || lastUpdatedAt || lastUpdatedBy);
const canDisplayFooter = canDisplayTagsRow || canDisplayEditMetaRow;
@@ -42,8 +63,10 @@ export default function DocItemFooter() {
}
return (
<>
-
-
+
+
{canDisplayTagsRow && }
{canDisplayEditMetaRow && (
version.docs.find((doc) => doc.id === version.mainDocId);
+const getVersionMainDoc = (version) =>
+ version.docs.find((doc) => doc.id === version.mainDocId);
export default function DocsVersionDropdownNavbarItem({
mobile,
@@ -24,7 +28,9 @@ export default function DocsVersionDropdownNavbarItem({
const versions = useVersions(docsPluginId);
const { savePreferredVersionName } = useDocsPreferredVersion(docsPluginId);
const versionLinks = versions.map((version) => {
- const versionDoc = activeDocContext.alternateDocVersions[version.name] ?? getVersionMainDoc(version);
+ const versionDoc =
+ activeDocContext.alternateDocVersions[version.name] ??
+ getVersionMainDoc(version);
return {
label: version.label,
to: `${versionDoc.path}${search}${hash}`,
@@ -33,18 +39,25 @@ export default function DocsVersionDropdownNavbarItem({
};
});
-
- const items = [...dropdownItemsBefore, ...versionLinks, ...dropdownItemsAfter];
+ const items = [
+ ...dropdownItemsBefore,
+ ...versionLinks,
+ ...dropdownItemsAfter,
+ ];
const dropdownVersion = useDocsVersionCandidates(docsPluginId)[0];
const dropdownLabel =
mobile && items.length > 1
? translate({
id: "theme.navbar.mobileVersionsDropdown.label",
message: "Versions",
- description: "The label for the navbar versions dropdown on mobile view",
+ description:
+ "The label for the navbar versions dropdown on mobile view",
})
: dropdownVersion.label;
- const dropdownTo = mobile && items.length > 1 ? undefined : getVersionMainDoc(dropdownVersion).path;
+ const dropdownTo =
+ mobile && items.length > 1
+ ? undefined
+ : getVersionMainDoc(dropdownVersion).path;
if (items.length <= 1) {
return (
@@ -68,4 +81,4 @@ export default function DocsVersionDropdownNavbarItem({
isActive={dropdownActiveClassDisabled ? () => false : undefined}
/>
);
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/theme/Root.js b/docs-website/src/theme/Root.js
index 894bf2be0e289d..bc40584b05583a 100644
--- a/docs-website/src/theme/Root.js
+++ b/docs-website/src/theme/Root.js
@@ -1,5 +1,5 @@
-import React, { useEffect } from 'react';
-import { useLocation } from '@docusaurus/router';
+import React, { useEffect } from "react";
+import { useLocation } from "@docusaurus/router";
export default function Root({ children }) {
const location = useLocation();
diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock
index c0ac2729adebab..25597f2e3a9fd2 100644
--- a/docs-website/yarn.lock
+++ b/docs-website/yarn.lock
@@ -2885,6 +2885,72 @@
rc-resize-observer "^1.3.1"
rc-util "^5.38.0"
+"@reactflow/background@11.3.14":
+ version "11.3.14"
+ resolved "https://registry.yarnpkg.com/@reactflow/background/-/background-11.3.14.tgz#778ca30174f3de77fc321459ab3789e66e71a699"
+ integrity sha512-Gewd7blEVT5Lh6jqrvOgd4G6Qk17eGKQfsDXgyRSqM+CTwDqRldG2LsWN4sNeno6sbqVIC2fZ+rAUBFA9ZEUDA==
+ dependencies:
+ "@reactflow/core" "11.11.4"
+ classcat "^5.0.3"
+ zustand "^4.4.1"
+
+"@reactflow/controls@11.2.14":
+ version "11.2.14"
+ resolved "https://registry.yarnpkg.com/@reactflow/controls/-/controls-11.2.14.tgz#508ed2c40d23341b3b0919dd11e76fd49cf850c7"
+ integrity sha512-MiJp5VldFD7FrqaBNIrQ85dxChrG6ivuZ+dcFhPQUwOK3HfYgX2RHdBua+gx+40p5Vw5It3dVNp/my4Z3jF0dw==
+ dependencies:
+ "@reactflow/core" "11.11.4"
+ classcat "^5.0.3"
+ zustand "^4.4.1"
+
+"@reactflow/core@11.11.4":
+ version "11.11.4"
+ resolved "https://registry.yarnpkg.com/@reactflow/core/-/core-11.11.4.tgz#89bd86d1862aa1416f3f49926cede7e8c2aab6a7"
+ integrity sha512-H4vODklsjAq3AMq6Np4LE12i1I4Ta9PrDHuBR9GmL8uzTt2l2jh4CiQbEMpvMDcp7xi4be0hgXj+Ysodde/i7Q==
+ dependencies:
+ "@types/d3" "^7.4.0"
+ "@types/d3-drag" "^3.0.1"
+ "@types/d3-selection" "^3.0.3"
+ "@types/d3-zoom" "^3.0.1"
+ classcat "^5.0.3"
+ d3-drag "^3.0.0"
+ d3-selection "^3.0.0"
+ d3-zoom "^3.0.0"
+ zustand "^4.4.1"
+
+"@reactflow/minimap@11.7.14":
+ version "11.7.14"
+ resolved "https://registry.yarnpkg.com/@reactflow/minimap/-/minimap-11.7.14.tgz#298d7a63cb1da06b2518c99744f716560c88ca73"
+ integrity sha512-mpwLKKrEAofgFJdkhwR5UQ1JYWlcAAL/ZU/bctBkuNTT1yqV+y0buoNVImsRehVYhJwffSWeSHaBR5/GJjlCSQ==
+ dependencies:
+ "@reactflow/core" "11.11.4"
+ "@types/d3-selection" "^3.0.3"
+ "@types/d3-zoom" "^3.0.1"
+ classcat "^5.0.3"
+ d3-selection "^3.0.0"
+ d3-zoom "^3.0.0"
+ zustand "^4.4.1"
+
+"@reactflow/node-resizer@2.2.14":
+ version "2.2.14"
+ resolved "https://registry.yarnpkg.com/@reactflow/node-resizer/-/node-resizer-2.2.14.tgz#1810c0ce51aeb936f179466a6660d1e02c7a77a8"
+ integrity sha512-fwqnks83jUlYr6OHcdFEedumWKChTHRGw/kbCxj0oqBd+ekfs+SIp4ddyNU0pdx96JIm5iNFS0oNrmEiJbbSaA==
+ dependencies:
+ "@reactflow/core" "11.11.4"
+ classcat "^5.0.4"
+ d3-drag "^3.0.0"
+ d3-selection "^3.0.0"
+ zustand "^4.4.1"
+
+"@reactflow/node-toolbar@1.3.14":
+ version "1.3.14"
+ resolved "https://registry.yarnpkg.com/@reactflow/node-toolbar/-/node-toolbar-1.3.14.tgz#c6ffc76f82acacdce654f2160dc9852162d6e7c9"
+ integrity sha512-rbynXQnH/xFNu4P9H+hVqlEUafDCkEoCy0Dg9mG22Sg+rY/0ck6KkrAQrYrTgXusd+cEJOMK0uOOFCK2/5rSGQ==
+ dependencies:
+ "@reactflow/core" "11.11.4"
+ classcat "^5.0.3"
+ zustand "^4.4.1"
+
"@servicebell/widget@^0.1.6":
version "0.1.6"
resolved "https://registry.yarnpkg.com/@servicebell/widget/-/widget-0.1.6.tgz#04672a7e7b14ff7025ec83fd740373345c359d74"
@@ -3237,6 +3303,216 @@
dependencies:
"@types/node" "*"
+"@types/d3-array@*":
+ version "3.2.2"
+ resolved "https://registry.yarnpkg.com/@types/d3-array/-/d3-array-3.2.2.tgz#e02151464d02d4a1b44646d0fcdb93faf88fde8c"
+ integrity sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==
+
+"@types/d3-axis@*":
+ version "3.0.6"
+ resolved "https://registry.yarnpkg.com/@types/d3-axis/-/d3-axis-3.0.6.tgz#e760e5765b8188b1defa32bc8bb6062f81e4c795"
+ integrity sha512-pYeijfZuBd87T0hGn0FO1vQ/cgLk6E1ALJjfkC0oJ8cbwkZl3TpgS8bVBLZN+2jjGgg38epgxb2zmoGtSfvgMw==
+ dependencies:
+ "@types/d3-selection" "*"
+
+"@types/d3-brush@*":
+ version "3.0.6"
+ resolved "https://registry.yarnpkg.com/@types/d3-brush/-/d3-brush-3.0.6.tgz#c2f4362b045d472e1b186cdbec329ba52bdaee6c"
+ integrity sha512-nH60IZNNxEcrh6L1ZSMNA28rj27ut/2ZmI3r96Zd+1jrZD++zD3LsMIjWlvg4AYrHn/Pqz4CF3veCxGjtbqt7A==
+ dependencies:
+ "@types/d3-selection" "*"
+
+"@types/d3-chord@*":
+ version "3.0.6"
+ resolved "https://registry.yarnpkg.com/@types/d3-chord/-/d3-chord-3.0.6.tgz#1706ca40cf7ea59a0add8f4456efff8f8775793d"
+ integrity sha512-LFYWWd8nwfwEmTZG9PfQxd17HbNPksHBiJHaKuY1XeqscXacsS2tyoo6OdRsjf+NQYeB6XrNL3a25E3gH69lcg==
+
+"@types/d3-color@*":
+ version "3.1.3"
+ resolved "https://registry.yarnpkg.com/@types/d3-color/-/d3-color-3.1.3.tgz#368c961a18de721da8200e80bf3943fb53136af2"
+ integrity sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==
+
+"@types/d3-contour@*":
+ version "3.0.6"
+ resolved "https://registry.yarnpkg.com/@types/d3-contour/-/d3-contour-3.0.6.tgz#9ada3fa9c4d00e3a5093fed0356c7ab929604231"
+ integrity sha512-BjzLgXGnCWjUSYGfH1cpdo41/hgdWETu4YxpezoztawmqsvCeep+8QGfiY6YbDvfgHz/DkjeIkkZVJavB4a3rg==
+ dependencies:
+ "@types/d3-array" "*"
+ "@types/geojson" "*"
+
+"@types/d3-delaunay@*":
+ version "6.0.4"
+ resolved "https://registry.yarnpkg.com/@types/d3-delaunay/-/d3-delaunay-6.0.4.tgz#185c1a80cc807fdda2a3fe960f7c11c4a27952e1"
+ integrity sha512-ZMaSKu4THYCU6sV64Lhg6qjf1orxBthaC161plr5KuPHo3CNm8DTHiLw/5Eq2b6TsNP0W0iJrUOFscY6Q450Hw==
+
+"@types/d3-dispatch@*":
+ version "3.0.7"
+ resolved "https://registry.yarnpkg.com/@types/d3-dispatch/-/d3-dispatch-3.0.7.tgz#ef004d8a128046cfce434d17182f834e44ef95b2"
+ integrity sha512-5o9OIAdKkhN1QItV2oqaE5KMIiXAvDWBDPrD85e58Qlz1c1kI/J0NcqbEG88CoTwJrYe7ntUCVfeUl2UJKbWgA==
+
+"@types/d3-drag@*", "@types/d3-drag@^3.0.1":
+ version "3.0.7"
+ resolved "https://registry.yarnpkg.com/@types/d3-drag/-/d3-drag-3.0.7.tgz#b13aba8b2442b4068c9a9e6d1d82f8bcea77fc02"
+ integrity sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==
+ dependencies:
+ "@types/d3-selection" "*"
+
+"@types/d3-dsv@*":
+ version "3.0.7"
+ resolved "https://registry.yarnpkg.com/@types/d3-dsv/-/d3-dsv-3.0.7.tgz#0a351f996dc99b37f4fa58b492c2d1c04e3dac17"
+ integrity sha512-n6QBF9/+XASqcKK6waudgL0pf/S5XHPPI8APyMLLUHd8NqouBGLsU8MgtO7NINGtPBtk9Kko/W4ea0oAspwh9g==
+
+"@types/d3-ease@*":
+ version "3.0.2"
+ resolved "https://registry.yarnpkg.com/@types/d3-ease/-/d3-ease-3.0.2.tgz#e28db1bfbfa617076f7770dd1d9a48eaa3b6c51b"
+ integrity sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==
+
+"@types/d3-fetch@*":
+ version "3.0.7"
+ resolved "https://registry.yarnpkg.com/@types/d3-fetch/-/d3-fetch-3.0.7.tgz#c04a2b4f23181aa376f30af0283dbc7b3b569980"
+ integrity sha512-fTAfNmxSb9SOWNB9IoG5c8Hg6R+AzUHDRlsXsDZsNp6sxAEOP0tkP3gKkNSO/qmHPoBFTxNrjDprVHDQDvo5aA==
+ dependencies:
+ "@types/d3-dsv" "*"
+
+"@types/d3-force@*":
+ version "3.0.10"
+ resolved "https://registry.yarnpkg.com/@types/d3-force/-/d3-force-3.0.10.tgz#6dc8fc6e1f35704f3b057090beeeb7ac674bff1a"
+ integrity sha512-ZYeSaCF3p73RdOKcjj+swRlZfnYpK1EbaDiYICEEp5Q6sUiqFaFQ9qgoshp5CzIyyb/yD09kD9o2zEltCexlgw==
+
+"@types/d3-format@*":
+ version "3.0.4"
+ resolved "https://registry.yarnpkg.com/@types/d3-format/-/d3-format-3.0.4.tgz#b1e4465644ddb3fdf3a263febb240a6cd616de90"
+ integrity sha512-fALi2aI6shfg7vM5KiR1wNJnZ7r6UuggVqtDA+xiEdPZQwy/trcQaHnwShLuLdta2rTymCNpxYTiMZX/e09F4g==
+
+"@types/d3-geo@*":
+ version "3.1.0"
+ resolved "https://registry.yarnpkg.com/@types/d3-geo/-/d3-geo-3.1.0.tgz#b9e56a079449174f0a2c8684a9a4df3f60522440"
+ integrity sha512-856sckF0oP/diXtS4jNsiQw/UuK5fQG8l/a9VVLeSouf1/PPbBE1i1W852zVwKwYCBkFJJB7nCFTbk6UMEXBOQ==
+ dependencies:
+ "@types/geojson" "*"
+
+"@types/d3-hierarchy@*":
+ version "3.1.7"
+ resolved "https://registry.yarnpkg.com/@types/d3-hierarchy/-/d3-hierarchy-3.1.7.tgz#6023fb3b2d463229f2d680f9ac4b47466f71f17b"
+ integrity sha512-tJFtNoYBtRtkNysX1Xq4sxtjK8YgoWUNpIiUee0/jHGRwqvzYxkq0hGVbbOGSz+JgFxxRu4K8nb3YpG3CMARtg==
+
+"@types/d3-interpolate@*":
+ version "3.0.4"
+ resolved "https://registry.yarnpkg.com/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz#412b90e84870285f2ff8a846c6eb60344f12a41c"
+ integrity sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==
+ dependencies:
+ "@types/d3-color" "*"
+
+"@types/d3-path@*":
+ version "3.1.1"
+ resolved "https://registry.yarnpkg.com/@types/d3-path/-/d3-path-3.1.1.tgz#f632b380c3aca1dba8e34aa049bcd6a4af23df8a"
+ integrity sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==
+
+"@types/d3-polygon@*":
+ version "3.0.2"
+ resolved "https://registry.yarnpkg.com/@types/d3-polygon/-/d3-polygon-3.0.2.tgz#dfae54a6d35d19e76ac9565bcb32a8e54693189c"
+ integrity sha512-ZuWOtMaHCkN9xoeEMr1ubW2nGWsp4nIql+OPQRstu4ypeZ+zk3YKqQT0CXVe/PYqrKpZAi+J9mTs05TKwjXSRA==
+
+"@types/d3-quadtree@*":
+ version "3.0.6"
+ resolved "https://registry.yarnpkg.com/@types/d3-quadtree/-/d3-quadtree-3.0.6.tgz#d4740b0fe35b1c58b66e1488f4e7ed02952f570f"
+ integrity sha512-oUzyO1/Zm6rsxKRHA1vH0NEDG58HrT5icx/azi9MF1TWdtttWl0UIUsjEQBBh+SIkrpd21ZjEv7ptxWys1ncsg==
+
+"@types/d3-random@*":
+ version "3.0.3"
+ resolved "https://registry.yarnpkg.com/@types/d3-random/-/d3-random-3.0.3.tgz#ed995c71ecb15e0cd31e22d9d5d23942e3300cfb"
+ integrity sha512-Imagg1vJ3y76Y2ea0871wpabqp613+8/r0mCLEBfdtqC7xMSfj9idOnmBYyMoULfHePJyxMAw3nWhJxzc+LFwQ==
+
+"@types/d3-scale-chromatic@*":
+ version "3.1.0"
+ resolved "https://registry.yarnpkg.com/@types/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz#dc6d4f9a98376f18ea50bad6c39537f1b5463c39"
+ integrity sha512-iWMJgwkK7yTRmWqRB5plb1kadXyQ5Sj8V/zYlFGMUBbIPKQScw+Dku9cAAMgJG+z5GYDoMjWGLVOvjghDEFnKQ==
+
+"@types/d3-scale@*":
+ version "4.0.9"
+ resolved "https://registry.yarnpkg.com/@types/d3-scale/-/d3-scale-4.0.9.tgz#57a2f707242e6fe1de81ad7bfcccaaf606179afb"
+ integrity sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==
+ dependencies:
+ "@types/d3-time" "*"
+
+"@types/d3-selection@*", "@types/d3-selection@^3.0.3":
+ version "3.0.11"
+ resolved "https://registry.yarnpkg.com/@types/d3-selection/-/d3-selection-3.0.11.tgz#bd7a45fc0a8c3167a631675e61bc2ca2b058d4a3"
+ integrity sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==
+
+"@types/d3-shape@*":
+ version "3.1.7"
+ resolved "https://registry.yarnpkg.com/@types/d3-shape/-/d3-shape-3.1.7.tgz#2b7b423dc2dfe69c8c93596e673e37443348c555"
+ integrity sha512-VLvUQ33C+3J+8p+Daf+nYSOsjB4GXp19/S/aGo60m9h1v6XaxjiT82lKVWJCfzhtuZ3yD7i/TPeC/fuKLLOSmg==
+ dependencies:
+ "@types/d3-path" "*"
+
+"@types/d3-time-format@*":
+ version "4.0.3"
+ resolved "https://registry.yarnpkg.com/@types/d3-time-format/-/d3-time-format-4.0.3.tgz#d6bc1e6b6a7db69cccfbbdd4c34b70632d9e9db2"
+ integrity sha512-5xg9rC+wWL8kdDj153qZcsJ0FWiFt0J5RB6LYUNZjwSnesfblqrI/bJ1wBdJ8OQfncgbJG5+2F+qfqnqyzYxyg==
+
+"@types/d3-time@*":
+ version "3.0.4"
+ resolved "https://registry.yarnpkg.com/@types/d3-time/-/d3-time-3.0.4.tgz#8472feecd639691450dd8000eb33edd444e1323f"
+ integrity sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==
+
+"@types/d3-timer@*":
+ version "3.0.2"
+ resolved "https://registry.yarnpkg.com/@types/d3-timer/-/d3-timer-3.0.2.tgz#70bbda77dc23aa727413e22e214afa3f0e852f70"
+ integrity sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==
+
+"@types/d3-transition@*":
+ version "3.0.9"
+ resolved "https://registry.yarnpkg.com/@types/d3-transition/-/d3-transition-3.0.9.tgz#1136bc57e9ddb3c390dccc9b5ff3b7d2b8d94706"
+ integrity sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==
+ dependencies:
+ "@types/d3-selection" "*"
+
+"@types/d3-zoom@*", "@types/d3-zoom@^3.0.1":
+ version "3.0.8"
+ resolved "https://registry.yarnpkg.com/@types/d3-zoom/-/d3-zoom-3.0.8.tgz#dccb32d1c56b1e1c6e0f1180d994896f038bc40b"
+ integrity sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==
+ dependencies:
+ "@types/d3-interpolate" "*"
+ "@types/d3-selection" "*"
+
+"@types/d3@^7.4.0":
+ version "7.4.3"
+ resolved "https://registry.yarnpkg.com/@types/d3/-/d3-7.4.3.tgz#d4550a85d08f4978faf0a4c36b848c61eaac07e2"
+ integrity sha512-lZXZ9ckh5R8uiFVt8ogUNf+pIrK4EsWrx2Np75WvF/eTpJ0FMHNhjXk8CKEx/+gpHbNQyJWehbFaTvqmHWB3ww==
+ dependencies:
+ "@types/d3-array" "*"
+ "@types/d3-axis" "*"
+ "@types/d3-brush" "*"
+ "@types/d3-chord" "*"
+ "@types/d3-color" "*"
+ "@types/d3-contour" "*"
+ "@types/d3-delaunay" "*"
+ "@types/d3-dispatch" "*"
+ "@types/d3-drag" "*"
+ "@types/d3-dsv" "*"
+ "@types/d3-ease" "*"
+ "@types/d3-fetch" "*"
+ "@types/d3-force" "*"
+ "@types/d3-format" "*"
+ "@types/d3-geo" "*"
+ "@types/d3-hierarchy" "*"
+ "@types/d3-interpolate" "*"
+ "@types/d3-path" "*"
+ "@types/d3-polygon" "*"
+ "@types/d3-quadtree" "*"
+ "@types/d3-random" "*"
+ "@types/d3-scale" "*"
+ "@types/d3-scale-chromatic" "*"
+ "@types/d3-selection" "*"
+ "@types/d3-shape" "*"
+ "@types/d3-time" "*"
+ "@types/d3-time-format" "*"
+ "@types/d3-timer" "*"
+ "@types/d3-transition" "*"
+ "@types/d3-zoom" "*"
+
"@types/debug@^4.0.0":
version "4.1.12"
resolved "https://registry.yarnpkg.com/@types/debug/-/debug-4.1.12.tgz#a155f21690871953410df4b6b6f53187f0500917"
@@ -3276,6 +3552,11 @@
"@types/qs" "*"
"@types/serve-static" "*"
+"@types/geojson@*":
+ version "7946.0.16"
+ resolved "https://registry.yarnpkg.com/@types/geojson/-/geojson-7946.0.16.tgz#8ebe53d69efada7044454e3305c19017d97ced2a"
+ integrity sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg==
+
"@types/hast@^2.0.0":
version "2.3.10"
resolved "https://registry.yarnpkg.com/@types/hast/-/hast-2.3.10.tgz#5c9d9e0b304bbb8879b857225c5ebab2d81d7643"
@@ -4433,6 +4714,11 @@ ci-info@^3.2.0:
resolved "https://registry.yarnpkg.com/ci-info/-/ci-info-3.9.0.tgz#4279a62028a7b1f262f3473fc9605f5e218c59b4"
integrity sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ==
+classcat@^5.0.3, classcat@^5.0.4:
+ version "5.0.5"
+ resolved "https://registry.yarnpkg.com/classcat/-/classcat-5.0.5.tgz#8c209f359a93ac302404a10161b501eba9c09c77"
+ integrity sha512-JhZUT7JFcQy/EzW605k/ktHtncoo9vnyW/2GspNYwFlN1C/WmjuV/xtS04e9SOkL2sTdw0VAZ2UGCcQ9lR6p6w==
+
classnames@2.x, classnames@^2.2.1, classnames@^2.2.3, classnames@^2.2.5, classnames@^2.2.6, classnames@^2.3.1, classnames@^2.3.2, classnames@^2.5.1:
version "2.5.1"
resolved "https://registry.yarnpkg.com/classnames/-/classnames-2.5.1.tgz#ba774c614be0f016da105c858e7159eae8e7687b"
@@ -4951,6 +5237,68 @@ csstype@^3.0.2, csstype@^3.1.3:
resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81"
integrity sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==
+"d3-color@1 - 3":
+ version "3.1.0"
+ resolved "https://registry.yarnpkg.com/d3-color/-/d3-color-3.1.0.tgz#395b2833dfac71507f12ac2f7af23bf819de24e2"
+ integrity sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==
+
+"d3-dispatch@1 - 3":
+ version "3.0.1"
+ resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-3.0.1.tgz#5fc75284e9c2375c36c839411a0cf550cbfc4d5e"
+ integrity sha512-rzUyPU/S7rwUflMyLc1ETDeBj0NRuHKKAcvukozwhshr6g6c5d8zh4c2gQjY2bZ0dXeGLWc1PF174P2tVvKhfg==
+
+"d3-drag@2 - 3", d3-drag@^3.0.0:
+ version "3.0.0"
+ resolved "https://registry.yarnpkg.com/d3-drag/-/d3-drag-3.0.0.tgz#994aae9cd23c719f53b5e10e3a0a6108c69607ba"
+ integrity sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==
+ dependencies:
+ d3-dispatch "1 - 3"
+ d3-selection "3"
+
+"d3-ease@1 - 3":
+ version "3.0.1"
+ resolved "https://registry.yarnpkg.com/d3-ease/-/d3-ease-3.0.1.tgz#9658ac38a2140d59d346160f1f6c30fda0bd12f4"
+ integrity sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==
+
+"d3-interpolate@1 - 3":
+ version "3.0.1"
+ resolved "https://registry.yarnpkg.com/d3-interpolate/-/d3-interpolate-3.0.1.tgz#3c47aa5b32c5b3dfb56ef3fd4342078a632b400d"
+ integrity sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==
+ dependencies:
+ d3-color "1 - 3"
+
+"d3-selection@2 - 3", d3-selection@3, d3-selection@^3.0.0:
+ version "3.0.0"
+ resolved "https://registry.yarnpkg.com/d3-selection/-/d3-selection-3.0.0.tgz#c25338207efa72cc5b9bd1458a1a41901f1e1b31"
+ integrity sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==
+
+"d3-timer@1 - 3":
+ version "3.0.1"
+ resolved "https://registry.yarnpkg.com/d3-timer/-/d3-timer-3.0.1.tgz#6284d2a2708285b1abb7e201eda4380af35e63b0"
+ integrity sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==
+
+"d3-transition@2 - 3":
+ version "3.0.1"
+ resolved "https://registry.yarnpkg.com/d3-transition/-/d3-transition-3.0.1.tgz#6869fdde1448868077fdd5989200cb61b2a1645f"
+ integrity sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==
+ dependencies:
+ d3-color "1 - 3"
+ d3-dispatch "1 - 3"
+ d3-ease "1 - 3"
+ d3-interpolate "1 - 3"
+ d3-timer "1 - 3"
+
+d3-zoom@^3.0.0:
+ version "3.0.0"
+ resolved "https://registry.yarnpkg.com/d3-zoom/-/d3-zoom-3.0.0.tgz#d13f4165c73217ffeaa54295cd6969b3e7aee8f3"
+ integrity sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==
+ dependencies:
+ d3-dispatch "1 - 3"
+ d3-drag "2 - 3"
+ d3-interpolate "1 - 3"
+ d3-selection "2 - 3"
+ d3-transition "2 - 3"
+
data-uri-to-buffer@^6.0.2:
version "6.0.2"
resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz#8a58bb67384b261a38ef18bea1810cb01badd28b"
@@ -9432,6 +9780,18 @@ react@^18.2.0:
dependencies:
loose-envify "^1.1.0"
+reactflow@^11.11.4:
+ version "11.11.4"
+ resolved "https://registry.yarnpkg.com/reactflow/-/reactflow-11.11.4.tgz#e3593e313420542caed81aecbd73fb9bc6576653"
+ integrity sha512-70FOtJkUWH3BAOsN+LU9lCrKoKbtOPnz2uq0CV2PLdNSwxTXOhCbsZr50GmZ+Rtw3jx8Uv7/vBFtCGixLfd4Og==
+ dependencies:
+ "@reactflow/background" "11.3.14"
+ "@reactflow/controls" "11.2.14"
+ "@reactflow/core" "11.11.4"
+ "@reactflow/minimap" "11.7.14"
+ "@reactflow/node-resizer" "2.2.14"
+ "@reactflow/node-toolbar" "1.3.14"
+
readable-stream@^2.0.1:
version "2.3.8"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.8.tgz#91125e8042bba1b9887f49345f6277027ce8be9b"
diff --git a/docs/api/tutorials/sdk/bulk-assertions-sdk.md b/docs/api/tutorials/sdk/bulk-assertions-sdk.md
index a23e6311215130..4a25183f9ac82e 100644
--- a/docs/api/tutorials/sdk/bulk-assertions-sdk.md
+++ b/docs/api/tutorials/sdk/bulk-assertions-sdk.md
@@ -176,7 +176,7 @@ def create_freshness_assertions(datasets, client, registry):
# Store the assertion URN for future reference
registry["freshness"][str(dataset_urn)] = str(freshness_assertion.urn)
- print(f"✅ Created freshness assertion for {dataset_urn.name}: {freshness_assertion.urn}")
+ print(f"Created freshness assertion for {dataset_urn.name}: {freshness_assertion.urn}")
except Exception as e:
print(f"❌ Failed to create freshness assertion for {dataset_urn.name}: {e}")
@@ -212,7 +212,7 @@ def create_volume_assertions(datasets, client, registry):
# Store the assertion URN
registry["volume"][str(dataset_urn)] = str(volume_assertion.urn)
- print(f"✅ Created volume assertion for {dataset_urn.name}: {volume_assertion.urn}")
+ print(f"Created volume assertion for {dataset_urn.name}: {volume_assertion.urn}")
except Exception as e:
print(f"❌ Failed to create volume assertion for {dataset_urn.name}: {e}")
@@ -248,7 +248,7 @@ dataset_columns = {}
for dataset_urn in datasets:
columns = get_dataset_columns(client, dataset_urn)
dataset_columns[str(dataset_urn)] = columns
- print(f"📊 Found {len(columns)} columns in {dataset_urn.name}")
+ print(f"Found {len(columns)} columns in {dataset_urn.name}")
```
## Step 4: Create Column-Level Assertions
@@ -321,7 +321,7 @@ def create_column_assertions(datasets, columns_dict, client, registry):
registry["column_metrics"][dataset_key][column_name] = {}
registry["column_metrics"][dataset_key][column_name][rule_name] = str(assertion.urn)
- print(f"✅ Created {rule_name} assertion for {dataset_urn.name}.{column_name}")
+ print(f"Created {rule_name} assertion for {dataset_urn.name}.{column_name}")
except Exception as e:
print(f"❌ Failed to create {rule_name} assertion for {dataset_urn.name}.{column_name}: {e}")
@@ -383,7 +383,7 @@ def save_assertion_registry(registry, filename=None):
with open(filename, 'w') as f:
json.dump(registry_with_metadata, f, indent=2)
- print(f"💾 Saved assertion registry to {filename}")
+ print(f"Saved assertion registry to {filename}")
return filename
# Save the registry
@@ -602,7 +602,7 @@ def main():
print("\n📋 Creating freshness assertions...")
create_freshness_assertions(datasets, client, assertion_registry)
- print("\n📊 Creating volume assertions...")
+ print("\nCreating volume assertions...")
create_volume_assertions(datasets, client, assertion_registry)
# Step 2: Get column information and create column assertions
@@ -615,7 +615,7 @@ def main():
create_column_assertions(datasets, dataset_columns, client, assertion_registry)
# Step 3: Save results
- print("\n💾 Saving assertion registry...")
+ print("\nSaving assertion registry...")
registry_file = save_assertion_registry(assertion_registry)
# Summary
@@ -625,12 +625,12 @@ def main():
sum(len(cols) for cols in assertion_registry["column_metrics"].values())
)
- print(f"\n✅ Bulk assertion creation complete!")
- print(f" 📈 Total assertions created: {total_assertions}")
- print(f" 🕐 Freshness assertions: {len(assertion_registry['freshness'])}")
- print(f" 📊 Volume assertions: {len(assertion_registry['volume'])}")
- print(f" 🎯 Column assertions: {sum(len(cols) for cols in assertion_registry['column_metrics'].values())}")
- print(f" 💾 Registry saved to: {registry_file}")
+ print(f"\nBulk assertion creation complete!")
+ print(f" Total assertions created: {total_assertions}")
+ print(f" Freshness assertions: {len(assertion_registry['freshness'])}")
+ print(f" Volume assertions: {len(assertion_registry['volume'])}")
+ print(f" Column assertions: {sum(len(cols) for cols in assertion_registry['column_metrics'].values())}")
+ print(f" Registry saved to: {registry_file}")
if __name__ == "__main__":
main()
diff --git a/docs/learn-datahub/discovery/advanced-search.md b/docs/learn-datahub/discovery/advanced-search.md
new file mode 100644
index 00000000000000..23ffa32ee86c96
--- /dev/null
+++ b/docs/learn-datahub/discovery/advanced-search.md
@@ -0,0 +1,606 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import CodeBlock from '@theme/CodeBlock';
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import { SearchExercise, HandsOnExercise, InteractiveDemo } from '@site/src/components/TutorialExercise';
+import NextStepButton from '@site/src/components/NextStepButton';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Advanced Search Techniques (15 minutes)
+
+
+
+Master DataHub's powerful search capabilities to find exactly what you need, when you need it. Transform from basic keyword searching to surgical data discovery.
+
+## Scenario 1: Targeted Data Discovery
+
+**Objective**: Find customer segmentation data for a marketing campaign without knowing exact table names or locations.
+
+**What You'll Learn**: Strategic search approaches, advanced operators, and effective filtering techniques.
+
+## Search Strategy Framework
+
+Effective data discovery follows a systematic approach:
+
+**Professional Search Strategy:**
+
+1. **Start with Business Terms**: Use domain-specific language and common business concepts
+2. **Apply Smart Filters**: Narrow scope using platform, domain, and entity type filters
+3. **Refine with Operators**: Use advanced search operators for precise matching
+4. **Validate Results**: Review relevance, quality, and completeness of results
+5. **Save for Reuse**: Create saved searches for recurring discovery needs
+
+**Search Progression Example:**
+
+```
+Initial Query: "customer data"
+↓ Add Filters: Platform=Hive, Type=Dataset
+↓ Use Operators: "customer" AND ("profile" OR "behavior")
+↓ Validate: Check schema, lineage, and documentation
+↓ Save: "Customer Analytics Datasets" search
+```
+
+Let's apply this framework to solve our challenge!
+
+## Level 1: Strategic Keyword Search
+
+Start with business concepts, not technical terms:
+
+
+
+
+**Try these searches in your DataHub instance:**
+
+
+
+**What You'll Find**: Here are examples of the datasets your search would discover:
+
+
+
+
+
+
+
+:::tip Pro Tip
+Business users often name things differently than technical teams. Try both perspectives!
+:::
+
+
+
+
+**When business terms don't work, try technical approaches:**
+
+
+
+**Search 1: Database Patterns**
+
+```
+customer_segment user_cohort cust_analytics
+```
+
+**Search 2: Common Prefixes**
+
+```
+dim_customer fact_customer customer_dim
+```
+
+**Search 3: Analytics Patterns**
+
+```
+customer_ltv customer_score customer_tier
+```
+
+
+
+
+
+
+### Interactive Exercise: Your First Search
+
+
+
+**Try this now in DataHub:**
+
+1. **Open DataHub** at http://localhost:9002
+2. **Search for**: `customer segmentation`
+3. **Count the results**: How many datasets appear?
+4. **Note the variety**: Different platforms, naming conventions, descriptions
+
+**Reflection Questions:**
+
+- Which results look most relevant for marketing analysis?
+- What patterns do you notice in the naming conventions?
+- Are there results you didn't expect?
+
+
+
+## Level 2: Smart Filtering
+
+Raw search results can be overwhelming. Use filters to focus on what matters:
+
+### Platform Filtering
+
+
+
+
+**Follow along in DataHub:**
+
+1. **Search**: `customer`
+2. **Apply Platform Filter**:
+ - Click "Filters" in the left sidebar
+ - Select "Platform"
+ - Choose "PostgreSQL" (for operational data)
+ - OR choose "Snowflake" (for analytics data)
+
+**Notice how results change!**
+
+
+
+
+**Choose filters based on your use case:**
+
+**For Marketing Analysis:**
+
+- Snowflake, BigQuery (analytics platforms)
+- dbt (transformed data)
+- MySQL, PostgreSQL (raw operational data): generally not recommended for campaign analysis
+
+**For Operational Insights:**
+
+- PostgreSQL, MySQL (live operational data)
+- Kafka (real-time streams)
+- S3 (archived data): generally not recommended for operational insights
+
+**For Data Engineering:**
+
+- All platforms (need complete picture)
+- Include pipelines and jobs
+- Show lineage connections
+
+
+
+
+### Entity Type Filtering
+
+
+
+**Filter by what you're looking for:**
+
+| Need | Filter Selection | Why |
+| --------------------- | --------------------- | -------------------------- |
+| **Raw Data** | Datasets only | Focus on tables and views |
+| **Business Insights** | Dashboards + Charts | See existing analysis |
+| **Data Processing** | Data Jobs + Pipelines | Understand transformations |
+| **Complete Picture** | All entity types | Full ecosystem view |
+
+
+
+### Interactive Exercise: Smart Filtering
+
+
+
+**Challenge**: Find customer segmentation data suitable for marketing analysis
+
+**Your Turn:**
+
+1. **Search**: `customer segment`
+2. **Apply Filters**:
+ - Entity Type: "Datasets"
+ - Platform: "Snowflake" OR "BigQuery"
+ - (If available) Domain: "Marketing" or "Analytics"
+3. **Compare**: How many results now vs. before filtering?
+
+**Success Criteria**:
+
+- Results reduced to manageable number (< 20)
+- Results are more relevant to marketing use case
+- You can see clear candidates for your analysis
+
+
+
+## Level 3: Advanced Search Operators
+
+Unlock DataHub's power with search operators:
+
+### Boolean Operators
+
+
+
+
+
+
+**AND - All terms must match:**
+
+```
+customer AND segmentation AND marketing
+```
+
+_Finds datasets containing all three terms_
+
+**OR - Any term can match:**
+
+```
+customer OR user OR client
+```
+
+_Finds datasets with any customer-related term_
+
+**Combined Logic:**
+
+```
+(customer OR user) AND (segment OR cohort OR tier)
+```
+
+_Flexible matching for customer segmentation concepts_
+
+
+
+
+
+
+
+
+**NOT - Exclude unwanted results:**
+
+```
+customer NOT test
+```
+
+_Customer data excluding test tables_
+
+**Exclude Multiple Terms:**
+
+```
+customer NOT (test OR temp OR backup)
+```
+
+_Clean production customer data only_
+
+**Exclude Platforms:**
+
+```
+customer NOT platform:mysql
+```
+
+_Customer data from all platforms except MySQL_
+
+
+
+
+
+
+### Field-Specific Search
+
+Target specific metadata fields for precision:
+
+
+
+**Syntax**: `field:value` or `field:"exact phrase"`
+
+| Field | Example | Use Case |
+| -------------- | --------------------------------------- | ------------------------- |
+| `name:` | `name:customer*` | Search table/column names |
+| `description:` | `description:"customer lifetime value"` | Search documentation |
+| `platform:` | `platform:snowflake` | Specific data platform |
+| `tags:` | `tags:pii` | Find tagged datasets |
+| `owners:` | `owners:john.doe` | Find owned datasets |
+
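+These fields can be combined with the boolean operators covered above. A sketch of a combined query (the exact set of searchable fields may vary with your DataHub version):
+
+```
+name:customer* AND platform:snowflake AND tags:pii
+```
+
+This narrows results to Snowflake tables whose names start with "customer" and that are tagged as containing PII.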
+
+
+### Wildcard and Pattern Matching
+
+
+
+
+
+
+**Prefix Matching:**
+
+```
+customer*
+```
+
+_Matches: customer, customers, customer_data, customer_analytics_
+
+**Suffix Matching:**
+
+```
+*_customer
+```
+
+_Matches: dim_customer, fact_customer, raw_customer_
+
+**Complex Patterns:**
+
+```
+cust*_seg*
+```
+
+_Matches: customer_segments, cust_data_segmentation_
+
+
+
+
+
+
+
+
+**Exact Phrase Matching:**
+
+```
+"customer lifetime value"
+```
+
+_Must contain this exact phrase_
+
+**Combine with Operators:**
+
+```
+"customer segmentation" OR "user cohorts"
+```
+
+_Either exact phrase_
+
+**Field + Phrase:**
+
+```
+description:"high value customers"
+```
+
+_Exact phrase in description field_
+
+
+
+
+
+
+### Interactive Exercise: Operator Mastery
+
+
+
+**Your Mission**: Try each level in DataHub and observe how results change. Notice how each level gives you more control and precision. Which approach gives you the most relevant results for marketing analysis?
+
+**Pro Tip**: Copy each query into DataHub's search bar and compare the result quality. Level 4 should give you the most targeted, actionable datasets.
+
+
+
+## Level 4: Saved Searches & Efficiency
+
+Don't repeat work - save your successful searches:
+
+### Creating Saved Searches
+
+
+
+**When you find a great search:**
+
+1. **Perfect your search** using the techniques above
+2. **Click the bookmark icon** next to the search bar
+3. **Name it descriptively**: "Customer Segmentation - Marketing Ready"
+4. **Add description**: "Analytics-ready customer segment data for marketing campaigns"
+5. **Set sharing**: Team-wide or personal
+
+**Pro Naming Convention:**
+
+- `[Use Case] - [Data Type] - [Quality Level]`
+- Examples:
+ - "Marketing - Customer Segments - Production"
+ - "Analysis - User Behavior - High Quality"
+ - "Reporting - Sales Metrics - Daily Updated"
+
+
+
+### Search Templates for Common Scenarios
+
+
+
+
+```
+# High-quality customer data for campaigns
+(customer OR user) AND (segment OR cohort OR tier)
+AND platform:(snowflake OR bigquery)
+NOT (test OR temp OR backup)
+```
+
+
+
+
+```
+# Live operational customer data
+name:customer* AND platform:(postgres OR mysql)
+AND hasOwners:true AND updatedInLastWeek:true
+```
+
+
+
+
+```
+# Processed analytical datasets
+description:(analytics OR analysis OR processed)
+AND (customer OR user) AND NOT raw
+AND platform:(snowflake OR bigquery OR dbt)
+```
+
+
+
+
+## Success Checkpoint
+
+
+
+**You've mastered advanced search when you can:**
+
+**Speed Test**: Find relevant customer segmentation data in under 2 minutes
+**Precision Test**: Get < 10 highly relevant results using operators
+**Efficiency Test**: Create and use a saved search for future use
+**Strategy Test**: Choose the right approach for different discovery scenarios
+
+**Validation Exercise:**
+Try to solve this in 90 seconds: _"Find production-ready customer analytics data suitable for a marketing campaign, excluding any test or temporary tables."_
+
+**Expected Result**: 1-5 highly relevant datasets from analytics platforms
+
+
+
+## Pro Tips & Shortcuts
+
+
+
+**Speed Techniques:**
+
+- Use browser bookmarks for common DataHub searches
+- Set up browser shortcuts: `dh customer` → DataHub customer search
+- Learn keyboard shortcuts: `Ctrl+K` for quick search
+
+**Accuracy Boosters:**
+
+- Always check the "Updated" date - stale data wastes time
+- Look for owner information - contactable owners = reliable data
+- Check description quality - well-documented data is usually better maintained
+
+**Team Efficiency:**
+
+- Share successful search patterns with teammates
+- Create team-wide saved searches for common use cases
+- Document search strategies in team wikis
+
+
+
+## Troubleshooting Common Issues
+
+
+
+
+**Problem**: Search returns hundreds of results
+
+**Solutions:**
+
+1. **Add more specific terms**: `customer segmentation marketing`
+2. **Use field targeting**: `name:customer* AND description:segment*`
+3. **Apply platform filters**: Focus on relevant data platforms
+4. **Exclude noise**: `NOT (test OR temp OR backup OR old)`
+
+
+
+
+**Problem**: Search returns nothing
+
+**Solutions:**
+
+1. **Check spelling**: Try variations and wildcards
+2. **Broaden terms**: Use OR operators for synonyms
+3. **Remove filters**: Start broad, then narrow down
+4. **Try different fields**: Maybe it's in descriptions, not names
+
+
+
+
+**Problem**: Results aren't relevant to your use case
+
+**Solutions:**
+
+1. **Add context terms**: Include your domain/use case
+2. **Use exclusions**: Remove irrelevant categories
+3. **Filter by platform**: Match your analysis environment
+4. **Check entity types**: Maybe you need dashboards, not datasets
+
+
+
+
+## What You've Learned
+
+**Congratulations!** You've transformed from basic search to advanced discovery:
+
+- **Strategic Approach**: Business-first thinking with technical backup
+- **Smart Filtering**: Platform and entity type filtering for relevance
+- **Advanced Operators**: Boolean logic, field targeting, and wildcards
+- **Efficiency Tools**: Saved searches and reusable patterns
+- **Troubleshooting**: Common issues and systematic solutions
+
+---
+
+
+Next: Understand and Evaluate Your Data
+
diff --git a/docs/learn-datahub/discovery/collaborative-discovery.md b/docs/learn-datahub/discovery/collaborative-discovery.md
new file mode 100644
index 00000000000000..8330a912fb3d5a
--- /dev/null
+++ b/docs/learn-datahub/discovery/collaborative-discovery.md
@@ -0,0 +1,550 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Collaborative Discovery (10 minutes)
+
+
+
+Transform DataHub from a solo tool into a team knowledge platform. Learn to document insights, ask questions, and build collective data intelligence that benefits everyone.
+
+## Scenario 3: Knowledge Sharing at Scale
+
+**Objective**: Capture and share what you learn about a dataset so future analysts can easily find it and apply it in the correct context.
+
+**What You'll Learn**: Documentation practices, effective tagging strategies, and sustainable collaboration patterns.
+
+## The Collaboration Multiplier Effect
+
+Individual discoveries become team assets through effective collaboration:
+
+**Collaborative Discovery Workflow:**
+
+1. **Individual Discovery**: Find and explore datasets for specific needs
+2. **Document Insights**: Add descriptions, business context, and usage notes
+3. **Tag & Classify**: Apply consistent tags and domain classifications
+4. **Share Knowledge**: Contribute to team understanding through documentation
+5. **Team Benefits**: Enable others to discover and understand data faster
+6. **Improved Discovery**: Create a self-reinforcing cycle of knowledge sharing
+
+**Collaboration Impact:**
+
+```
+Personal Discovery → Team Knowledge → Organizational Asset
+ ↓ ↓ ↓
+Find what you need Others find it too Enterprise catalog
+Document findings Builds on your work Reduces redundancy
+Tag for context Improves over time Accelerates innovation
+```
+
+**The Multiplier Effect**: Your 10 minutes of documentation saves hours for future users.
+
+## Level 1: Smart Documentation (4 minutes)
+
+Transform cryptic datasets into self-explanatory resources:
+
+### Documentation That Actually Helps
+
+
+
+
+**Typical (Unhelpful) Documentation:**
+
+```
+Table: customer_seg_v3
+Description: Customer segmentation data
+```
+
+**Helpful Documentation:**
+
+```
+Table: customer_seg_v3
+Description: Customer segmentation analysis for marketing campaigns
+
+Business Purpose:
+- Identifies high-value customer segments for targeted marketing
+- Updated weekly based on 90-day purchase behavior
+- Used by Marketing, Sales, and Customer Success teams
+
+Key Insights:
+- 'Premium' segment represents 15% of customers but 60% of revenue
+- 'At Risk' segment requires immediate retention efforts
+- Segmentation logic based on RFM analysis (Recency, Frequency, Monetary)
+
+Usage Notes:
+- Use customer_id to join with other customer tables
+- segment_score ranges from 1-100 (higher = more valuable)
+- last_updated shows when each customer's segment was calculated
+```
+
+
+
+
+**Use these templates for consistency:**
+
+
+
+**Analytics Dataset Template:**
+
+```
+Business Purpose: [What business problem does this solve?]
+Key Metrics: [What can you measure with this data?]
+Refresh Schedule: [How often is this updated?]
+Data Quality: [Known limitations or gotchas]
+Common Use Cases: [How do teams typically use this?]
+Related Datasets: [What other data works well with this?]
+```
+
+**Operational Dataset Template:**
+
+```
+System Source: [What application generates this data?]
+Business Process: [What real-world process does this represent?]
+Key Relationships: [How does this connect to other systems?]
+SLA Information: [How fresh is this data expected to be?]
+Access Patterns: [Who typically needs this data and why?]
+```
+
+
+
+
+
+
+### Interactive Exercise: Documentation Makeover
+
+
+
+**Your Turn**: Find a poorly documented dataset and give it a makeover.
+
+**Step 1: Find a Dataset**
+
+- Search for datasets with minimal descriptions
+- Look for technical names without business context
+- Choose one that you understand or can research
+
+**Step 2: Research & Document**
+
+- What business problem does this solve?
+- Who would use this data and why?
+- What are the key columns and their meanings?
+- Are there any gotchas or limitations?
+
+**Step 3: Write Helpful Documentation**
+Use the templates above to create documentation that would help a new team member understand this dataset in 2 minutes.
+
+**Success Criteria:**
+
+- Business purpose is clear
+- Key columns are explained
+- Usage guidance is provided
+- You'd be comfortable with a new hire using this dataset based on your documentation
+
+
+
+## Level 2: Strategic Tagging (3 minutes)
+
+Tags are the navigation system for your data catalog. Use them strategically:
+
+### Tagging Strategy Framework
+
+
+
+**Tag Categories & Examples:**
+
+| Category | Purpose | Examples |
+| -------------------- | -------------------- | ----------------------------------------------------------- |
+| **Data Quality** | Signal reliability | `high-quality`, `needs-validation`, `production-ready` |
+| **Business Domain** | Organize by function | `marketing`, `finance`, `operations`, `customer-success` |
+| **Data Sensitivity** | Privacy & compliance | `pii`, `confidential`, `public`, `gdpr-relevant` |
+| **Usage Pattern** | Guide consumption | `real-time`, `batch-processed`, `analytical`, `operational` |
+| **Lifecycle Stage** | Indicate status | `active`, `deprecated`, `experimental`, `archived` |
+
+
+
+### Tagging Best Practices
+
+
+
+
+**Establish team conventions:**
+
+
+
+**Good Tag Naming:**
+
+- Use lowercase with hyphens: `customer-analytics`
+- Be specific: `daily-updated` not just `updated`
+- Use standard terms: `pii` not `personal-info`
+- Include context: `marketing-ready` not just `ready`
+
+**Avoid These Patterns:**
+
+- Inconsistent casing: `Customer-Analytics` vs `customer_analytics`
+- Vague terms: `good`, `important`, `useful`
+- Personal preferences: `johns-favorite`, `team-alpha-data`
+- Redundant info: `table-data` (everything in datasets is table data)
+
+
+
+
+
+
+**Create logical tag relationships:**
+
+**Business Domain Tag Hierarchy:**
+
+**Marketing Domain:**
+
+- `marketing-campaigns` - Campaign performance and attribution data
+- `marketing-analytics` - Customer behavior and conversion metrics
+- `marketing-automation` - Lead scoring and nurturing workflows
+
+**Finance Domain:**
+
+- `finance-reporting` - Financial statements and regulatory reports
+- `finance-forecasting` - Budget planning and revenue projections
+- `finance-compliance` - Audit trails and regulatory compliance data
+
+**Operations Domain:**
+
+- `operations-monitoring` - System performance and infrastructure metrics
+- `operations-logistics` - Supply chain and fulfillment data
+- `operations-support` - Customer service and issue tracking
+
+**Tag Strategy Best Practices:**
+
+```
+Domain Level: Broad business area (marketing, finance, operations)
+ ↓
+Function Level: Specific business function within domain
+ ↓
+Use Case Level: Specific analytical or operational purpose
+```
+
+**Example Tag Application:**
+
+- Dataset: "Customer Campaign Performance Q4 2024"
+- Tags: `marketing-campaigns`, `marketing-analytics`, `quarterly-reporting`
+- Result: Easily discoverable by marketing team and analysts
+
+**Benefits:**
+
+- Easier filtering and discovery
+- Consistent team usage
+- Scalable organization
+
+
+
+
+### Interactive Exercise: Tag Like a Pro
+
+
+
+**Challenge**: Tag 3 different datasets using strategic tagging.
+
+**Dataset Types to Find:**
+
+1. **Customer data** (operational or analytical)
+2. **Financial/sales data** (revenue, transactions, etc.)
+3. **Product/inventory data** (catalog, usage, etc.)
+
+**For Each Dataset, Add Tags For:**
+
+- **Quality level**: How reliable is this data?
+- **Business domain**: Which team owns/uses this?
+- **Sensitivity**: Any privacy considerations?
+- **Usage pattern**: How is this typically consumed?
+- **Lifecycle stage**: What's the status of this dataset?
+
+**Example Tagging:**
+
+```
+Dataset: customer_segments_weekly
+Tags: high-quality, marketing, pii, analytical, production-ready
+```
+
+**Validation**: Would a new team member understand the dataset's purpose and usage from your tags alone?
+
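+Tags applied in the UI can also be applied programmatically. A minimal sketch using the DataHub Python SDK (the GMS URL, platform, and dataset name are placeholders for your environment):
+
+```python
+from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
+
+# Placeholder connection and dataset URN; adjust to your deployment.
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
+dataset_urn = make_dataset_urn(platform="snowflake", name="marketing.customer_segments_weekly", env="PROD")
+
+# Note: emitting globalTags this way replaces the whole aspect, so fetch and
+# merge any existing tags first if the dataset already has some.
+tags = GlobalTagsClass(
+    tags=[TagAssociationClass(tag=make_tag_urn(t)) for t in ["marketing", "pii", "production-ready"]]
+)
+graph.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=tags))
+```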
+
+
+## Level 3: Knowledge Sharing (3 minutes)
+
+Turn your discoveries into team assets:
+
+### Effective Knowledge Sharing Techniques
+
+
+
+
+**Use DataHub's Q&A features strategically:**
+
+
+
+**Ask Good Questions:**
+
+- "What's the difference between customer_id and user_id in this table?"
+- "How often is this data refreshed? I see conflicting information."
+- "Are there known data quality issues with the email column?"
+- "What's the business logic behind the customer_score calculation?"
+
+**Provide Helpful Answers:**
+
+- Be specific and actionable
+- Include examples when possible
+- Reference related datasets or documentation
+- Update your answer if information changes
+
+**Question Patterns That Help Teams:**
+
+- Data quality clarifications
+- Business logic explanations
+- Usage recommendations
+- Alternative dataset suggestions
+
+
+
+
+
+
+**Guide future users with recommendations:**
+
+
+
+**Recommendation Types:**
+
+**Alternative Datasets:**
+"For real-time customer data, consider `customer_events_stream` instead of this daily batch table."
+
+**Usage Warnings:**
+"This table has a 2-hour delay. For time-sensitive analysis, use `customer_realtime_view`."
+
+**Quality Notes:**
+"Email column has ~15% null values. Use `email_verified` flag to filter for valid emails."
+
+**Best Practices:**
+"Join on `customer_uuid` rather than `email` for better accuracy and privacy compliance."
+
+
+
+
+
+
+**Build discovery networks:**
+
+
+
+**📌 Strategic Bookmarking:**
+
+- Bookmark datasets you use regularly
+- Bookmark high-quality examples for reference
+- Bookmark datasets relevant to your domain
+
+**👀 Smart Following:**
+
+- Follow datasets critical to your work
+- Follow datasets you've contributed documentation to
+- Follow datasets in active development
+
+**🔔 Notification Benefits:**
+
+- Get alerts when important data changes
+- Stay informed about schema updates
+- Learn from others' questions and discoveries
+
+
+
+
+
+
+### Building a Collaborative Culture
+
+
+
+**🌟 Team Practices That Work:**
+
+**📅 Regular Data Reviews:**
+
+- Weekly team check-ins on new datasets
+- Monthly data quality discussions
+- Quarterly documentation cleanup
+
+**🎓 Knowledge Sharing:**
+
+- Document discoveries in team channels
+- Share interesting datasets in team meetings
+- Create "dataset of the week" highlights
+
+**🏆 Recognition:**
+
+- Acknowledge good documentation contributors
+- Celebrate data quality improvements
+- Share success stories from collaborative discovery
+
+
+
+## Success Stories: Collaboration in Action
+
+
+
+
+**Before Collaboration:**
+
+- Each analyst spent 2-3 hours finding customer data
+- Repeated work across team members
+- Inconsistent analysis due to different data sources
+
+**After Implementing Collaboration:**
+
+- Comprehensive tagging system for marketing data
+- Shared documentation with business context
+- Team-wide saved searches for common use cases
+
+**Results:**
+
+- 70% reduction in data discovery time
+- Consistent analysis across team
+- New team members productive in days, not weeks
+
+
+
+
+**Challenge**: Engineering and Marketing teams using different customer datasets
+
+**Collaboration Solution:**
+
+- Joint documentation sessions
+- Shared tagging conventions
+- Cross-team Q&A on dataset differences
+
+**Outcome:**
+
+- Clear guidance on when to use each dataset
+- Reduced confusion and duplicate analysis
+- Better alignment between teams
+
+
+
+
+## Advanced Collaboration Features
+
+
+
+**Automated Collaboration:**
+
+- Set up alerts for dataset changes
+- Use DataHub Actions to notify teams of quality issues
+- Integrate with Slack for team notifications
+
+**Collaboration Analytics:**
+
+- Track which datasets are most bookmarked
+- Identify documentation gaps
+- Measure team engagement with data catalog
+
+**Targeted Sharing:**
+
+- Use domain-specific tags for relevant teams
+- Create role-based saved searches
+- Implement approval workflows for sensitive data
+
+
+
+## Success Checkpoint
+
+
+
+**You've mastered collaborative discovery when you can:**
+
+**Documentation Test**: Write dataset documentation that helps a new team member be productive immediately
+**Tagging Test**: Apply consistent, strategic tags that improve discoverability
+**Sharing Test**: Contribute questions, answers, or recommendations that benefit the team
+**Culture Test**: Establish practices that make collaboration natural and valuable
+
+**Final Challenge**:
+Take a dataset you've worked with and make it 50% more valuable to your team through documentation, tagging, and knowledge sharing. Measure success by asking: "Would this save time for the next person who needs similar data?"
+
+
+
+## Measuring Collaboration Success
+
+
+
+**Team Metrics to Track:**
+
+| Metric | Good Trend | What It Means |
+| -------------------------- | ---------- | --------------------------------------- |
+| **Documentation Coverage** | Increasing | More datasets have helpful descriptions |
+| **Tag Consistency** | Increasing | Team uses standardized tagging |
+| **Q&A Activity** | Increasing | Active knowledge sharing |
+| **Discovery Time** | Decreasing | Faster data finding |
+| **Repeat Questions** | Decreasing | Better documentation quality |
+
+
+
+## What You've Accomplished
+
+**Outstanding work!** You've completed the Data Discovery & Search mastery series:
+
+### Skills Mastered:
+
+- **Advanced Search**: Strategic search approaches with operators and filters
+- **Dataset Evaluation**: Rapid quality assessment and decision-making
+- **Collaborative Discovery**: Documentation, tagging, and knowledge sharing
+
+### Business Impact:
+
+- **Speed**: Find relevant data in minutes, not hours
+- **Accuracy**: Make informed decisions about data quality and fit
+- **Team Efficiency**: Share knowledge that benefits everyone
+- **Scalability**: Build practices that improve over time
+
+## What's Next?
+
+Choose your next learning adventure based on your role and interests:
+
+
+
+**For Data Analysts:**
+→ [Data Lineage & Impact Analysis](../lineage/overview.md) - Understand data dependencies and trace issues
+
+**For Data Engineers:**
+→ [Data Ingestion Mastery](../ingestion/overview.md) - Master recipes, profiling, and production patterns
+
+**For Data Governance Teams:**
+→ [Data Governance Fundamentals](../governance/overview.md) - Ownership, classification, and business glossaries
+
+**For Everyone:**
+→ [Data Quality & Monitoring](../quality/overview.md) - Set up assertions and monitoring for reliable data
+
+
+
+## Keep Learning & Contributing
+
+
+
+**🌟 Stay Engaged:**
+
+- Share your success stories with the DataHub community
+- Contribute to DataHub documentation and tutorials
+- Help other users in the DataHub Slack community
+- Suggest improvements to DataHub's collaborative features
+
+**📚 Additional Resources:**
+
+- [DataHub Community Slack](https://datahub.com/slack)
+- [DataHub Documentation](../../)
+- [DataHub GitHub](https://github.com/datahub-project/datahub)
+
+
+
+**Congratulations on becoming a DataHub Discovery Expert!**
+
+Your investment in learning these skills will pay dividends every time you or your teammates need to find and understand data. Keep practicing, keep collaborating, and keep discovering!
diff --git a/docs/learn-datahub/discovery/dataset-profiles.md b/docs/learn-datahub/discovery/dataset-profiles.md
new file mode 100644
index 00000000000000..21f1b67f04d4dd
--- /dev/null
+++ b/docs/learn-datahub/discovery/dataset-profiles.md
@@ -0,0 +1,564 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import NextStepButton from '@site/src/components/NextStepButton';
+
+# Understanding Dataset Profiles (20 minutes)
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+Learn to quickly assess data quality, understand schemas, and make informed decisions about whether a dataset meets your analysis needs. Transform from guessing to knowing.
+
+## Scenario 2: Root Cause Analysis
+
+**Objective**: Investigate anomalous dashboard metrics and evaluate candidate datasets to identify the root cause.
+
+**What You'll Learn**: Rapid quality assessment, statistics interpretation, and issue identification using DataHub profiles.
+
+## The Dataset Intelligence Framework
+
+Every dataset tells a story through its metadata. Learn to read these signals:
+
+**Dataset Profile Analysis Workflow:**
+
+1. **Dataset Discovery**: Locate potential datasets through search or browsing
+2. **Quick Health Check**: Review freshness, completeness, and quality indicators
+3. **Schema Analysis**: Examine column types, constraints, and relationships
+4. **Quality Assessment**: Evaluate data distributions, null rates, and anomalies
+5. **Usage Validation**: Check access patterns, downstream dependencies, and documentation
+6. **Decision**: Determine dataset suitability for your use case
+
+**Profile Reading Checklist:**
+
+```
+✓ Last Updated: Within acceptable freshness window?
+✓ Row Count: Reasonable size for expected data volume?
+✓ Column Quality: Acceptable null rates and distributions?
+✓ Schema Stability: Consistent structure over time?
+✓ Documentation: Sufficient context and business meaning?
+✓ Access Patterns: Evidence of active usage by others?
+```
+
+## Quick Health Check (2 minutes)
+
+Before diving deep, get a rapid overview of dataset health:
+
+### The 30-Second Assessment
+
+
+
+**Traffic Light System:**
+
+| Green Light | Yellow Light | Red Light |
+| ------------------ | -------------------- | -------------------- |
+| Updated < 24h ago | Updated 1-7 days ago | Updated > 7 days ago |
+| Has owner assigned | Owner unclear | No owner |
+| Has description | Minimal description | No description |
+| Normal row count | Row count changed | Dramatic row changes |
+
+
+
+**Visual Health Assessment Examples:**
+
+
+
+
+
+
+
+
+
+### Interactive Exercise: Health Check Practice
+
+
+
+**Find and evaluate 3 customer-related datasets:**
+
+1. **Open any customer dataset** from your previous search
+2. **Look at the header area** - note the key indicators
+3. **Fill out this assessment:**
+
+```
+Dataset Name: ________________
+Last Updated: ________________
+Owner: ______________________
+Row Count: ___________________
+Health Score: Good / Warning / Critical (circle one)
+```
+
+**Repeat for 2 more datasets and compare results**
+
+
+
+## Schema Deep Dive (8 minutes)
+
+The schema tells you what data is actually available and how it's structured:
+
+### Reading the Schema Tab
+
+
+
+
+**What to look for in each column:**
+
+
+
+**Column Name Patterns:**
+
+- `id`, `uuid`, `key` → Identifiers (good for joins)
+- `created_at`, `updated_at` → Timestamps (good for time analysis)
+- `email`, `phone`, `address` → PII (privacy considerations)
+- `status`, `type`, `category` → Categorical data (good for grouping)
+- `amount`, `count`, `score` → Numeric data (good for calculations)
+
+**Data Type Insights:**
+
+- `VARCHAR(255)` → Text fields, check for standardization
+- `TIMESTAMP` → Time-based analysis possible
+- `INTEGER` → Counting and math operations
+- `DECIMAL(10,2)` → Monetary values, precise calculations
+- `BOOLEAN` → Binary flags and filters
+
+
+
+
+
+
+**Understanding table relationships:**
+
+
+
+**🔑 Primary Keys:**
+
+- Usually named `id`, `uuid`, or `[table]_id`
+- Unique identifier for each row
+- Essential for joins and deduplication
+
+**Foreign Keys:**
+
+- References to other tables
+- Shows data relationships
+- Enables cross-table analysis
+
+**Composite Keys:**
+
+- Multiple columns forming unique identifier
+- Common in fact tables and junction tables
+- Important for grain understanding
+
+
+
+**Try This:** Look at a customer table schema and identify:
+
+- Primary key column
+- Foreign key relationships
+- PII columns that need special handling
+- Timestamp columns for temporal analysis
+
+
+
+
+**Schema-level quality signals:**
+
+
+
+**High Quality Indicators:**
+
+- Consistent naming conventions
+- Comprehensive column descriptions
+- Appropriate data types
+- Clear primary/foreign key relationships
+- Reasonable column count (not too sparse/dense)
+
+**Quality Concerns:**
+
+- Inconsistent naming (camelCase + snake_case)
+- Missing column descriptions
+- Generic column names (`col1`, `field_a`)
+- All VARCHAR types (suggests poor modeling)
+- Excessive NULL values in key columns
+
+
+
+
+
+
+### Interactive Exercise: Schema Detective Work
+
+
+
+**Scenario**: You need to join customer data with order data for analysis.
+
+**Your Task:**
+
+1. **Find a customer dataset** and examine its schema
+2. **Find an orders dataset** and examine its schema
+3. **Identify the join key(s)** - what columns connect these tables?
+4. **Assess join feasibility:**
+ - Are the key columns the same data type?
+ - Do the column names suggest they're related?
+ - Are there any data quality concerns?
+
+**Success Criteria:**
+
+- Identified clear join path between tables
+- Assessed potential data quality issues
+- Understand what analysis would be possible
+
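+Once you have identified candidate join keys, you can sanity-check the join outside DataHub. A minimal pandas sketch, assuming both tables are exported to CSV and share a `customer_id` column (all file and column names here are placeholders):
+
+```python
+import pandas as pd
+
+customers = pd.read_csv("customers.csv")  # placeholder exports of the two tables
+orders = pd.read_csv("orders.csv")
+
+# Same data type on both sides? Mismatched types are a common join pitfall.
+print(customers["customer_id"].dtype, orders["customer_id"].dtype)
+
+# How many orders actually match a known customer?
+match_rate = orders["customer_id"].isin(customers["customer_id"]).mean()
+print(f"{match_rate:.1%} of orders join to a customer record")
+```
+
+A low match rate is a signal to revisit the key choice or check for formatting mismatches before building the analysis.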
+
+
+## Data Statistics & Profiling (7 minutes)
+
+DataHub's automated profiling reveals data patterns and quality issues:
+
+### Understanding Profile Statistics
+
+
+
+
+**Key statistics to interpret:**
+
+
+
+| Statistic | What It Tells You | Red Flags |
+| ---------------------- | ----------------------- | --------------------------------- |
+| **Min/Max** | Data range and outliers | Impossible values (negative ages) |
+| **Mean/Median** | Central tendency | Large difference = skewed data |
+| **Null Count** | Data completeness | High nulls in key fields |
+| **Distinct Count** | Data variety | Too few = poor granularity |
+| **Standard Deviation** | Data spread | Very high = inconsistent data |
+
+**Practice Interpretation:**
+
+```
+customer_age: Min=18, Max=150, Mean=45, Median=42, Nulls=5%
+```
+
+**Analysis**: Mostly reasonable range (Max=150 is a likely data-entry outlier worth checking), slight right skew (mean > median), good completeness
+
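+If you want to reproduce or double-check these statistics outside DataHub, a minimal pandas sketch (assuming the table has been exported to a file; names are placeholders):
+
+```python
+import pandas as pd
+
+df = pd.read_csv("customer_profile_export.csv")  # placeholder export
+age = df["customer_age"]
+
+print(f"Min={age.min()}, Max={age.max()}, Mean={age.mean():.0f}, Median={age.median():.0f}")
+print(f"Null rate: {age.isna().mean():.1%}")
+print(f"Distinct values: {age.nunique()}")
+```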
+
+
+
+
+
+**Understanding categorical data:**
+
+
+
+**Value Distribution:**
+
+- **Top Values**: Most common categories
+- **Unique Count**: How many distinct values
+- **Null Percentage**: Missing data rate
+
+**Quality Signals:**
+
+- **Good**: Clear categories, low null rate
+- **Concerning**: Too many unique values, high null rate
+- **Bad**: Inconsistent formatting, obvious data entry errors
+
+**Example Analysis:**
+
+```
+customer_status:
+- Active: 85% (good - most customers active)
+- Inactive: 12% (reasonable churn)
+- Pending: 3% (small processing queue)
+- Nulls: 0% (excellent - no missing status)
+```
+
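+The same distribution is easy to confirm yourself with a short pandas check (file and column names are placeholders):
+
+```python
+import pandas as pd
+
+df = pd.read_csv("customer_profile_export.csv")  # placeholder export
+# Share of each status value, with missing values counted as their own bucket
+print(df["customer_status"].value_counts(normalize=True, dropna=False))
+```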
+
+
+
+
+
+**Time-based data insights:**
+
+
+
+**Temporal Patterns:**
+
+- **Date Range**: How far back does data go?
+- **Update Frequency**: Daily, hourly, real-time?
+- **Gaps**: Missing time periods?
+- **Seasonality**: Regular patterns?
+
+**Business Relevance:**
+
+- **Recent Data**: Good for current analysis
+- **Historical Depth**: Enables trend analysis
+- **Regular Updates**: Reliable for ongoing monitoring
+- **Complete Coverage**: No missing business periods
+
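+You can also verify date range and coverage gaps yourself; a small pandas sketch (file and column names are placeholders):
+
+```python
+import pandas as pd
+
+df = pd.read_csv("orders_export.csv", parse_dates=["created_at"])  # placeholder export
+
+print("Date range:", df["created_at"].min(), "to", df["created_at"].max())
+
+# Count rows per day and flag days with no data at all
+daily = df.set_index("created_at").resample("D").size()
+print("Days with zero rows:", (daily == 0).sum())
+```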
+
+
+
+
+
+### Interactive Exercise: Data Quality Detective
+
+
+
+**Mystery**: Customer count dropped 50% overnight. Use profiling data to investigate.
+
+**Investigation Steps:**
+
+1. **Find customer datasets** updated in the last 2 days
+2. **Check row count trends** - look for dramatic changes
+3. **Examine key columns** for anomalies:
+ - Are there unusual null rates?
+ - Do value distributions look normal?
+ - Are there data type inconsistencies?
+
+**Detective Questions:**
+
+- Which dataset shows the row count drop?
+- What columns might explain the change?
+- Are there data quality issues that could cause undercounting?
+
+**Report Your Findings:**
+
+```
+Suspect Dataset: ________________
+Row Count Change: _______________
+Potential Cause: ________________
+Confidence Level: High/Medium/Low
+```
+
+
+
+## Usage Patterns & Validation (3 minutes)
+
+Understand how others use this data to validate your choice:
+
+### Query History Analysis
+
+
+
+**Usage Indicators:**
+
+| Pattern | Interpretation | Decision Impact |
+| --------------------- | ------------------------ | ------------------------------- |
+| **High Query Volume** | Popular, trusted dataset | Good choice for analysis |
+| **Recent Queries** | Actively used, current | Likely up-to-date |
+| **Complex Queries** | Rich analytical use | Supports sophisticated analysis |
+| **Simple Queries** | Basic lookup use | May lack analytical depth |
+| **No Recent Usage** | Potentially stale | Investigate before using |
+
+
+
+### User Feedback Signals
+
+
+
+
+**Look for community validation:**
+
+- **Bookmarks/Follows**: How many users track this dataset?
+- **Documentation Quality**: Well-documented = well-used
+- **Owner Responsiveness**: Active owners = maintained data
+- **Related Datasets**: Part of a larger, maintained ecosystem?
+
+
+
+
+**User-generated quality signals:**
+
+- **Tags**: `high-quality`, `production-ready`, `deprecated`
+- **Comments**: User experiences and gotchas
+- **Issues**: Known problems and limitations
+- **Recommendations**: Alternative datasets for similar use cases
+
+
+
+
+## Making the Go/No-Go Decision
+
+Synthesize all information into a clear decision:
+
+### Decision Framework
+
+
+
+**Use This Dataset If:**
+
+- Health check shows green/yellow lights
+- Schema matches your analysis needs
+- Data quality statistics look reasonable
+- Usage patterns indicate active maintenance
+- You can contact the owner if needed
+
+**Investigate Further If:**
+
+- Some quality concerns but dataset is unique
+- Usage is low but data looks comprehensive
+- Owner is unclear but data seems current
+
+**Skip This Dataset If:**
+
+- Multiple red flags in health check
+- Schema doesn't support your use case
+- Serious data quality issues
+- No recent usage and no owner contact
+- Better alternatives are available
+
+
+
+### Final Exercise: Complete Dataset Evaluation
+
+
+
+**Challenge**: Evaluate 2 customer datasets and choose the better one for marketing analysis.
+
+**Evaluation Scorecard:**
+
+```
+Dataset A: ________________ Dataset B: ________________
+
+Health Check (1-5): ___ Health Check (1-5): ___
+Schema Quality (1-5): ___ Schema Quality (1-5): ___
+Data Quality (1-5): ___ Data Quality (1-5): ___
+Usage Patterns (1-5): ___ Usage Patterns (1-5): ___
+Total Score: ___/20 Total Score: ___/20
+
+Winner: Dataset ___
+Reason: ________________________
+```
+
+**Validation**: Can you justify your choice to a colleague in 30 seconds?
+
+
+
+## Pro Tips for Efficient Evaluation
+
+
+
+**Speed Techniques:**
+
+- Develop a mental checklist for rapid assessment
+- Use browser tabs to compare multiple datasets
+- Focus on deal-breakers first (freshness, schema fit)
+
+**Accuracy Boosters:**
+
+- Always check sample data when available
+- Cross-reference with lineage to understand data flow
+- Contact owners for clarification on edge cases
+
+**Team Efficiency:**
+
+- Document your evaluation criteria for consistency
+- Share findings with teammates to avoid duplicate work
+- Create team standards for "good enough" data quality
+
+
+
+## Success Checkpoint
+
+
+
+**You've mastered dataset evaluation when you can:**
+
+**Speed Test**: Complete health check + schema review in under 5 minutes
+**Quality Test**: Identify 3 potential data quality issues from profiling stats
+**Decision Test**: Make confident go/no-go decisions with clear justification
+**Communication Test**: Explain dataset suitability to stakeholders
+
+**Final Validation:**
+Choose the best customer dataset for a marketing campaign analysis. Justify your choice in 3 bullet points covering health, schema, and quality.
+
+
+
+## Common Evaluation Pitfalls
+
+
+
+
+**Problem**: Waiting for perfect data that doesn't exist
+
+**Solution**:
+
+- Define "good enough" criteria upfront
+- Focus on fitness for purpose, not perfection
+- Consider data improvement as part of your project
+
+
+
+
+**Problem**: Making decisions based only on names and descriptions
+
+**Solution**:
+
+- Always check actual schema and statistics
+- Look at sample data when available
+- Verify assumptions with data owners
+
+
+
+
+**Problem**: Evaluating datasets in isolation without considering alternatives
+
+**Solution**:
+
+- Always compare 2-3 options when possible
+- Consider combining multiple datasets
+- Check lineage for upstream/downstream alternatives
+
+
+
+
+## What You've Learned
+
+**Excellent work!** You can now rapidly assess dataset quality and make informed decisions:
+
+- **Health Assessment**: Quick evaluation of dataset reliability
+- **Schema Intelligence**: Understanding structure and relationships
+- **Quality Analysis**: Interpreting statistics and profiling data
+- **Usage Validation**: Leveraging community knowledge
+- **Decision Framework**: Systematic go/no-go evaluation
+
+---
+
+
+Next: Collaborate and Share Knowledge
+
diff --git a/docs/learn-datahub/discovery/overview.md b/docs/learn-datahub/discovery/overview.md
new file mode 100644
index 00000000000000..422e1bb3bd29c0
--- /dev/null
+++ b/docs/learn-datahub/discovery/overview.md
@@ -0,0 +1,270 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import CodeBlock from '@theme/CodeBlock';
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Data Discovery & Search (45 minutes)
+
+
+
+:::tip Prerequisites
+Complete the [DataHub Quickstart](../quickstart/overview.md) tutorial first to have DataHub running with sample data.
+:::
+
+## What You'll Master
+
+Transform from a basic DataHub user into a discovery expert by mastering advanced search techniques, understanding dataset profiles, and leveraging collaborative features.
+
+**Learning Outcomes:**
+
+- **Advanced Search Mastery**: Use operators, filters, and saved searches like a pro
+- **Dataset Intelligence**: Read and interpret automatically generated data profiles
+- **Collaborative Discovery**: Leverage social features to crowdsource data knowledge
+- **Search Strategy**: Develop systematic approaches for different discovery scenarios
+
+**Enterprise Data Discovery Framework:**
+
+
+
+**Discovery Navigation Strategy**:
+
+1. **Start with Business Need** (requirements gathering)
+2. **Apply Search Strategy** (targeted discovery)
+3. **Filter and Refine**: What platforms, what domains?
+ - Platform filters → Focus on relevant data systems
+ - Domain filters → Narrow to business area
+ - Entity type → Tables, dashboards, or pipelines
+4. **Evaluate Data Quality**: Is this the right data?
+ - Check data freshness and update patterns
+ - Review schema compatibility with analysis needs
+ - Assess lineage depth and data reliability
+5. **Plan Integration**: How to access and use
+ - Verify permissions and access controls
+ - Gather connection details and usage patterns
+ - Check tags and glossary terms for context
+
+**Professional Approach**: This 5-step discovery method mirrors the systematic approach used in lineage analysis - ensuring you find the right data efficiently while understanding its full context.
+
+## Interactive Tutorial Structure
+
+This hands-on tutorial uses **real search scenarios** you'll encounter daily:
+
+
+
+| Step | Scenario | Time | Interactive Elements |
+| ---- | ----------------------------------------------------- | ------ | ----------------------------------------- |
+| 1 | [Advanced Search Techniques](advanced-search.md) | 15 min | Live search examples, Interactive filters |
+| 2 | [Understanding Dataset Profiles](dataset-profiles.md) | 20 min | Profile interpretation, Quality analysis |
+| 3 | [Collaborative Discovery](collaborative-discovery.md) | 10 min | Documentation exercises, Tagging practice |
+
+
+
+**Total Time: 45 minutes**
+
+## Professional Discovery Scenarios
+
+Throughout this tutorial, you'll work through practical scenarios:
+
+:::info Scenario 1: Targeted Data Discovery
+**Use Case**: Locate customer segmentation data for a marketing campaign without knowing exact table names.
+
+**Focus Areas**: Exploratory search, filtering, schema analysis
+:::
+
+:::info Scenario 2: Root Cause Analysis
+**Use Case**: Investigate suspicious dashboard numbers by tracing data lineage to identify the source of the issue.
+
+**Focus Areas**: Lineage navigation, data quality assessment, root cause analysis
+:::
+
+:::info Scenario 3: Knowledge Sharing at Scale
+**Use Case**: Document insights and context so others can reliably discover and apply the dataset.
+
+**Focus Areas**: Documentation, tagging, collaborative features
+:::
+
+## Interactive Learning Features
+
+This tutorial leverages Docusaurus's interactive capabilities:
+
+
+
+
+- **Live Search Practice**: Try real searches in your DataHub instance
+- **Interactive Filters**: Step-by-step filter application
+- **Profile Analysis**: Guided interpretation of data statistics
+- **Collaboration Simulation**: Practice documentation and tagging
+
+
+
+
+- **Knowledge Checks**: Quick quizzes to verify understanding
+- **Practical Validation**: Confirm you can perform key tasks
+- **Scenario Completion**: Solve real discovery challenges
+- **Skill Assessment**: Rate your confidence with each technique
+
+
+
+
+- **Cheat Sheets**: Quick reference for search operators
+- **Best Practices**: Pro tips from experienced users
+- **Troubleshooting**: Common issues and solutions
+- **Advanced Techniques**: Power user shortcuts
+
+
+
+
+## Prerequisites Check
+
+Before starting, ensure you have:
+
+
+
+- [ ] **DataHub running locally** at http://localhost:9002
+- [ ] **Sample data ingested** (from quickstart tutorial)
+- [ ] **Basic familiarity** with DataHub navigation
+- [ ] **15 minutes** of focused time per section
+
+
+
+:::tip Quick Setup Verification
+Test your setup by searching for "customer" in DataHub. You should see several results from the sample data.
+:::
+
+## Learning Path Integration
+
+**Coming from:** [DataHub Quickstart](../quickstart/overview.md) - You understand basic navigation and have sample data
+
+**Going to:** Choose your next path based on your role:
+
+- **Data Engineers**: [Data Ingestion Mastery](../ingestion/overview.md)
+- **Analysts**: [Data Lineage & Impact Analysis](../lineage/overview.md)
+- **Governance Teams**: [Data Governance Fundamentals](../governance/overview.md)
+
+## Success Metrics
+
+By the end of this tutorial, you'll be able to:
+
+
+
+- **Speed**: Find relevant datasets in under 2 minutes
+- **Accuracy**: Identify the right data source for your analysis needs
+- **Insight**: Quickly assess data quality and freshness
+- **Collaboration**: Effectively document and share data knowledge
+
+
+
+## Interactive Demo Preview
+
+Here's a taste of what you'll learn:
+
+
+
+
+```
+Search: "customer"
+Results: 47 datasets found
+```
+
+
+
+
+```
+Search: name:customer* AND platform:postgres AND hasOwners:true
+Results: 3 highly relevant datasets found
+Filters: PostgreSQL, Has Documentation, Updated Last 7 Days
+```
+
+
+
+
+```
+Search: (customer OR user) AND (segment* OR cohort*) AND NOT test*
+Saved Search: "Customer Segmentation Data"
+Smart Filters: Production Only, High Quality, Well Documented
+Results: 1 perfect match found in 15 seconds
+```
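+
+You can also run these searches programmatically. Below is a minimal sketch that sends a query to DataHub's GraphQL endpoint, assuming the quickstart frontend exposes it at `http://localhost:9002/api/graphql` and that the `searchAcrossEntities` query used by the UI is available; field names can vary between versions, and your instance may require a personal access token in an `Authorization` header.
+
+```python
+# Sketch: issue a DataHub search from Python (assumptions noted above).
+import requests
+
+GRAPHQL_URL = "http://localhost:9002/api/graphql"
+
+SEARCH_QUERY = """
+query discover($text: String!) {
+  searchAcrossEntities(input: { query: $text, start: 0, count: 10 }) {
+    total
+    searchResults {
+      entity { urn type }
+    }
+  }
+}
+"""
+
+response = requests.post(
+    GRAPHQL_URL,
+    json={"query": SEARCH_QUERY, "variables": {"text": "name:customer* AND platform:postgres"}},
+)
+response.raise_for_status()
+results = response.json()["data"]["searchAcrossEntities"]
+print(f"Found {results['total']} matching entities")
+for hit in results["searchResults"]:
+    print(hit["entity"]["type"], hit["entity"]["urn"])
+```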
+
+
+
+
+---
+
+**Ready to become a DataHub discovery expert?** Let's start with [Advanced Search Techniques](advanced-search.md) →
diff --git a/docs/learn-datahub/governance/business-glossary.md b/docs/learn-datahub/governance/business-glossary.md
new file mode 100644
index 00000000000000..dd46c7921fadbc
--- /dev/null
+++ b/docs/learn-datahub/governance/business-glossary.md
@@ -0,0 +1,285 @@
+# Business Glossary
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+## Creating Consistent Business Language
+
+**Time Required**: 12 minutes
+
+### The Business Language Challenge
+
+Your organization uses terms like "customer," "revenue," and "conversion" across different teams, but everyone has slightly different definitions. The marketing team's "active user" differs from the product team's definition, leading to:
+
+- **Conflicting reports** with different numbers for the same metric
+- **Wasted time** in meetings clarifying what terms mean
+- **Poor decision-making** based on misunderstood data
+- **Reduced trust** in data and analytics
+
+**Real-World Impact**: Your executive team received two different "monthly revenue" reports with a $2M discrepancy because Finance and Sales defined "recognized revenue" differently.
+
+### Understanding Business Glossaries
+
+A business glossary provides standardized definitions for business terms, ensuring everyone speaks the same data language:
+
+
+
+
+
+**Glossary Benefits**:
+
+- **Consistent Definitions**: Single source of truth for business terms
+- **Improved Communication**: Teams use standardized language
+- **Better Data Discovery**: Find data using business terminology
+- **Regulatory Compliance**: Clear definitions for audit requirements
+
+### Exercise 1: Create Core Business Terms
+
+Start by defining your organization's most important business concepts:
+
+#### Step 1: Access the Glossary
+
+1. **Navigate to "Glossary"** in DataHub's main menu
+2. **Click "Create Term"** to add your first business term
+3. **Review existing terms** to avoid duplicates
+
+#### Step 2: Define "Active Customer"
+
+Create a standardized definition for one of your most important terms:
+
+1. **Term Name**: "Active Customer"
+2. **Definition**: "A customer who has made at least one purchase or engaged with our platform within the last 90 days"
+3. **Business Context**: "Used across Marketing, Product, and Finance teams for consistent customer reporting"
+4. **Calculation Logic**: "WHERE last_activity_date >= CURRENT_DATE - 90"
+5. **Related Terms**: Link to "Customer," "Engagement," "Retention"
+6. **Owner**: Assign to your Customer Analytics team
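+
+If you manage metadata as code, the same term can be created with DataHub's Python SDK. This is a minimal sketch assuming the `acryl-datahub` package and a local metadata service at `http://localhost:8080`; exact class and field names may differ between SDK versions, so treat it as a starting point rather than a definitive implementation.
+
+```python
+# Sketch: create the "Active Customer" glossary term programmatically
+# (assumes the acryl-datahub Python SDK; verify names against your installed version).
+from datahub.emitter.mce_builder import make_term_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import GlossaryTermInfoClass
+
+term_info = GlossaryTermInfoClass(
+    name="Active Customer",
+    definition=(
+        "A customer who has made at least one purchase or engaged with our "
+        "platform within the last 90 days"
+    ),
+    termSource="INTERNAL",
+)
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+emitter.emit(
+    MetadataChangeProposalWrapper(
+        entityUrn=make_term_urn("Active Customer"),
+        aspect=term_info,
+    )
+)
+```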
+
+#### Step 3: Add Financial Terms
+
+Create definitions for key financial metrics:
+
+**Revenue Recognition**:
+
+- **Definition**: "Revenue recorded when goods are delivered or services are performed, following GAAP standards"
+- **Business Rules**: "Subscription revenue recognized monthly; one-time purchases at delivery"
+- **Calculation**: "SUM(recognized_amount) WHERE recognition_date <= report_date"
+
+**Customer Lifetime Value (CLV)**:
+
+- **Definition**: "Predicted total revenue from a customer over their entire relationship with the company"
+- **Formula**: "Average Order Value × Purchase Frequency × Customer Lifespan"
+- **Usage**: "Used for customer acquisition cost analysis and marketing budget allocation"
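+
+To make the CLV formula concrete, here is a quick worked example; the numbers are purely illustrative:
+
+```python
+# Worked example of the CLV formula above (all values are hypothetical).
+average_order_value = 120.0   # dollars per order
+purchase_frequency = 4.0      # orders per customer per year
+customer_lifespan = 3.0       # expected years of relationship
+
+clv = average_order_value * purchase_frequency * customer_lifespan
+print(f"Customer Lifetime Value: ${clv:,.2f}")  # Customer Lifetime Value: $1,440.00
+```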
+
+### Exercise 2: Link Terms to Datasets
+
+Connect your business terms to actual data assets:
+
+#### Step 1: Navigate to Dataset
+
+1. **Open the customer analytics dataset** (e.g., "fct_users_created")
+2. **Go to the "Properties" tab**
+3. **Find the "Glossary Terms" section**
+
+#### Step 2: Add Relevant Terms
+
+1. **Click "Add Terms"**
+2. **Search for "Active Customer"** and select it
+3. **Add "Customer Lifetime Value"** if the dataset contains CLV calculations
+4. **Add "Revenue Metric"** for any revenue-related fields
+5. **Save the associations**
+
+#### Step 3: Column-Level Term Assignment
+
+For specific columns, add more granular terms:
+
+- `customer_id` column → "Customer Identifier"
+- `registration_date` column → "Customer Acquisition Date"
+- `last_login_date` column → "Customer Activity Date"
+- `total_spent` column → "Customer Value"
+
+### Exercise 3: Create Term Hierarchies
+
+Organize terms into logical hierarchies for better navigation:
+
+#### Step 1: Create Term Categories
+
+Set up high-level categories using DataHub's glossary hierarchy:
+
+**Business Glossary Term Hierarchy:**
+
+```
+Customer Terms
+├── 📂 Customer Identification
+│ ├── Customer ID
+│ └── Customer Segment
+├── 📂 Customer Behavior
+│ ├── Active Customer
+│ └── Customer Engagement
+└── 📂 Customer Value
+ ├── Customer Lifetime Value (CLV)
+ └── Customer Acquisition Cost (CAC)
+
+Financial Terms
+├── 📂 Revenue Metrics
+│ ├── Revenue Recognition
+│ └── Monthly Recurring Revenue (MRR)
+└── 📂 Cost Metrics
+ ├── Cost of Goods Sold (COGS)
+ └── Operating Expenses (OPEX)
+```
+
+#### Step 2: Implement Hierarchies
+
+1. **Create parent terms** for each category
+2. **Link child terms** to their parents
+3. **Add cross-references** between related terms
+4. **Document relationships** in term descriptions
+
+### Exercise 4: Establish Glossary Governance
+
+Set up processes to maintain glossary quality:
+
+#### Step 1: Assign Term Stewards
+
+1. **For each business domain**, assign term stewards:
+
+ - Customer terms → Customer Success Manager
+ - Financial terms → Finance Business Analyst
+ - Product terms → Product Manager
+ - Marketing terms → Marketing Operations
+
+2. **Define steward responsibilities**:
+ - Review and approve new terms
+ - Update definitions when business rules change
+ - Resolve conflicts between similar terms
+
+#### Step 2: Create Review Processes
+
+1. **Quarterly term reviews**:
+
+ - Verify definitions are still accurate
+ - Update terms based on business changes
+ - Archive obsolete terms
+
+2. **New term approval workflow**:
+ - Propose new terms through formal process
+ - Business stakeholder review and approval
+ - Technical validation of term usage
+
+### Understanding Glossary Impact
+
+A well-maintained business glossary delivers:
+
+**Improved Data Literacy**:
+
+- Business users understand data meaning
+- Reduced time spent clarifying definitions
+- Increased confidence in data-driven decisions
+
+**Better Collaboration**:
+
+- Consistent language across teams
+- Faster onboarding of new team members
+- More productive data discussions
+
+**Enhanced Data Discovery**:
+
+- Find datasets using business terminology
+- Understand data context without technical expertise
+- Discover related data through term relationships
+
+### Advanced Glossary Features
+
+#### 1. Term Lineage
+
+Track how business terms relate to data lineage:
+
+- See which datasets contribute to a business metric
+- Understand impact of data changes on business terms
+- Trace business definitions to source systems
+
+#### 2. Automated Term Detection
+
+Use DataHub's capabilities to:
+
+- Automatically suggest terms for new datasets
+- Detect when datasets match existing term definitions
+- Alert when term usage becomes inconsistent
+
+#### 3. Integration with BI Tools
+
+Connect your glossary to:
+
+- Business intelligence dashboards
+- Reporting tools
+- Data visualization platforms
+
+### Measuring Glossary Success
+
+Track these metrics to measure glossary adoption:
+
+- **Term Coverage**: Percentage of datasets with glossary terms
+- **Term Usage**: How often terms are referenced
+- **Definition Consistency**: Alignment across different uses
+- **User Engagement**: Active glossary users and contributions
+- **Business Impact**: Reduction in definition-related confusion
+
+### Best Practices for Business Glossaries
+
+#### 1. Start with High-Impact Terms
+
+Focus on terms that:
+
+- Appear in executive reports
+- Are used across multiple teams
+- Have caused confusion in the past
+- Are required for compliance
+
+#### 2. Keep Definitions Business-Focused
+
+- Use language business users understand
+- Avoid technical jargon
+- Include business context and usage
+- Provide concrete examples
+
+#### 3. Maintain Glossary Quality
+
+- Regular reviews and updates
+- Clear ownership and stewardship
+- Version control for definition changes
+- Feedback mechanisms for users
+
+#### 4. Promote Adoption
+
+- Training sessions for business users
+- Integration with existing workflows
+- Success stories and use cases
+- Executive sponsorship and support
+
+### Next Steps
+
+With a comprehensive business glossary in place, you're ready to implement automated governance policies that enforce your data standards at scale.
+
+
diff --git a/docs/learn-datahub/governance/data-classification.md b/docs/learn-datahub/governance/data-classification.md
new file mode 100644
index 00000000000000..ea98a9c64d779b
--- /dev/null
+++ b/docs/learn-datahub/governance/data-classification.md
@@ -0,0 +1,272 @@
+# Data Classification
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+## Protecting Sensitive Data Through Classification
+
+**Time Required**: 15 minutes
+
+### The Classification Challenge
+
+Your company handles customer PII, financial data, and proprietary business information across hundreds of datasets. Without proper classification, you can't:
+
+- **Comply with regulations** like GDPR, CCPA, or SOX
+- **Implement appropriate security controls** for different data types
+- **Respond to data subject requests** or audit requirements
+- **Prevent accidental exposure** of sensitive information
+
+**Real-World Scenario**: During a recent audit, your team couldn't quickly identify which datasets contained PII, leading to a 2-week manual review process and potential compliance penalties.
+
+### Understanding Data Classification Levels
+
+DataHub supports industry-standard classification levels:
+
+
+
+
+
+
+
+**Classification Levels**:
+
+- **Restricted**: PII, financial data, trade secrets (highest protection)
+- **Confidential**: Internal business data, customer insights
+- **Internal**: General business information, operational data
+- **Public**: Marketing materials, published reports
+
+### Exercise 1: Implement PII Detection
+
+Set up automated detection of personally identifiable information:
+
+#### Step 1: Enable PII Classification
+
+1. **Navigate to Settings** → **Classification**
+2. **Enable "Automatic PII Detection"**
+3. **Configure detection patterns** for:
+ - Email addresses (`*email*`, `*e_mail*`)
+ - Phone numbers (`*phone*`, `*mobile*`)
+ - Social Security Numbers (`*ssn*`, `*social_security*`)
+ - Credit card numbers (`*card*`, `*payment*`)
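+
+Conceptually, this kind of detection is wildcard matching against column names. The sketch below shows the idea in plain Python; it is illustrative only and not DataHub's internal implementation.
+
+```python
+# Illustrative column-name PII detection using the wildcard patterns above.
+from fnmatch import fnmatch
+
+PII_PATTERNS = {
+    "Contact-Info": ["*email*", "*e_mail*", "*phone*", "*mobile*"],
+    "Government-ID": ["*ssn*", "*social_security*"],
+    "Payment-Data": ["*card*", "*payment*"],
+}
+
+def detect_pii(column_name: str) -> list[str]:
+    """Return the PII categories whose patterns match the column name."""
+    name = column_name.lower()
+    return [
+        category
+        for category, patterns in PII_PATTERNS.items()
+        if any(fnmatch(name, pattern) for pattern in patterns)
+    ]
+
+for column in ["customer_email", "last_login", "billing_card_number"]:
+    print(column, "->", detect_pii(column) or ["no PII detected"])
+```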
+
+#### Step 2: Review Detected PII
+
+1. **Go to the "Classification" dashboard**
+2. **Review automatically detected PII fields**
+3. **Verify accuracy** of the detection
+4. **Manually classify** any missed sensitive fields
+
+#### Step 3: Apply PII Tags
+
+For the customer dataset:
+
+1. **Open the dataset profile**
+2. **Navigate to the Schema tab**
+3. **For each PII column**, add appropriate tags:
+ - `email` column → Add "PII" and "Contact-Info" tags
+ - `phone` column → Add "PII" and "Contact-Info" tags
+ - `address` column → Add "PII" and "Location-Data" tags
+
+### Exercise 2: Set Up Classification Rules
+
+Create automated rules to classify data based on patterns:
+
+#### Create Classification Rules
+
+1. **Go to Settings** → **Classification Rules**
+2. **Create new rule**: "Financial Data Detection"
+
+ - **Pattern**: Column names containing `*amount*`, `*price*`, `*cost*`, `*revenue*`
+ - **Classification**: "Confidential"
+ - **Tags**: "Financial", "Sensitive"
+
+3. **Create new rule**: "Customer Data Detection"
+ - **Pattern**: Table names containing `*customer*`, `*user*`, `*client*`
+ - **Classification**: "Restricted"
+ - **Tags**: "Customer-Data", "High-Privacy"
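+
+Each rule boils down to a set of name patterns plus the classification and tags to apply. Here is a small sketch of how the two rules above could be represented and evaluated; it is illustrative only, since DataHub's own rule engine works against its metadata model rather than this simplified structure.
+
+```python
+# Illustrative evaluation of the classification rules defined above.
+# Simplified: patterns are applied to any asset name (table or column).
+from fnmatch import fnmatch
+
+CLASSIFICATION_RULES = [
+    {
+        "name": "Financial Data Detection",
+        "patterns": ["*amount*", "*price*", "*cost*", "*revenue*"],
+        "classification": "Confidential",
+        "tags": ["Financial", "Sensitive"],
+    },
+    {
+        "name": "Customer Data Detection",
+        "patterns": ["*customer*", "*user*", "*client*"],
+        "classification": "Restricted",
+        "tags": ["Customer-Data", "High-Privacy"],
+    },
+]
+
+def classify(asset_name: str) -> dict:
+    """Return the classification and tags from the first matching rule."""
+    name = asset_name.lower()
+    for rule in CLASSIFICATION_RULES:
+        if any(fnmatch(name, pattern) for pattern in rule["patterns"]):
+            return {"classification": rule["classification"], "tags": rule["tags"]}
+    return {"classification": "Internal", "tags": []}
+
+print(classify("fct_users_created"))      # matches the customer rule -> Restricted
+print(classify("daily_revenue_summary"))  # matches the financial rule -> Confidential
+```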
+
+#### Test Classification Rules
+
+1. **Run classification** on sample datasets
+2. **Review results** in the Classification dashboard
+3. **Adjust rules** based on accuracy
+4. **Schedule regular re-classification** to catch new data
+
+### Exercise 3: Implement Data Sensitivity Levels
+
+Apply consistent sensitivity labeling across your data landscape:
+
+#### Step 1: Define Sensitivity Framework
+
+Create a company-wide sensitivity framework:
+
+| Sensitivity Level | Data Types          | Access Controls       | Examples          |
+| ----------------- | ------------------- | --------------------- | ----------------- |
+| Restricted        | PII, PHI, Financial | Role-based, Encrypted | SSN, Credit Cards |
+| Confidential      | Business Critical   | Department-based      | Revenue, Strategy |
+| Internal          | Operational         | Employee Access       | Logs, Metrics     |
+| Public            | Marketing           | Open Access           | Press Releases    |
+
+#### Step 2: Apply Sensitivity Labels
+
+1. **Navigate to each critical dataset**
+2. **Add sensitivity tags**:
+
+ - Customer data → "Restricted"
+ - Financial reports → "Confidential"
+ - System logs → "Internal"
+ - Marketing content → "Public"
+
+3. **Document classification rationale** in dataset descriptions
+
+### Exercise 4: Set Up Compliance Monitoring
+
+Monitor classification compliance across your data landscape:
+
+#### Create Compliance Dashboard
+
+1. **Go to Analytics** → **Governance Metrics**
+2. **Create dashboard** with these metrics:
+ - Percentage of datasets classified
+ - Number of PII fields identified
+ - Compliance score by data domain
+ - Classification coverage trends
+
+#### Set Up Compliance Alerts
+
+1. **Configure alerts** for:
+ - New datasets without classification
+ - PII detected in unclassified data
+ - Changes to restricted data schemas
+ - Access to sensitive data outside business hours
+
+### Understanding Classification Impact
+
+Proper data classification enables:
+
+**Regulatory Compliance**:
+
+- Quick identification of data subject to regulations
+- Automated compliance reporting
+- Audit trail for data handling
+
+**Risk Management**:
+
+- Appropriate security controls for different data types
+- Incident response prioritization
+- Data breach impact assessment
+
+**Access Control**:
+
+- Role-based access to sensitive data
+- Automated access reviews
+- Principle of least privilege enforcement
+
+### Advanced Classification Techniques
+
+#### 1. Machine Learning-Based Classification
+
+Use DataHub's ML capabilities to:
+
+- Analyze data content patterns
+- Identify sensitive data in unstructured fields
+- Continuously improve classification accuracy
+
+#### 2. Column-Level Classification
+
+Apply granular classification:
+
+- Different sensitivity levels within the same table
+- Field-specific access controls
+- Detailed compliance mapping
+
+#### 3. Dynamic Classification
+
+Implement classification that adapts to:
+
+- Data content changes
+- Business context evolution
+- Regulatory requirement updates
+
+### Measuring Classification Success
+
+Track these key metrics:
+
+- **Classification Coverage**: Percentage of datasets classified
+- **PII Detection Accuracy**: True positives vs false positives
+- **Compliance Score**: Adherence to classification policies
+- **Time to Classify**: Speed of classifying new datasets
+- **Access Violations**: Unauthorized access to classified data
+
+### Best Practices for Data Classification
+
+#### 1. Start with High-Risk Data
+
+Prioritize classification of:
+
+- Customer PII
+- Financial information
+- Healthcare data
+- Intellectual property
+
+#### 2. Automate Where Possible
+
+Use automated detection for:
+
+- Common PII patterns
+- Standard data types
+- Regulatory data categories
+
+#### 3. Regular Review and Updates
+
+- Quarterly classification reviews
+- Updates for new data sources
+- Refinement of classification rules
+
+#### 4. Training and Awareness
+
+- Educate data teams on classification importance
+- Provide clear classification guidelines
+- Regular training on new regulations
+
+### Next Steps
+
+With data properly classified, you're ready to create a business glossary that provides consistent definitions and context for your data assets.
+
+
diff --git a/docs/learn-datahub/governance/governance-policies.md b/docs/learn-datahub/governance/governance-policies.md
new file mode 100644
index 00000000000000..0d0eb06ffb2786
--- /dev/null
+++ b/docs/learn-datahub/governance/governance-policies.md
@@ -0,0 +1,391 @@
+# Governance Policies
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+## Automating Governance at Scale
+
+**Time Required**: 11 minutes
+
+### The Policy Automation Challenge
+
+Your organization now has ownership, classification, and glossary terms in place, but governance still requires manual oversight. Without automated policies, you face:
+
+- **Inconsistent enforcement** of data standards across teams
+- **Manual reviews** that don't scale with data growth
+- **Policy violations** discovered too late to prevent impact
+- **Compliance gaps** that create regulatory risk
+
+**Real-World Scenario**: A developer accidentally deployed a new dataset containing PII without proper classification or approval, exposing sensitive customer data for 3 days before manual review caught the issue.
+
+### Understanding DataHub Policies
+
+DataHub policies automate governance enforcement through configurable rules that monitor, alert, and control data operations:
+
+**Policy Types**:
+
+- **Access Policies**: Control who can view or modify data
+- **Metadata Policies**: Enforce required metadata standards
+- **Quality Policies**: Monitor data quality and trigger alerts
+- **Approval Policies**: Require reviews for sensitive operations
+- **Compliance Policies**: Ensure regulatory requirement adherence
+
+### Exercise 1: Create Metadata Compliance Policies
+
+Ensure all datasets meet your organization's metadata standards:
+
+#### Step 1: Access Policy Management
+
+1. **Navigate to Settings** → **Policies**
+2. **Click "Create Policy"** to start building your first automated rule
+3. **Select "Metadata Policy"** as the policy type
+
+#### Step 2: Create "Required Ownership" Policy
+
+Build a policy that ensures all datasets have assigned owners:
+
+**Policy Configuration**:
+
+- **Name**: "Required Dataset Ownership"
+- **Description**: "All datasets must have at least one technical owner assigned"
+- **Scope**: All datasets in production domains
+- **Rule**: `ownership.owners.length >= 1 AND ownership.owners[].type == "TECHNICAL_OWNER"`
+- **Action**: Block dataset publication without ownership
+- **Notification**: Alert data governance team
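+
+The rule above reads as "the dataset has at least one owner, and at least one owner is a technical owner." To make that explicit, here is a small sketch of the check in plain Python, assuming the ownership aspect has already been fetched (for example through DataHub's REST or GraphQL API) into a simple dict; it is not DataHub's built-in policy engine.
+
+```python
+# Sketch of the "Required Dataset Ownership" rule as a standalone check.
+def has_required_ownership(ownership: dict) -> bool:
+    owners = ownership.get("owners", [])
+    return len(owners) >= 1 and any(
+        owner.get("type") == "TECHNICAL_OWNER" for owner in owners
+    )
+
+example = {
+    "owners": [
+        {"owner": "urn:li:corpuser:john.doe", "type": "TECHNICAL_OWNER"},
+        {"owner": "urn:li:corpuser:sarah.smith", "type": "BUSINESS_OWNER"},
+    ]
+}
+assert has_required_ownership(example)
+assert not has_required_ownership({"owners": []})
+```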
+
+#### Step 3: Create "PII Classification" Policy
+
+Ensure PII data is properly classified:
+
+**Policy Configuration**:
+
+- **Name**: "PII Data Classification Required"
+- **Description**: "Datasets containing PII must be classified as Restricted"
+- **Trigger**: When PII tags are detected
+- **Rule**: `tags.contains("PII") IMPLIES classification == "RESTRICTED"`
+- **Action**: Require data steward approval
+- **Escalation**: Auto-escalate to privacy team after 24 hours
+
+### Exercise 2: Implement Access Control Policies
+
+Control who can access sensitive data based on classification:
+
+#### Step 1: Create Role-Based Access Policy
+
+**Policy Configuration**:
+
+- **Name**: "Restricted Data Access Control"
+- **Description**: "Only authorized roles can access restricted classification data"
+- **Scope**: Datasets with "Restricted" classification
+- **Allowed Roles**:
+ - Data Stewards
+ - Privacy Team
+ - Designated Business Owners
+- **Action**: Block unauthorized access attempts
+- **Logging**: Log all access attempts for audit
+
+#### Step 2: Set Up Time-Based Access
+
+For highly sensitive data, implement time-based restrictions:
+
+**Policy Configuration**:
+
+- **Name**: "After-Hours Restricted Access"
+- **Description**: "Restricted data access limited to business hours"
+- **Schedule**: Monday-Friday, 8 AM - 6 PM local time
+- **Exceptions**: Emergency access with manager approval
+- **Override**: Security team can grant temporary access
+
+### Exercise 3: Create Data Quality Policies
+
+Automatically monitor and enforce data quality standards:
+
+#### Step 1: Schema Change Policy
+
+Prevent breaking changes to critical datasets:
+
+**Policy Configuration**:
+
+- **Name**: "Critical Dataset Schema Protection"
+- **Description**: "Schema changes to critical datasets require approval"
+- **Scope**: Datasets tagged as "Critical" or "Production"
+- **Monitored Changes**:
+ - Column deletions
+ - Data type changes
+ - Primary key modifications
+- **Approval Required**: Technical owner + business owner
+- **Notification**: Alert downstream consumers of pending changes
+
+#### Step 2: Data Freshness Policy
+
+Ensure data meets freshness requirements:
+
+**Policy Configuration**:
+
+- **Name**: "Data Freshness SLA"
+- **Description**: "Critical datasets must be updated within SLA windows"
+- **SLA Definitions**:
+ - Customer data: 4 hours
+ - Financial data: 1 hour
+ - Analytics data: 24 hours
+- **Action**: Alert owners when SLA is breached
+- **Escalation**: Page on-call engineer for critical breaches
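+
+The SLA check itself is simple arithmetic over the time since the last successful update. Here is an illustrative sketch using the SLA windows above; in practice the alerting would be driven by DataHub assertions or your scheduler rather than a standalone script.
+
+```python
+# Illustrative data-freshness check based on the SLA definitions above.
+from datetime import datetime, timedelta, timezone
+
+SLA_HOURS = {"customer": 4, "financial": 1, "analytics": 24}
+
+def breached_slas(last_updated: dict[str, datetime]) -> list[str]:
+    """Return the domains whose data is older than their SLA window."""
+    now = datetime.now(timezone.utc)
+    return [
+        domain
+        for domain, updated_at in last_updated.items()
+        if now - updated_at > timedelta(hours=SLA_HOURS[domain])
+    ]
+
+last_updated = {
+    "customer": datetime.now(timezone.utc) - timedelta(hours=6),      # breaches the 4h SLA
+    "financial": datetime.now(timezone.utc) - timedelta(minutes=30),  # within the 1h SLA
+    "analytics": datetime.now(timezone.utc) - timedelta(hours=12),    # within the 24h SLA
+}
+print("SLA breaches:", breached_slas(last_updated))  # SLA breaches: ['customer']
+```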
+
+### Exercise 4: Implement Compliance Automation
+
+Automate compliance with regulatory requirements:
+
+#### Step 1: GDPR Compliance Policy
+
+Ensure GDPR compliance for EU customer data:
+
+**Policy Configuration**:
+
+- **Name**: "GDPR Data Processing Compliance"
+- **Description**: "EU customer data must meet GDPR requirements"
+- **Scope**: Datasets containing EU customer PII
+- **Requirements**:
+ - Legal basis documented
+ - Data retention period defined
+ - Data processing purpose specified
+ - Privacy impact assessment completed
+- **Monitoring**: Track data subject requests and processing activities
+
+#### Step 2: SOX Compliance Policy
+
+Ensure financial data meets SOX requirements:
+
+**Policy Configuration**:
+
+- **Name**: "SOX Financial Data Controls"
+- **Description**: "Financial datasets must have SOX-compliant controls"
+- **Requirements**:
+ - Segregation of duties in data access
+ - Change management approval workflows
+ - Audit trail for all modifications
+ - Regular access reviews
+- **Reporting**: Generate SOX compliance reports quarterly
+
+### Exercise 5: Set Up Policy Monitoring and Alerting
+
+Create comprehensive monitoring for policy compliance:
+
+#### Step 1: Policy Dashboard
+
+1. **Create governance dashboard** with key metrics:
+
+ - Policy compliance percentage
+ - Active policy violations
+ - Resolution time trends
+ - Compliance by data domain
+
+2. **Set up real-time monitoring**:
+ - Policy violation alerts
+ - Compliance trend analysis
+ - Exception tracking and reporting
+
+#### Step 2: Automated Remediation
+
+Configure automatic responses to policy violations:
+
+**Immediate Actions**:
+
+- Block non-compliant operations
+- Quarantine problematic datasets
+- Revoke inappropriate access
+- Generate incident tickets
+
+**Escalation Procedures**:
+
+- Notify data owners within 15 minutes
+- Escalate to data governance team after 1 hour
+- Executive notification for critical violations
+- Automatic compliance reporting
+
+### Understanding Policy Impact
+
+Automated governance policies provide:
+
+**Consistent Enforcement**:
+
+- Policies applied uniformly across all data
+- No manual oversight gaps
+- 24/7 monitoring and enforcement
+
+**Proactive Risk Management**:
+
+- Issues caught before they impact business
+- Automatic remediation of common problems
+- Reduced compliance risk
+
+**Scalable Governance**:
+
+- Governance that grows with your data
+- Reduced manual effort for routine checks
+- Focus governance team on strategic initiatives
+
+### Advanced Policy Features
+
+#### 1. Machine Learning-Enhanced Policies
+
+Use ML to improve policy effectiveness:
+
+- **Anomaly Detection**: Identify unusual data access patterns
+- **Risk Scoring**: Automatically assess compliance risk
+- **Predictive Alerts**: Warn of potential policy violations
+
+#### 2. Policy Templates
+
+Create reusable policy templates for:
+
+- Industry-specific compliance (HIPAA, PCI-DSS)
+- Common governance patterns
+- Organizational standards
+
+#### 3. Policy Testing and Simulation
+
+Before deploying policies:
+
+- **Test policies** against historical data
+- **Simulate impact** of new policy rules
+- **Gradual rollout** with monitoring
+
+### Measuring Policy Success
+
+Track these key metrics:
+
+- **Policy Compliance Rate**: Percentage of data assets meeting policies
+- **Violation Resolution Time**: Speed of addressing policy violations
+- **False Positive Rate**: Accuracy of policy detection
+- **Coverage**: Percentage of data covered by policies
+- **Business Impact**: Reduction in compliance incidents
+
+### Best Practices for Governance Policies
+
+#### 1. Start Simple and Iterate
+
+- Begin with high-impact, low-complexity policies
+- Gather feedback and refine rules
+- Gradually add more sophisticated policies
+
+#### 2. Balance Automation and Human Oversight
+
+- Automate routine compliance checks
+- Require human approval for complex decisions
+- Provide override mechanisms for exceptions
+
+#### 3. Ensure Policy Transparency
+
+- Document policy rationale and business impact
+- Provide clear guidance for compliance
+- Regular communication about policy changes
+
+#### 4. Regular Policy Review
+
+- Quarterly review of policy effectiveness
+- Update policies based on business changes
+- Archive obsolete or redundant policies
+
+### Governance Maturity Assessment
+
+Evaluate your organization's governance maturity:
+
+
+
+### Congratulations!
+
+You've successfully implemented a comprehensive data governance framework using DataHub. Your organization now has:
+
+- **Clear Ownership**: Accountability for every data asset
+- **Proper Classification**: Risk-appropriate protection for sensitive data
+- **Consistent Language**: Standardized business terminology
+- **Automated Policies**: Scalable governance enforcement
+
+### Next Steps in Your Governance Journey
+
+1. **Expand Coverage**: Apply governance to additional data domains
+2. **Advanced Analytics**: Implement governance metrics and reporting
+3. **Integration**: Connect governance to your broader data platform
+4. **Culture**: Build a data-driven governance culture across teams
+
+Your data governance foundation is now ready to support your organization's growth and ensure compliance at scale.
+
+## Continue Learning
+
+Ready to explore more DataHub capabilities? Check out these related tutorials:
+
+- [Data Quality & Monitoring](../quality/overview.md) - Ensure data reliability
+
+
diff --git a/docs/learn-datahub/governance/overview.md b/docs/learn-datahub/governance/overview.md
new file mode 100644
index 00000000000000..00881474df14c3
--- /dev/null
+++ b/docs/learn-datahub/governance/overview.md
@@ -0,0 +1,188 @@
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Data Governance Fundamentals
+
+
+
+## Professional Data Governance Journey
+
+**Time Required**: 50 minutes | **Skill Level**: Intermediate
+
+### Your Challenge: Establishing Data Governance at Scale
+
+You're a **Data Governance Lead** at a growing technology company. Your organization has hundreds of datasets across multiple platforms, but lacks consistent ownership, classification, and business context. Leadership wants to implement proper data governance to ensure compliance, reduce risk, and improve data quality.
+
+**The Business Impact**: Without proper governance, your company faces:
+
+- **Compliance Risks**: Inability to track PII and sensitive data
+- **Data Quality Issues**: No clear ownership for data problems
+- **Business Confusion**: Teams can't understand what data means
+- **Operational Inefficiency**: Time wasted searching for the right data
+
+### What You'll Learn
+
+This tutorial series walks you through implementing comprehensive data governance using DataHub's governance features:
+
+#### Chapter 1: Ownership Management (12 minutes)
+
+**Business Challenge**: No clear accountability for data quality and maintenance
+**Your Journey**:
+
+- Assign technical and business owners to critical datasets
+- Set up ownership notifications and responsibilities
+- Create ownership hierarchies for different data domains
+
+**Organizational Outcome**: Clear accountability and faster issue resolution
+
+#### Chapter 2: Data Classification (15 minutes)
+
+**Business Challenge**: Sensitive data scattered across systems without proper labeling
+**Your Journey**:
+
+- Implement PII detection and classification
+- Apply sensitivity labels (Public, Internal, Confidential, Restricted)
+- Set up automated classification rules
+
+**Organizational Outcome**: Compliance readiness and risk reduction
+
+#### Chapter 3: Business Glossary (12 minutes)
+
+**Business Challenge**: Business terms used inconsistently across teams and systems
+**Your Journey**:
+
+- Create standardized business definitions
+- Link glossary terms to datasets and columns
+- Establish term hierarchies and relationships
+
+**Organizational Outcome**: Consistent business language and improved data understanding
+
+#### Chapter 4: Governance Policies (11 minutes)
+
+**Business Challenge**: Manual governance processes that don't scale
+**Your Journey**:
+
+- Set up automated governance policies
+- Configure approval workflows for sensitive data
+- Implement data access controls and monitoring
+
+**Organizational Outcome**: Scalable governance that grows with your organization
+
+### DataHub Governance in Action
+
+See how proper governance transforms your data assets from unmanaged to enterprise-ready:
+
+
+
+
+
+
+
+**Governance Benefits Demonstrated**:
+
+- **Clear Ownership**: Every dataset has assigned business and technical owners
+- **Proper Classification**: Tags indicate sensitivity levels and compliance requirements
+- **Business Context**: Glossary terms provide standardized definitions
+- **Quality Assurance**: Health indicators show data reliability
+
+### Governance in Practice: End-to-End Data Flow
+
+See how governance controls are applied throughout a complete data pipeline:
+
+
+
+**Governance Flow Analysis**:
+
+- **Source Control**: Raw data properly classified as PII/Restricted with clear ownership
+- **Processing Governance**: Validation jobs ensure quality and compliance during transformation
+- **Output Classification**: Analytics data appropriately tagged and documented for business use
+- **Access Control**: Executive dashboards have appropriate sensitivity levels for broad access
+
+### Interactive Learning Experience
+
+Each chapter includes:
+
+- **Real Governance Scenarios**: Based on actual enterprise challenges
+- **Hands-on Exercises**: Using DataHub's sample data and governance features
+- **Best Practice Guidance**: Industry-standard approaches to data governance
+- **Measurable Outcomes**: Clear success metrics for each governance initiative
+
+### Prerequisites
+
+- Completed [DataHub Quickstart](../quickstart/overview.md)
+- Basic understanding of data management concepts
+- Access to DataHub instance with sample data
+
+### Ready to Begin?
+
+Start your data governance journey by establishing clear ownership and accountability for your organization's data assets.
+
+
diff --git a/docs/learn-datahub/governance/ownership-management.md b/docs/learn-datahub/governance/ownership-management.md
new file mode 100644
index 00000000000000..d7b2d5a4762198
--- /dev/null
+++ b/docs/learn-datahub/governance/ownership-management.md
@@ -0,0 +1,191 @@
+# Ownership Management
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+## Establishing Clear Data Ownership
+
+**Time Required**: 12 minutes
+
+### The Ownership Challenge
+
+Your organization has critical datasets like customer information, financial transactions, and product analytics, but when data quality issues arise, nobody knows who to contact. Teams waste hours trying to find the right person to fix problems or answer questions about data.
+
+**Real-World Impact**: A recent customer complaint about incorrect billing took 3 days to resolve because the team couldn't identify who owned the billing data pipeline.
+
+### Understanding DataHub Ownership Types
+
+DataHub supports multiple ownership types to reflect real organizational structures:
+
+
+
+
+
+**Ownership Types Explained**:
+
+- **Technical Owner**: Responsible for data pipeline maintenance, schema changes, and technical issues
+- **Business Owner**: Accountable for data accuracy, business rules, and stakeholder communication
+- **Data Steward**: Ensures data quality, compliance, and governance standards
+- **Data Owner**: Ultimate accountability for data asset (often a senior business leader)
+
+### Exercise 1: Assign Dataset Owners
+
+Let's establish ownership for your organization's key datasets:
+
+#### Step 1: Navigate to Dataset Ownership
+
+1. **Open DataHub** and search for "fct_users_created"
+2. **Click on the dataset** to open its profile page
+3. **Go to the "Properties" tab** and find the "Ownership" section
+4. **Click "Add Owners"** to begin assignment
+
+#### Step 2: Add Technical Owner
+
+1. **Select "Technical Owner"** from the ownership type dropdown
+2. **Enter the email**: `john.doe@company.com`
+3. **Add justification**: "Maintains the user analytics ETL pipeline"
+4. **Click "Add"** to save
+
+#### Step 3: Add Business Owner
+
+1. **Click "Add Owners"** again
+2. **Select "Business Owner"**
+3. **Enter the email**: `sarah.smith@company.com`
+4. **Add justification**: "Accountable for user metrics accuracy and business requirements"
+5. **Click "Add"** to save
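+
+If you need to assign owners in bulk, the same assignments can be made programmatically. This is a minimal sketch using the `acryl-datahub` Python SDK against a local metadata service at `http://localhost:8080`; the platform, dataset name, and user accounts are examples from this tutorial, and class names may differ slightly between SDK versions.
+
+```python
+# Sketch: assign technical and business owners to a dataset programmatically
+# (assumes the acryl-datahub Python SDK; adjust the URN details to your instance).
+from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import OwnerClass, OwnershipClass, OwnershipTypeClass
+
+dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD")
+
+ownership = OwnershipClass(
+    owners=[
+        OwnerClass(owner=make_user_urn("john.doe"), type=OwnershipTypeClass.TECHNICAL_OWNER),
+        OwnerClass(owner=make_user_urn("sarah.smith"), type=OwnershipTypeClass.BUSINESS_OWNER),
+    ]
+)
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+emitter.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=ownership))
+```
+
+Note that emitting the ownership aspect this way replaces the existing owner list, so fetch and merge current owners first if the dataset already has some assigned.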
+
+### Exercise 2: Set Up Ownership Notifications
+
+Configure automatic notifications so owners are alerted to important events:
+
+#### Configure Owner Notifications
+
+1. **Go to Settings** → **Notifications**
+2. **Enable "Dataset Quality Alerts"** for Technical Owners
+3. **Enable "Schema Change Notifications"** for Business Owners
+4. **Set up "Data Incident Alerts"** for Data Stewards
+
+**What This Achieves**: When data quality issues occur, the right people are automatically notified based on their ownership role.
+
+### Exercise 3: Create Ownership Hierarchies
+
+For large organizations, establish ownership hierarchies by domain:
+
+#### Domain-Based Ownership Structure
+
+**Customer Domain:**
+
+- **Technical Owners**: Data Engineering Team (infrastructure, pipelines, technical maintenance)
+- **Business Owners**: Customer Success Team (business requirements, use cases)
+- **Data Stewards**: Customer Data Governance (quality, compliance, documentation)
+- **Data Owner**: VP Customer Experience (strategic decisions, access approvals)
+
+**Financial Domain:**
+
+- **Technical Owners**: Financial Systems Team (ERP integration, data processing)
+- **Business Owners**: Finance Team (reporting requirements, business rules)
+- **Data Stewards**: Financial Data Governance (regulatory compliance, audit trails)
+- **Data Owner**: CFO (strategic oversight, regulatory accountability)
+
+#### Implement Domain Ownership
+
+1. **Navigate to "Domains"** in DataHub
+2. **Create "Customer Domain"** if it doesn't exist
+3. **Add datasets** to the appropriate domain
+4. **Assign domain-level owners** who oversee all datasets in that domain
+
+### Understanding Ownership Impact
+
+With proper ownership in place, your organization gains:
+
+**Faster Issue Resolution**:
+
+- Data quality problems get routed to the right technical owner
+- Business questions go directly to the business owner
+- Average resolution time drops from days to hours
+
+**Clear Accountability**:
+
+- Each dataset has designated responsible parties
+- Ownership information is visible to all data consumers
+- No more "orphaned" datasets without clear ownership
+
+**Improved Data Quality**:
+
+- Owners receive proactive alerts about their data
+- Regular ownership reviews ensure assignments stay current
+- Quality metrics are tied to specific owners
+
+### Best Practices for Ownership Management
+
+#### 1. Start with Critical Datasets
+
+Focus on your most important data assets first:
+
+- Customer data
+- Financial transactions
+- Product analytics
+- Regulatory reporting data
+
+#### 2. Use Multiple Ownership Types
+
+Don't rely on just one owner per dataset:
+
+- Technical owners for operational issues
+- Business owners for accuracy and requirements
+- Data stewards for governance and compliance
+
+#### 3. Regular Ownership Reviews
+
+Set up quarterly reviews to:
+
+- Verify ownership assignments are current
+- Update owners when people change roles
+- Add ownership to newly discovered datasets
+
+#### 4. Document Ownership Responsibilities
+
+Create clear expectations for each ownership type:
+
+- Response time commitments
+- Escalation procedures
+- Quality standards
+
+### Measuring Ownership Success
+
+Track these metrics to measure the impact of your ownership program:
+
+- **Mean Time to Resolution (MTTR)** for data issues
+- **Percentage of datasets with assigned owners**
+- **Owner response rate** to data quality alerts
+- **User satisfaction** with data issue resolution
+
+### Next Steps
+
+Now that you've established clear ownership, you're ready to implement data classification to identify and protect sensitive information.
+
+
diff --git a/docs/learn-datahub/ingestion/overview.md b/docs/learn-datahub/ingestion/overview.md
new file mode 100644
index 00000000000000..4fde6800e34961
--- /dev/null
+++ b/docs/learn-datahub/ingestion/overview.md
@@ -0,0 +1,253 @@
+# Data Ingestion Mastery
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+## Professional Data Integration at Scale
+
+**Time Required**: 60 minutes | **Skill Level**: Advanced
+
+### Your Challenge: Scaling Metadata Management
+
+You're a **Senior Data Engineer** at a rapidly growing organization. Your data landscape includes 50+ data sources across cloud and on-premises systems, with new sources added weekly. Your current metadata management approach is becoming unsustainable:
+
+- **Manual documentation** that's always outdated
+- **Inconsistent metadata** across different systems
+- **No automated discovery** of schema changes or new datasets
+- **Limited visibility** into data lineage and dependencies
+
+**The Business Impact**: Your data team spends 30% of their time answering "where is this data?" questions, and a recent compliance audit revealed significant gaps in data documentation, putting the organization at regulatory risk.
+
+### What You'll Learn
+
+This tutorial series teaches you to implement enterprise-grade metadata ingestion using DataHub's advanced capabilities:
+
+#### Chapter 1: Recipe Fundamentals (15 minutes)
+
+**Business Challenge**: Inconsistent and manual metadata collection across diverse data sources
+**Your Journey**:
+
+- Master DataHub recipe configuration for different source types
+- Implement authentication and connection management
+- Configure metadata extraction filters and transformations
+
+**Organizational Outcome**: Standardized, automated metadata collection across all data sources
+
+#### Chapter 2: Stateful Ingestion (15 minutes)
+
+**Business Challenge**: Full re-ingestion causing performance issues and unnecessary processing
+**Your Journey**:
+
+- Implement incremental metadata updates
+- Configure change detection and delta processing
+- Optimize ingestion performance for large-scale environments
+
+**Organizational Outcome**: Efficient metadata updates that scale with organizational growth
+
+#### Chapter 3: Data Profiling (15 minutes)
+
+**Business Challenge**: Limited understanding of actual data content and quality patterns
+**Your Journey**:
+
+- Enable automated data profiling and statistics collection
+- Configure custom profiling rules for business-specific metrics
+- Implement profiling for different data types and sources
+
+**Organizational Outcome**: Deep insights into data content, quality, and usage patterns
+
+#### Chapter 4: Advanced Patterns (15 minutes)
+
+**Business Challenge**: Complex enterprise requirements that basic ingestion can't handle
+**Your Journey**:
+
+- Implement custom transformers and processors
+- Configure advanced lineage extraction
+- Set up multi-environment metadata management
+
+**Organizational Outcome**: Sophisticated metadata management that handles enterprise complexity
+
+### Interactive Learning Experience
+
+Each chapter includes:
+
+- **Real Enterprise Scenarios**: Based on actual large-scale metadata challenges
+- **Hands-on Configuration**: Working with DataHub's ingestion framework
+- **Performance Optimization**: Techniques for production-scale deployments
+- **Troubleshooting Guidance**: Common issues and resolution strategies
+
+### Understanding Ingestion Architecture
+
+DataHub's ingestion framework provides enterprise-grade capabilities:
+
+
+
+
+
+
+
+**Key Ingestion Capabilities**:
+
+- **Universal Connectors**: 50+ pre-built connectors for popular data systems
+- **High Performance**: Optimized for large-scale enterprise environments
+- **Incremental Updates**: Stateful ingestion for efficient metadata synchronization
+- **Automated Profiling**: Deep data content analysis and quality metrics
+- **Flexible Configuration**: Customizable extraction, transformation, and loading
+
+### Ingestion Framework Components
+
+**Core Components**:
+
+- **Sources**: Connectors for different data systems (Snowflake, BigQuery, Kafka, etc.)
+- **Recipes**: Configuration files that define ingestion behavior
+- **Transformers**: Processors that modify metadata during ingestion
+- **Sinks**: Destinations for processed metadata (typically DataHub)
+- **State Management**: Tracking of ingestion progress and changes
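+
+To make the recipe concept concrete, here is a minimal sketch that runs a recipe from Python through the SDK's pipeline runner instead of the `datahub ingest` CLI. It assumes the `acryl-datahub` package with the Postgres plugin installed; the connection details are placeholders, and configuration keys can vary by connector and version.
+
+```python
+# Sketch: run an ingestion recipe programmatically (assumptions noted above).
+import os
+
+from datahub.ingestion.run.pipeline import Pipeline
+
+recipe = {
+    "source": {
+        "type": "postgres",
+        "config": {
+            "host_port": "localhost:5432",
+            "database": "analytics",
+            "username": "datahub_reader",
+            "password": os.environ["POSTGRES_PASSWORD"],
+        },
+    },
+    "sink": {
+        "type": "datahub-rest",
+        "config": {"server": "http://localhost:8080"},
+    },
+}
+
+pipeline = Pipeline.create(recipe)
+pipeline.run()
+pipeline.raise_from_status()  # fail loudly if the run reported errors
+```
+
+The same dictionary, written as YAML, is what you would pass to the CLI with `datahub ingest -c recipe.yml`.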
+
+**Enterprise Features**:
+
+- **Authentication Management**: Secure credential handling and rotation
+- **Error Handling**: Robust failure recovery and retry mechanisms
+- **Monitoring**: Comprehensive ingestion observability and alerting
+- **Scheduling**: Flexible timing and dependency management
+- **Scaling**: Distributed processing for large environments
+
+### Prerequisites
+
+- Completed [DataHub Quickstart](../quickstart/overview.md)
+- Understanding of data architecture and metadata concepts
+- Access to DataHub CLI and sample data sources
+- Familiarity with YAML configuration and command-line tools
+- Basic knowledge of data systems (databases, streaming platforms, etc.)
+
+### Ingestion Maturity Levels
+
+
+
+### Common Ingestion Challenges
+
+**Technical Challenges**:
+
+- **Scale**: Processing metadata from hundreds of data sources
+- **Performance**: Minimizing ingestion time and resource usage
+- **Reliability**: Handling network issues, authentication failures, and source changes
+- **Complexity**: Managing diverse source types with different metadata models
+
+**Organizational Challenges**:
+
+- **Governance**: Ensuring consistent metadata standards across teams
+- **Security**: Managing credentials and access controls securely
+- **Change Management**: Adapting to evolving data infrastructure
+- **Cost Optimization**: Balancing metadata completeness with resource costs
+
+### Success Metrics
+
+**Technical Metrics**:
+
+- **Ingestion Coverage**: Percentage of data sources with automated metadata collection
+- **Ingestion Performance**: Time and resources required for metadata updates
+- **Data Freshness**: Lag between source changes and metadata updates
+- **Error Rate**: Percentage of successful vs. failed ingestion runs
+
+**Business Metrics**:
+
+- **Time to Discovery**: Speed of finding relevant data assets
+- **Metadata Completeness**: Percentage of assets with comprehensive metadata
+- **User Adoption**: Active usage of metadata for data discovery and governance
+- **Compliance Readiness**: Ability to respond to audit and regulatory requirements
+
+### Ready to Begin?
+
+Start your ingestion mastery journey by learning the fundamentals of DataHub recipes and how to configure them for different data sources.
+
+
diff --git a/docs/learn-datahub/lineage/impact-analysis.md b/docs/learn-datahub/lineage/impact-analysis.md
new file mode 100644
index 00000000000000..59897a72022dd6
--- /dev/null
+++ b/docs/learn-datahub/lineage/impact-analysis.md
@@ -0,0 +1,638 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode';
+
+# Performing Impact Analysis (15 minutes)
+
+
+
+**The Critical Decision**: The enterprise analytics migration is approved, but now you need to answer the CEO's question: _"What exactly will be affected, and how do we minimize business risk?"_ This is where impact analysis transforms from guesswork into science.
+
+**Your Mission**: Learn to perform systematic impact analysis that quantifies risk, prioritizes changes, and creates bulletproof migration plans.
+
+## What You'll Master
+
+By the end of this step, you'll be able to:
+
+- **Quantify downstream impact** with business metrics and risk scores
+- **Create stakeholder reports** that clearly communicate change effects
+- **Develop rollback strategies** based on lineage dependencies
+- **Coordinate cross-team changes** using lineage insights
+
+## The Impact Analysis Framework
+
+Professional impact analysis follows a systematic 5-step process:
+
+**Impact Analysis Process:**
+
+1. **Scope Definition** → Define what's changing and why
+2. **Downstream Mapping** → Identify all affected systems and stakeholders
+3. **Risk Assessment** → Quantify business impact and technical risks
+4. **Stakeholder Analysis** → Understand who needs to be involved
+5. **Mitigation Planning** → Develop rollback and contingency strategies
+
+## Step 1: Scope Definition
+
+Before analyzing impact, clearly define what's changing:
+
+
+
+**Change Scope Template**:
+
+```
+System/Dataset: ________________________
+Change Type: ___________________________
+Timeline: ______________________________
+Technical Details: _____________________
+Business Justification: ________________
+```
+
+**Common Change Types**:
+
+- **System Migration**: Moving from one platform to another
+- **Schema Changes**: Adding, removing, or modifying columns
+- **Performance Optimization**: Changing processing logic or infrastructure
+- **Security Updates**: Access control or data classification changes
+- **Deprecation**: Retiring old systems or datasets
+
+### Impact Analysis in Action
+
+Here's a real-world example showing how changes cascade through your data ecosystem:
+
+
+
+**Impact Assessment**: This migration affects 15+ downstream systems, including production ML models serving 1M+ customers daily. The health indicators show critical dependencies that require careful coordination.
+
+
+
+### TechFlow Migration Example
+
+Let's apply this to our scenario:
+
+
+
+
+- **System/Dataset**: `customer_analytics_pipeline` (Hive tables)
+- **Change Type**: Platform migration (Hive → Snowflake)
+- **Timeline**: 48-hour maintenance window, next weekend
+
+**Technical Details**:
+
+- Migrate 5 core tables: `customers`, `orders`, `customer_metrics`, `daily_summaries`, `customer_segments`
+- Preserve all existing schemas and data
+- Update connection strings in downstream systems
+
+**Business Justification**:
+
+- 10x performance improvement for customer analytics
+- $50K/month cost savings
+- Enable real-time customer insights
+
+
+
+
+**High-Risk Elements**:
+
+- **Customer-facing dashboards**: Sales team uses these daily
+- **Automated reports**: CEO gets weekly customer metrics
+- **ML pipelines**: Customer segmentation models depend on this data
+- **API endpoints**: Mobile app queries customer data directly
+
+**Timing Risks**:
+
+- **Weekend migration**: Limited support staff available
+- **Monday morning**: Sales team needs dashboards for weekly planning
+- **Month-end**: Customer reporting deadline approaching
+
+**Technical Risks**:
+
+- **Data format differences**: Snowflake vs. Hive SQL variations
+- **Performance changes**: Query patterns may need optimization
+- **Connection failures**: Downstream systems need configuration updates
+
+
+
+
+## Step 2: Downstream Mapping
+
+Use DataHub's lineage to systematically map all affected systems:
+
+### The Downstream Discovery Method
+
+**Starting Point**: Your changing dataset (`customer_analytics_pipeline`)
+
+**Discovery Process**:
+
+1. **Open the dataset** in DataHub
+2. **Navigate to Lineage tab**
+3. **Switch to downstream view** (right side of lineage graph)
+4. **Document each downstream connection**:
+
+
+
+**Downstream Impact Template**:
+
+| System | Type | Business Impact | Technical Owner | Update Required |
+| ------------------ | ------------ | --------------------------- | ---------------- | ------------------- |
+| Customer Dashboard | BI Tool | High - Sales team daily use | @sarah-analytics | Connection string |
+| Weekly Reports | Automated | High - CEO visibility | @john-reporting | SQL query updates |
+| ML Pipeline | Data Science | Medium - Model retraining | @alex-ml | Data source config |
+| Mobile API | Application | High - Customer app | @dev-team | Database connection |
+| Data Warehouse | Storage | Low - Archive only | @data-ops | Monitoring updates |
+
+
+
+### Interactive Exercise: Downstream Mapping
+
+
+
+**Your Challenge**: Map downstream impact for TechFlow's user analytics
+
+1. Open `fct_users_created` in your DataHub instance
+2. Navigate to the Lineage tab
+3. Identify all downstream connections
+4. Fill out the impact template:
+
+```
+Downstream System 1: ___________________
+Business Impact: _______________________
+Technical Owner: _______________________
+Update Required: _______________________
+
+Downstream System 2: ___________________
+Business Impact: _______________________
+Technical Owner: _______________________
+Update Required: _______________________
+```
+
+**Success Criteria**: You've identified at least 3 downstream systems and assessed their business impact.
+
+
+
+## Step 3: Risk Assessment
+
+Transform your downstream map into quantified risk scores:
+
+### The Risk Scoring Matrix
+
+
+
+
+**Impact Scale (1-5)**:
+
+- **5 - Critical**: Customer-facing, revenue-impacting, or regulatory
+- **4 - High**: Executive reporting, key business processes
+- **3 - Medium**: Team productivity, internal analytics
+- **2 - Low**: Development tools, experimental systems
+- **1 - Minimal**: Archive, backup, or deprecated systems
+
+**Business Impact Factors**:
+
+- **User Count**: How many people depend on this system?
+- **Revenue Impact**: Does this directly affect sales or billing?
+- **Compliance**: Are there regulatory or audit requirements?
+- **Operational Criticality**: Is this needed for daily operations?
+
+
+
+
+**Complexity Scale (1-5)**:
+
+- **5 - Very Complex**: Custom code, multiple integrations, legacy systems
+- **4 - Complex**: Requires specialized knowledge, multiple teams
+- **3 - Moderate**: Standard configurations, documented processes
+- **2 - Simple**: Well-understood, single team ownership
+- **1 - Trivial**: Automated, self-service, or minimal changes
+
+**Technical Factors**:
+
+- **Integration Complexity**: How many systems need updates?
+- **Code Changes**: Are application changes required?
+- **Testing Requirements**: How extensive is validation needed?
+- **Rollback Difficulty**: How easy is it to undo changes?
+
+
+
+
+**Risk Score Formula**:
+
+```
+Risk Score = Business Impact × Technical Complexity × Urgency Factor
+```
+
+**Urgency Factors**:
+
+- **1.5x**: Tight deadline (< 1 week)
+- **1.2x**: Normal timeline (1-4 weeks)
+- **1.0x**: Flexible timeline (> 1 month)
+
+**Risk Categories**:
+
+- **20-25**: **Critical Risk** - Executive approval required
+- **15-19**: **High Risk** - Detailed mitigation plan needed
+- **10-14**: **Medium Risk** - Standard change process
+- **5-9**: **Low Risk** - Routine change management
+- **1-4**: **Minimal Risk** - Proceed with standard testing
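+
+A small sketch that turns the formula and thresholds above into code, so scores and categories can be computed consistently across systems:
+
+```python
+# Risk scoring sketch implementing the formula and categories above.
+URGENCY = {"tight": 1.5, "normal": 1.2, "flexible": 1.0}
+
+def risk_score(business_impact: int, technical_complexity: int, urgency: str) -> float:
+    return business_impact * technical_complexity * URGENCY[urgency]
+
+def risk_category(score: float) -> str:
+    if score >= 20:
+        return "Critical Risk"
+    if score >= 15:
+        return "High Risk"
+    if score >= 10:
+        return "Medium Risk"
+    if score >= 5:
+        return "Low Risk"
+    return "Minimal Risk"
+
+# Sales Dashboard from the TechFlow migration: impact 5, complexity 3, tight deadline
+score = risk_score(5, 3, "tight")
+print(score, risk_category(score))  # 22.5 Critical Risk
+```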
+
+
+
+
+### Risk Assessment Exercise
+
+
+
+**TechFlow Customer Analytics Migration Risk Assessment**:
+
+| Downstream System | Business Impact | Technical Complexity | Risk Score | Category |
+| ----------------- | --------------- | -------------------- | ---------- | ---------- |
+| Sales Dashboard | 5 (Critical) | 3 (Moderate) | 22.5 | Critical |
+| CEO Reports | 4 (High) | 2 (Simple) | 12 | Medium |
+| ML Pipeline | 3 (Medium) | 4 (Complex) | 18 | High |
+| Mobile API | 5 (Critical) | 3 (Moderate) | 22.5 | Critical |
+| Archive System | 1 (Minimal) | 1 (Trivial) | 1.5 | Minimal |
+
+**Analysis**: 2 Critical Risk systems require executive approval and detailed rollback plans.
+
+
+
+## Step 4: Stakeholder Analysis
+
+Identify who needs to be involved in the change:
+
+### Stakeholder Mapping Framework
+
+
+
+**Stakeholder Categories**:
+
+**Primary Stakeholders** (Directly affected):
+
+- **Data Consumers**: Teams using the affected data
+- **System Owners**: Technical teams responsible for downstream systems
+- **Business Users**: People whose work depends on the data
+
+**Secondary Stakeholders** (Coordination required):
+
+- **Infrastructure Teams**: Platform and DevOps support
+- **Security Teams**: Access control and compliance
+- **Project Management**: Timeline and resource coordination
+
+**Communication Stakeholders** (Keep informed):
+
+- **Executive Leadership**: High-level impact awareness
+- **Customer Support**: Potential user impact preparation
+- **Documentation Teams**: Update procedures and guides
+
+
+
+### Communication Strategy
+
+
+
+
+**Technical Impact Report Template**:
+
+```markdown
+## System Change Impact: Customer Analytics Migration
+
+### Technical Changes Required
+
+- **Connection Updates**: Update database connection strings
+- **Query Modifications**: Adapt SQL for Snowflake syntax
+- **Testing Requirements**: Validate data accuracy and performance
+- **Rollback Plan**: Revert connection strings if issues occur
+
+### Timeline
+
+- **Preparation**: This week - update configurations
+- **Migration**: Weekend - 48-hour window
+- **Validation**: Monday morning - verify all systems
+
+### Support Contacts
+
+- **Migration Lead**: @data-engineering-team
+- **Emergency Contact**: @on-call-engineer
+```
+
+
+
+
+**Business Impact Summary Template**:
+
+```markdown
+## Customer Analytics Platform Upgrade
+
+### What's Changing
+
+We're upgrading our customer analytics platform to improve performance and reduce costs.
+
+### Business Benefits
+
+- **10x faster** customer reports and dashboards
+- **$50K monthly savings** in infrastructure costs
+- **Real-time insights** for better customer service
+
+### What You Need to Know
+
+- **When**: Next weekend (48-hour maintenance window)
+- **Impact**: Brief downtime Saturday evening, normal service by Monday
+- **Your Action**: No action required - all systems will work as before
+
+### Questions?
+
+Contact: @data-team or @project-manager
+```
+
+
+
+
+**Executive Impact Brief Template**:
+
+```markdown
+## Executive Brief: Customer Analytics Migration
+
+### Strategic Impact
+
+- **Business Value**: $600K annual savings + 10x performance improvement
+- **Risk Assessment**: 2 critical systems identified, mitigation plans in place
+- **Timeline**: 48-hour weekend migration, normal operations by Monday
+
+### Risk Mitigation
+
+- **Rollback Plan**: 4-hour recovery time if issues occur
+- **Testing Strategy**: Comprehensive validation before go-live
+- **Support Coverage**: 24/7 engineering support during migration
+
+### Success Metrics
+
+- **Zero customer impact**: No service disruptions
+- **Performance targets**: 10x improvement in dashboard load times
+- **Cost savings**: $50K monthly reduction starting next month
+
+### Approval Required
+
+Proceed with migration: [ ] Yes [ ] No
+Executive Sponsor: ____________________
+```
+
+
+
+
+## Step 5: Mitigation Planning
+
+Develop comprehensive plans to minimize risk:
+
+### The Mitigation Strategy Framework
+
+
+
+**Risk Mitigation Categories**:
+
+**Preventive Measures** (Avoid problems):
+
+- **Comprehensive testing**: Validate all connections before go-live
+- **Staged rollout**: Migrate non-critical systems first
+- **Communication plan**: Ensure all stakeholders are prepared
+- **Documentation updates**: Keep all procedures current
+
+**Detective Measures** (Catch problems early):
+
+- **Monitoring alerts**: Set up notifications for system failures
+- **Health checks**: Automated validation of data flow (see the sketch after this list)
+- **User feedback channels**: Quick reporting of issues
+- **Performance monitoring**: Track system response times
+
+**Corrective Measures** (Fix problems quickly):
+
+- **Rollback procedures**: Detailed steps to revert changes
+- **Emergency contacts**: 24/7 support team availability
+- **Escalation paths**: Clear decision-making authority
+- **Communication templates**: Pre-written status updates
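+
+As one concrete detective measure, a data-freshness health check can be very small. The sketch below assumes a DB-API connection to the new warehouse plus an example table name and SLA; wire it into whatever scheduler and alerting you already use:
+
+```python
+from datetime import datetime, timedelta
+
+
+def check_freshness(conn, table="customer_analytics", sla=timedelta(hours=2)):
+    """Raise if the newest record in `table` is older than the agreed SLA."""
+    cur = conn.cursor()
+    cur.execute(f"SELECT MAX(updated_at) FROM {table}")
+    latest = cur.fetchone()[0]  # assumes the column is stored as naive UTC
+    lag = datetime.utcnow() - latest
+    if lag > sla:
+        raise RuntimeError(f"{table} is stale: last update {lag} ago (SLA {sla})")
+    return lag
+```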
+
+
+
+### Rollback Strategy Development
+
+**Critical Success Factor**: Every change needs a tested rollback plan.
+
+
+
+
+**Rollback Decision Matrix**:
+
+| Issue Type | Rollback Trigger | Recovery Time | Decision Authority |
+| ------------------ | ------------------------- | ------------- | --------------------- |
+| Data Corruption | Any data inconsistency | 2 hours | Data Engineering Lead |
+| Performance Issues | >50% slower than baseline | 4 hours | Technical Manager |
+| System Failures | Any critical system down | 1 hour | On-call Engineer |
+| User Complaints | >10 user reports | 6 hours | Product Manager |
+
+**Rollback Procedure Template**:
+
+```bash
+# Emergency Rollback: Customer Analytics Migration
+# Decision Authority: [Name] [Contact]
+# Estimated Time: 4 hours
+
+1. Stop new data processing
+2. Revert connection strings to original Hive system
+3. Restart downstream applications
+4. Validate data flow restoration
+5. Notify stakeholders of rollback completion
+```
+
+
+
+
+**Pre-Migration Testing Checklist**:
+
+**Data Validation**:
+
+- [ ] Row counts match between old and new systems (see the sketch after this checklist)
+- [ ] Sample data comparison (10% random sample)
+- [ ] Schema validation (all columns present and correct types)
+- [ ] Data freshness verification (latest timestamps match)
+
+**System Integration Testing**:
+
+- [ ] All downstream connections work with new system
+- [ ] Query performance meets or exceeds baseline
+- [ ] Authentication and authorization function correctly
+- [ ] Monitoring and alerting systems recognize new platform
+
+**User Acceptance Testing**:
+
+- [ ] Key dashboards load correctly with new data source
+- [ ] Reports generate successfully with expected data
+- [ ] API endpoints return correct responses
+- [ ] Mobile app functions normally with new backend
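+
+The first two data-validation checks are easy to automate. The sketch below assumes DB-API connections to the legacy Hive system and the new Snowflake system (created with whichever client libraries you already use) and an illustrative table name:
+
+```python
+def compare_row_counts(legacy_conn, new_conn, table="customer_analytics"):
+    """Fail fast if the legacy and new systems disagree on row counts."""
+    counts = {}
+    for name, conn in {"legacy": legacy_conn, "new": new_conn}.items():
+        cur = conn.cursor()
+        cur.execute(f"SELECT COUNT(*) FROM {table}")
+        counts[name] = cur.fetchone()[0]
+    if counts["legacy"] != counts["new"]:
+        raise AssertionError(f"Row count mismatch for {table}: {counts}")
+    return counts
+```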
+
+
+
+
+**Migration Success Criteria**:
+
+**Technical Metrics**:
+
+- **Zero data loss**: 100% data integrity maintained
+- **Performance improvement**: >5x faster query response times
+- **Uptime target**: 99.9% availability during migration
+- **Error rate**: <0.1% failed requests
+
+**Business Metrics**:
+
+- **User satisfaction**: <5 user complaints about system changes
+- **Productivity impact**: No measurable decrease in team efficiency
+- **Cost savings**: Achieve projected $50K monthly reduction
+- **Timeline adherence**: Complete migration within 48-hour window
+
+**Validation Timeline**:
+
+- **Immediate** (0-4 hours): System connectivity and basic functionality
+- **Short-term** (1-7 days): Performance validation and user feedback
+- **Medium-term** (1-4 weeks): Cost savings realization and stability
+- **Long-term** (1-3 months): Full business value achievement
+
+
+
+
+## Real-World Impact Analysis Exercise
+
+
+
+**Your Challenge**: Perform a complete impact analysis for a system change
+
+**Scenario**: TechFlow wants to add a new `customer_lifetime_value` column to the `customers` table. This requires updating the ETL job and potentially affects all downstream systems.
+
+**Your Task**: Complete the 5-step impact analysis:
+
+**Step 1 - Scope Definition**:
+
+```
+System/Dataset: customers table
+Change Type: Schema addition (new column)
+Timeline: 2-week implementation
+Technical Details: Add CLV calculation to nightly ETL
+Business Justification: Enable customer segmentation for marketing
+```
+
+**Step 2 - Downstream Mapping**:
+Use DataHub to identify all systems that consume the `customers` table and document them.
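+
+If you prefer to script this instead of clicking through the UI, a hedged sketch against DataHub's GraphQL endpoint is shown below. The endpoint URL, dataset URN, and the exact `searchAcrossLineage` fields are assumptions that can vary by DataHub version, so verify them in the GraphiQL explorer on your instance first:
+
+```python
+import requests
+
+DATAHUB_GRAPHQL = "http://localhost:9002/api/graphql"  # assumed quickstart URL
+DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:hive,customers,PROD)"  # assumed URN
+
+QUERY = """
+query downstreams($urn: String!) {
+  searchAcrossLineage(
+    input: { urn: $urn, direction: DOWNSTREAM, query: "*", start: 0, count: 50 }
+  ) {
+    searchResults {
+      entity { urn type }
+    }
+  }
+}
+"""
+
+response = requests.post(
+    DATAHUB_GRAPHQL,
+    json={"query": QUERY, "variables": {"urn": DATASET_URN}},
+    # headers={"Authorization": "Bearer <token>"},  # needed if auth is enabled
+)
+for result in response.json()["data"]["searchAcrossLineage"]["searchResults"]:
+    print(result["entity"]["type"], result["entity"]["urn"])
+```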
+
+**Step 3 - Risk Assessment**:
+Score each downstream system using the Business Impact × Technical Complexity formula.
+
+**Step 4 - Stakeholder Analysis**:
+Identify who needs to be involved and create appropriate communication plans.
+
+**Step 5 - Mitigation Planning**:
+Develop testing strategy and rollback procedures.
+
+**Success Criteria**: You've created a comprehensive impact analysis that could be presented to stakeholders for approval.
+
+
+
+## Success Checkpoint
+
+
+
+**You've mastered impact analysis when you can:**
+
+**Planning Skills**:
+
+- Complete the 5-step impact analysis framework for any system change
+- Quantify risk using business impact and technical complexity scores
+- Create stakeholder-appropriate communication plans
+- Develop comprehensive rollback strategies
+
+**Analysis Skills**:
+
+- Map downstream dependencies using DataHub lineage
+- Assess business impact across different user types and use cases
+- Identify critical path dependencies and single points of failure
+- Prioritize changes based on risk scores and business value
+
+**Communication Skills**:
+
+- Present technical impact to business stakeholders clearly
+- Create executive summaries that enable informed decision-making
+- Coordinate cross-team changes using lineage insights
+- Document mitigation plans that teams can execute confidently
+
+**Final Validation**:
+Choose a real system change in your organization and perform a complete impact analysis using the framework you've learned.
+
+
+
+## What You've Accomplished
+
+**Outstanding work!** You've transformed from basic lineage viewing to expert-level impact analysis:
+
+- **Systematic approach**: You can now analyze any system change methodically
+- **Risk quantification**: You understand how to score and prioritize risks
+- **Stakeholder management**: You can communicate impact to any audience
+- **Mitigation planning**: You're prepared for both success and failure scenarios
+
+:::tip Mark Your Progress
+Check off "Performing Impact Analysis" in the progress tracker above! You're ready to troubleshoot lineage issues.
+:::
+
+---
+
+**Next**: Complete your lineage mastery by learning [lineage troubleshooting techniques](troubleshooting.md) →
diff --git a/docs/learn-datahub/lineage/overview.md b/docs/learn-datahub/lineage/overview.md
new file mode 100644
index 00000000000000..2c40e816b731c0
--- /dev/null
+++ b/docs/learn-datahub/lineage/overview.md
@@ -0,0 +1,220 @@
+import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode';
+
+# Data Lineage & Impact Analysis (40 minutes)
+
+**From Beginner to Expert**: You've learned basic lineage in the quickstart, but production data environments are complex beasts. This series transforms you into a lineage expert who can navigate multi-system architectures, perform systematic impact analysis, and troubleshoot the most challenging data pipeline issues.
+
+## Your Advanced Data Challenge
+
+**Meet the Scenario**: You're the senior data engineer at a growing technology company, and leadership has announced a major system migration. Your job is to assess the impact of moving the customer analytics pipeline from the legacy system to a new cloud platform. One wrong move could break customer-facing dashboards used by the entire sales team.
+
+**The Stakes**:
+
+- **15+ downstream systems** depend on customer analytics
+- **$2M+ in revenue** tracked through affected dashboards
+- **48-hour migration window** - no room for errors
+- **Your reputation** as the data reliability expert
+
+**Your Mission**: Master advanced lineage analysis to plan, execute, and validate this critical migration without breaking anything.
+
+### Enterprise Migration Challenge
+
+Here's the complex data pipeline you'll be analyzing throughout this tutorial series:
+
+
+
+**Migration Complexity**: This seemingly simple 4-node pipeline actually has 15+ downstream dependencies, cross-platform transformations, and business-critical dashboards that cannot afford downtime.
+
+**Enterprise Lineage Analysis Framework:**
+
+
+
+**Architecture Components**:
+
+- **Source Systems**: Raw data, databases, APIs, files
+- **Transformation Layers**: ETL/ELT processes, data pipelines, business logic, quality checks
+- **Target Systems**: Analytics/reports, dashboards, ML models, data products
+
+**Lineage Analysis Capabilities:**
+
+- **Upstream Tracing**: Follow data back to its original sources
+- **Downstream Impact**: Identify all systems affected by changes
+- **Transformation Logic**: Understand how data is processed and modified
+- **Dependency Mapping**: Visualize critical data relationships
+- **Change Impact Assessment**: Predict effects of schema or pipeline changes
+
+## Learning Path Overview
+
+## What You'll Master
+
+### **Reading Lineage Graphs** (15 minutes)
+
+**From**: Basic lineage viewing
+**To**: Expert multi-hop navigation across complex architectures
+
+**You'll Learn**:
+
+- Navigate 5+ hop lineage paths efficiently
+- Interpret different node types (datasets, jobs, applications)
+- Understand transformation logic through connections
+- Identify critical paths in data infrastructure
+
+**Real Scenario**: Trace revenue calculation errors through a multi-system pipeline spanning Kafka → Spark → Snowflake → dbt → Looker.
+
+### **Performing Impact Analysis** (15 minutes)
+
+**From**: "What uses this data?"
+**To**: Systematic impact assessment with risk scoring
+
+**You'll Learn**:
+
+- Quantify downstream impact with business metrics
+- Create change impact reports for stakeholders
+- Develop rollback strategies based on lineage
+- Coordinate cross-team changes using lineage insights
+
+**Real Scenario**: Plan the customer analytics migration by mapping all 15 downstream dependencies and creating a risk-ranked rollout plan.
+
+### **Lineage Troubleshooting** (10 minutes)
+
+**From**: "Why is lineage missing?"
+**To**: Proactive lineage quality management
+
+**You'll Learn**:
+
+- Debug missing lineage connections
+- Improve lineage accuracy through configuration
+- Handle edge cases and manual processes
+- Establish lineage monitoring and validation
+
+**Real Scenario**: Investigate why the new ML pipeline isn't showing up in lineage and fix the ingestion configuration.
+
+## Prerequisites
+
+**Required Knowledge**:
+
+- Completed [DataHub Quickstart](../quickstart/overview.md) (basic lineage understanding)
+- Familiarity with data pipelines and ETL concepts
+- Basic understanding of SQL and data transformations
+
+**Technical Setup**:
+
+- DataHub instance with sample data (from quickstart)
+- Access to lineage views and dataset details
+- Ability to navigate the DataHub UI confidently
+
+**Time Commitment**: 40 minutes of focused learning with hands-on exercises
+
+## Learning Approach
+
+**Scenario-Driven**: Every concept is taught through the lens of the enterprise migration challenge
+
+**Hands-On Practice**: Interactive exercises using your actual DataHub instance with sample data
+
+**Real-World Applications**: Techniques you'll use immediately in production environments
+
+**Team-Ready Skills**: Learn to communicate lineage insights to both technical and business stakeholders
+
+## Success Outcomes
+
+By completing this series, you'll be able to:
+
+**Technical Mastery**:
+
+- Navigate any lineage graph, no matter how complex
+- Perform comprehensive impact analysis for system changes
+- Troubleshoot and improve lineage quality
+- Use lineage for root cause analysis and debugging
+
+**Business Impact**:
+
+- Reduce system change risks through proper impact assessment
+- Accelerate troubleshooting with systematic lineage analysis
+- Improve cross-team coordination using lineage insights
+- Build confidence in data reliability and change management
+
+**Operational Excellence**:
+
+- Establish expertise in complex data pipeline analysis
+- Lead system migrations and architecture changes with confidence
+- Uplift team capability with lineage best practices
+- Strengthen governance and reliability initiatives
+
+## Ready to Begin?
+
+**Your journey to lineage mastery starts now**. Each tutorial builds on the previous one, taking you from basic lineage reading to expert-level impact analysis and troubleshooting.
+
+**Start with**: [Reading Lineage Graphs](reading-lineage.md) - Learn to navigate complex data flows like a pro
+
+---
+
+**Pro Tip**: Keep your DataHub instance open in another tab. You'll be using it extensively throughout these tutorials for hands-on practice with the sample data.
diff --git a/docs/learn-datahub/lineage/reading-lineage.md b/docs/learn-datahub/lineage/reading-lineage.md
new file mode 100644
index 00000000000000..38edada4fcda32
--- /dev/null
+++ b/docs/learn-datahub/lineage/reading-lineage.md
@@ -0,0 +1,1184 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard';
+import DataHubLineageNode, { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode';
+import ProcessFlow, { DataHubWorkflows } from '@site/src/components/ProcessFlow';
+
+# Reading Lineage Graphs (15 minutes)
+
+
+
+**The Expert's Challenge**: You've mastered basic lineage in the quickstart, but now you're facing a complex production scenario. The customer dashboard is showing inconsistent numbers, and you need to trace through a multi-hop data pipeline spanning 5 different systems to find the root cause.
+
+**Your Mission**: Learn to read complex lineage graphs like a seasoned data engineer, understanding every connection, transformation, and dependency in your data ecosystem.
+
+## What You'll Master
+
+By the end of this step, you'll be able to:
+
+- **Navigate multi-hop lineage** across complex data architectures
+- **Interpret different node types** (datasets, jobs, applications)
+- **Understand transformation logic** through lineage connections
+- **Identify critical paths** in your data infrastructure
+
+## The Lineage Reading Framework
+
+Professional data engineers follow a systematic approach to lineage analysis:
+
+
+
+## Level 1: Understanding Node Types
+
+Every element in a lineage graph tells a specific story:
+
+
+
+
+**Tables, Views, and Files**:
+
+- **Raw Tables**: Source system data (often rectangular nodes)
+- **Analytical Views**: Processed, business-ready data
+- **Materialized Views**: Pre-computed results for performance
+- **File Assets**: CSV, Parquet, JSON files in data lakes
+
+**Visual Cues in DataHub**:
+
+
+
+
+
+
+
+- **Platform logos**: Each node shows the actual platform logo and type
+- **Health indicators**: Color-coded dots show data quality status
+- **Node highlighting**: Selected or problematic nodes are visually emphasized
+
+**Reading Strategy**: Start with the dataset causing issues, then trace backward to find the source.
+
+
+
+
+**Data Processing Elements**:
+
+- **ETL Jobs**: Extract, Transform, Load processes
+- **Python Scripts**: Custom data processing logic
+- **dbt Models**: Data transformation workflows
+- **Spark Jobs**: Large-scale data processing
+
+**Connection Patterns**:
+
+- **Solid lines**: Direct data dependencies
+- **Dashed lines**: Indirect or inferred relationships
+- **Arrows**: Direction of data flow (always follows the arrows!)
+
+**Analysis Technique**: Jobs between datasets show _how_ data is transformed, not just _that_ it flows.
+
+
+
+
+**Business Applications**:
+
+- **BI Dashboards**: Looker, Tableau, PowerBI reports
+- **ML Models**: Training and inference pipelines
+- **Applications**: Customer-facing features
+- **Automated Reports**: Scheduled business reports
+
+**Business Impact Indicators**:
+
+- **User-facing systems**: High business impact if broken
+- **Internal tools**: Important for operations but lower external impact
+- **Experimental systems**: Can often tolerate temporary issues
+
+
+
+
+## Level 2: Multi-Hop Navigation
+
+Real production lineage often spans multiple systems and transformations:
+
+### The 5-Hop Analysis Method
+
+**Scenario**: Customer dashboard shows wrong revenue numbers. Let's trace it:
+
+
+
+**Navigation Strategy**:
+
+1. **Start at the problem** (executive dashboard)
+2. **Follow arrows backward** (upstream direction)
+3. **Document each hop**: What system, what transformation?
+ - Dashboard ← Chart ← View ← Table ← Job ← Raw Table
+4. **Identify the break point**: Where does data look wrong?
+ - Critical ETL job failure affecting downstream data
+5. **Focus investigation**: Drill into the problematic hop
+ - Expand columns to see field-level transformations
+ - Check tags and glossary terms for context
+
+### Interactive Exercise: Multi-Hop Tracing
+
+
+
+**Your Challenge**: Find the root cause of data quality issues
+
+**Step 1**: Open any complex dataset in your DataHub instance
+**Step 2**: Click "View Lineage" to see the full graph
+**Step 3**: Apply the 5-hop analysis method:
+
+**5-Hop Lineage Analysis Example:**
+
+
+
+**Analysis Questions for Each Hop:**
+
+1. **Hop 1**: What was the last transformation applied?
+2. **Hop 2**: What business logic was implemented?
+3. **Hop 3**: What quality checks were performed?
+4. **Hop 4**: How was the data originally ingested?
+5. **Hop 5**: What is the ultimate source system?
+
+**Professional Lineage Reading Strategy:**
+
+1. **Start at the Target**: Begin with the dataset you're investigating
+2. **Work Backwards**: Follow each upstream connection systematically
+3. **Document Each Hop**: Note the transformation type and business purpose
+4. **Identify Critical Points**: Mark systems that could cause widespread impact
+5. **Validate Understanding**: Confirm your analysis with data owners when possible
+
+**Analysis Questions**:
+
+- Which hop has the most complex transformation?
+- Where would you focus if data was missing?
+- Which systems are most critical to this pipeline?
+
+
+
+## Level 3: Understanding Transformation Logic
+
+The connections between nodes reveal how data is processed:
+
+### Reading Connection Types
+
+
+
+
+#### One-to-One Relationships
+
+
+
+**What this means**: Direct processing with filtering, aggregation, or enrichment. The transformation is straightforward and predictable.
+
+#### Many-to-One Relationships
+
+
+
+**What this means**: Data joining and consolidation from multiple sources. Complex business logic combines different data domains.
+
+**Analysis Approach**: Look for SQL logic, dbt models, or ETL job definitions to understand the exact transformation rules and join conditions.
+
+
+
+
+#### Fan-Out Patterns
+
+
+
+**Business Meaning**: One source feeding multiple business use cases. Each downstream system serves different teams and purposes.
+
+#### Fan-In Patterns
+
+
+
+**Business Meaning**: Data consolidation from various systems into a single, comprehensive view.
+
+**Risk Assessment**:
+
+- **Fan-out** = High impact if source breaks (affects multiple downstream systems)
+- **Fan-in** = Complex debugging if output is wrong (multiple potential failure points)
+
+
+
+
+#### Batch vs Real-Time Processing Patterns
+
+
+
+**Batch Processing Indicators**:
+
+- **Daily/Hourly jobs**: Look for time-based naming (daily_sales, hourly_events)
+- **Scheduled dependencies**: Jobs that run in sequence
+- **Lag indicators**: How fresh is each step in the pipeline?
+
+**Real-Time Processing Indicators**:
+
+- **Streaming connections**: Kafka topics, event streams
+- **Near real-time**: Minimal processing delay (seconds to minutes)
+- **Continuous updates**: Always-fresh data
+
+**Performance Insight**: Understanding processing schedules helps set proper expectations for data freshness and availability.
+
+
+
+
+## Level 4: Critical Path Analysis
+
+Identify the most important connections in your data ecosystem:
+
+### The Critical Path Method
+
+**High-Impact Paths**:
+
+- **Customer-facing dashboards** ← Highest priority
+- **Revenue reporting** ← Business critical
+- **Compliance reporting** ← Regulatory requirement
+- **Operational monitoring** ← System health
+
+**Dependency Mapping**:
+
+1. **Single points of failure**: One dataset feeding many critical systems (see the counting sketch after this list)
+2. **Bottleneck jobs**: Processing that everything depends on
+3. **Cross-platform bridges**: Connections between different systems
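+
+Spotting single points of failure is mostly a counting exercise over lineage edges. The sketch below uses an illustrative, hard-coded edge list; in practice you would pull the edges for your own datasets from DataHub's lineage views or API:
+
+```python
+from collections import Counter
+
+# (upstream, downstream) pairs - illustrative only
+edges = [
+    ("raw_transactions", "revenue_pipeline"),
+    ("raw_transactions", "sales_dashboard"),
+    ("raw_transactions", "executive_dashboard"),
+    ("revenue_pipeline", "finance_report"),
+    ("web_events", "sessionization_job"),
+]
+
+fan_out = Counter(upstream for upstream, _ in edges)
+for asset, count in fan_out.most_common():
+    flag = "  <- potential single point of failure" if count >= 3 else ""
+    print(f"{asset}: {count} direct downstream consumers{flag}")
+```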
+
+### Interactive Exercise: Critical Path Identification
+
+
+
+**Scenario**: You're the Data Reliability Engineer at TechFlow Analytics. The CEO wants to know which data assets are most critical to business operations.
+
+
+
+**Dependency Count Analysis** (visible in the diagram above):
+
+- **customer_transactions** → feeds **4 systems** (see purple connection lines): revenue_pipeline, customer_metrics, executive_datasource, sales_dashboard
+- **customer_metrics** → feeds **4 systems** (see blue connection lines): executive_datasource, churn_model, customer_api, compliance_report
+- **revenue_pipeline** → feeds **3 systems** (see orange connection lines): customer_metrics, executive_datasource, churn_model
+- **user_events** → feeds **2 systems** (see green connection lines): customer_metrics, churn_model
+
+**Your Analysis Task**:
+
+Using the lineage diagram above, calculate the **Critical Score** for each asset using this formula:
+
+**Critical Score = (Business Impact × Downstream Dependencies) + Failure Risk**
+
+Where:
+
+- **Business Impact**: 1-10 (10 = affects revenue/customers directly)
+- **Downstream Dependencies**: Count of systems that depend on this asset
+- **Failure Risk**: 1-10 (10 = high probability of failure)
+
+**Analysis Framework**:
+
+
+
+
+**Step 1**: Count the downstream dependencies for each asset by examining the lineage diagram:
+
+| Asset | Business Impact (1-10) | Downstream Count | Failure Risk (1-10) | Critical Score |
+| --------------------- | ---------------------- | ---------------- | ------------------- | -------------- |
+| customer_transactions | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ |
+| revenue_pipeline | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ |
+| customer_metrics | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ |
+| user_events | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ |
+
+**Step 2**: Rank your top 3 most critical assets:
+
+1. **Most Critical**: \_\_\_\_\_\_\_\_\_\_\_\_
+2. **Second Critical**: \_\_\_\_\_\_\_\_\_\_\_\_
+3. **Third Critical**: \_\_\_\_\_\_\_\_\_\_\_\_
+
+**Step 3**: Justify your choices:
+
+- **Why is #1 most critical?** \_\_\_\_\_\_\_\_\_\_\_\_
+- **What monitoring would you implement?** \_\_\_\_\_\_\_\_\_\_\_\_
+
+
+
+
+**Correct Analysis** (Data Reliability Engineer perspective):
+
+| Asset | Business Impact | Downstream Count | Failure Risk | Critical Score | Reasoning |
+| ------------------------- | --------------- | ---------------- | ------------ | -------------- | -------------------------------- |
+| **customer_transactions** | **10** | **4** | **6** | **46** | Revenue source feeding 4 systems |
+| **revenue_pipeline** | **9** | **3** | **8** | **35** | Critical ETL with Warning status |
+| **customer_metrics** | **8** | **4** | **5** | **37** | KPIs feeding multiple dashboards |
+| **user_events** | **7** | **2** | **4** | **18** | Important but fewer dependencies |
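+
+If you want to check your own numbers, the short sketch below reproduces the scores in this table. The impact and failure-risk ratings are the tutorial's judgment calls, not values DataHub computes for you:
+
+```python
+assets = {
+    # name: (business_impact, downstream_count, failure_risk)
+    "customer_transactions": (10, 4, 6),
+    "revenue_pipeline": (9, 3, 8),
+    "customer_metrics": (8, 4, 5),
+    "user_events": (7, 2, 4),
+}
+
+
+def critical_score(impact: int, downstream: int, risk: int) -> int:
+    return impact * downstream + risk
+
+
+for name, ratings in sorted(assets.items(), key=lambda kv: -critical_score(*kv[1])):
+    print(f"{name}: {critical_score(*ratings)}")
+# customer_transactions: 46, customer_metrics: 37, revenue_pipeline: 35, user_events: 18
+```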
+
+**Top 3 Critical Assets** (in priority order):
+
+### 1. **customer_transactions** (Score: 46) - HIGHEST PRIORITY
+
+**Why Critical**:
+
+- Direct revenue impact (Business Impact: 10/10)
+- Feeds 4 downstream systems (revenue_pipeline, customer_metrics, executive_dashboard, sales_dashboard)
+- Single point of failure for all revenue reporting
+
+**Monitoring Strategy**:
+
+- Real-time transaction volume monitoring
+- Data freshness alerts (< 5 minute SLA)
+- Schema change detection
+- Database connection health checks
+- Automated failover to backup systems
+
+### 2. **customer_metrics** (Score: 37) - HIGH PRIORITY
+
+**Why Critical**:
+
+- Core business KPIs (Business Impact: 8/10)
+- Feeds executive dashboard, churn model, customer API, compliance reports
+- ML model dependency creates cascading failures
+
+**Monitoring Strategy**:
+
+- Data quality assertions on key metrics
+- Anomaly detection on metric values
+- Lineage validation checks
+- Model performance monitoring
+
+### 3. **revenue_pipeline** (Score: 35) - HIGH PRIORITY
+
+**Why Critical**:
+
+- Already showing Warning status (Failure Risk: 8/10)
+- Critical ETL processing revenue data
+- Scheduled dependency (failure affects daily reporting)
+
+**Monitoring Strategy**:
+
+- Job execution monitoring with alerts
+- Data pipeline SLA tracking
+- Resource utilization monitoring
+- Automated retry mechanisms
+- Escalation procedures for failures
+
+**Key Insight**: `customer_transactions` is the highest priority because it's both the revenue source AND feeds the most downstream systems. If it fails, everything breaks.
+
+
+
+
+**Mistake #1: Focusing Only on Business Impact**
+❌ **Wrong**: "Executive dashboard is most critical because the CEO uses it"
+✅ **Correct**: "customer_transactions is most critical because it feeds the executive dashboard AND 3 other systems"
+
+**Why**: Single points of failure with many dependencies are more critical than high-visibility endpoints.
+
+**Mistake #2: Ignoring Current Health Status**
+❌ **Wrong**: "All systems look healthy, so failure risk is low"
+✅ **Correct**: "revenue_pipeline shows Warning status, indicating higher failure risk"
+
+**Why**: Current system health is a leading indicator of future failures.
+
+**Mistake #3: Not Considering Cascading Failures**
+❌ **Wrong**: "Each system failure affects only its direct outputs"
+✅ **Correct**: "customer_transactions failure cascades through revenue_pipeline to all dashboards"
+
+**Why**: Data lineage shows how failures propagate through the entire ecosystem.
+
+**Mistake #4: Overlooking Processing Dependencies**
+❌ **Wrong**: "Dashboards are most critical because users see them"
+✅ **Correct**: "The ETL jobs feeding dashboards are more critical because dashboard failures often start there"
+
+**Why**: Processing bottlenecks are common failure points that affect multiple outputs.
+
+**Learning Checkpoint**: Did your analysis match the expert ranking? If not, review the lineage diagram to understand the dependency patterns you missed.
+
+
+
+
+**Success Validation**:
+✅ **Beginner**: Identified customer_transactions as high priority
+✅ **Intermediate**: Correctly calculated critical scores using the formula
+✅ **Advanced**: Recognized revenue_pipeline's Warning status as a risk factor
+✅ **Expert**: Proposed specific monitoring strategies for each critical asset
+
+## Pro Tips for Lineage Reading
+
+
+
+**Speed Techniques**:
+
+- **Start broad, then narrow**: Use overview mode first, then zoom into problem areas
+- **Follow the business logic**: Revenue flows are usually well-documented and critical
+- **Use platform knowledge**: Understand your organization's data architecture patterns
+
+**Accuracy Boosters**:
+
+- **Verify with owners**: Lineage might miss manual processes or external dependencies
+- **Check recency**: When was lineage last updated? Stale lineage can mislead
+- **Cross-reference documentation**: Combine lineage with technical docs and business context
+
+**Team Efficiency**:
+
+- **Document your findings**: Share critical path analysis with your team
+- **Create lineage maps**: Visual summaries for non-technical stakeholders
+- **Establish monitoring**: Set up alerts for critical path failures
+
+
+
+## Success Checkpoint
+
+
+
+**You've mastered lineage reading when you can:**
+
+**Speed Test**: Trace a 5-hop lineage path in under 3 minutes
+**Comprehension Test**: Identify all node types and transformation patterns
+**Analysis Test**: Determine the critical path for any business process
+**Communication Test**: Explain lineage findings to both technical and business stakeholders
+
+**Final Validation**:
+Choose a complex dataset in your DataHub instance and create a complete lineage analysis including:
+
+- All upstream dependencies (at least 3 hops)
+- Transformation logic at each step
+- Critical path assessment
+- Potential failure points
+
+
+
+## What You've Learned
+
+**Excellent progress!** You can now read lineage graphs like a professional data engineer:
+
+- **Multi-hop navigation**: Trace complex data flows across systems
+- **Node type recognition**: Understand datasets, jobs, and applications
+- **Transformation analysis**: Interpret how data changes through processing
+- **Critical path identification**: Focus on what matters most for business
+
+:::tip Mark Your Progress
+Check off "Reading Lineage Graphs" in the progress tracker above! You're ready to perform impact analysis.
+:::
+
+---
+
+**Next**: Now that you can read lineage expertly, let's learn how to [perform systematic impact analysis](impact-analysis.md) →
diff --git a/docs/learn-datahub/lineage/troubleshooting.md b/docs/learn-datahub/lineage/troubleshooting.md
new file mode 100644
index 00000000000000..b05488236a2ab2
--- /dev/null
+++ b/docs/learn-datahub/lineage/troubleshooting.md
@@ -0,0 +1,708 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import InteractiveDiagram from '@site/src/components/InteractiveDiagram';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Lineage Troubleshooting (10 minutes)
+
+
+
+**The Mystery**: Three weeks after the TechFlow migration, you notice something troubling. The new ML pipeline that processes customer segments isn't showing up in DataHub's lineage graph. The data team is asking questions, and you need to figure out why this critical connection is missing.
+
+**Your Mission**: Master the art of lineage troubleshooting - from diagnosing missing connections to proactively improving lineage quality across your entire data ecosystem.
+
+## What You'll Master
+
+By the end of this step, you'll be able to:
+
+- **Diagnose missing lineage** using systematic debugging techniques
+- **Fix ingestion issues** that cause incomplete lineage capture
+- **Handle edge cases** like manual processes and external dependencies
+- **Establish monitoring** to maintain lineage quality over time
+
+## The Lineage Troubleshooting Framework
+
+Professional lineage debugging follows a systematic approach:
+
+
+
+## Common Lineage Issues
+
+Understanding the most frequent problems helps you troubleshoot faster:
+
+
+
+**Missing Connections** (60% of issues):
+
+- New systems not yet configured for metadata ingestion
+- Changes in connection strings or authentication
+- Processing jobs that don't emit lineage metadata
+- Manual data movement processes
+
+**Incomplete Metadata** (25% of issues):
+
+- Partial schema information from source systems
+- Missing column-level lineage in transformations
+- Outdated metadata from infrequent ingestion runs
+- Custom applications without metadata instrumentation
+
+**Performance Problems** (10% of issues):
+
+- Lineage graphs too complex to render efficiently
+- Ingestion jobs timing out on large metadata volumes
+- UI responsiveness issues with deep lineage paths
+- Memory constraints during lineage computation
+
+**Stale Information** (5% of issues):
+
+- Metadata not refreshed after system changes
+- Cached lineage information showing old connections
+- Ingestion schedules not aligned with data pipeline changes
+- Manual metadata updates not propagated
+
+
+
+## Step 1: Identify the Gap
+
+Systematic gap identification prevents wasted troubleshooting effort:
+
+### The Gap Analysis Method
+
+
+
+
+**Gap Documentation Template**:
+
+```
+Missing Connection: ________________________
+Expected Source: ___________________________
+Expected Target: ___________________________
+Business Process: __________________________
+Technical Implementation: ___________________
+Last Known Working: ________________________
+```
+
+**TechFlow ML Pipeline Example**:
+
+```
+Missing Connection: Customer segments → ML training pipeline
+Expected Source: customer_segments (Snowflake table)
+Expected Target: ml_customer_model (MLflow model)
+Business Process: Nightly model retraining using latest customer data
+Technical Implementation: Python script with Snowflake connector
+Last Known Working: Never appeared in DataHub lineage
+```
+
+
+
+
+**Missing Lineage Impact**:
+
+**Business Impact**:
+
+- **Incomplete dependency mapping**: Can't assess full impact of customer data changes
+- **Risk management gaps**: ML model dependencies not visible to data governance
+- **Troubleshooting delays**: Root cause analysis missing critical connections
+- **Compliance concerns**: Audit trail incomplete for customer data usage
+
+**Technical Impact**:
+
+- **Change management risk**: Schema changes might break ML pipeline unknowingly
+- **Monitoring gaps**: No alerts if upstream customer data quality degrades
+- **Documentation inconsistency**: Technical architecture docs don't match reality
+- **Team coordination issues**: ML team not notified of customer data changes
+
+
+
+
+**Troubleshooting Priority Matrix**:
+
+| Business Impact | Technical Complexity | Priority | Action Timeline |
+| --------------- | -------------------- | ---------- | ------------------- |
+| High | Low | Critical | Fix within 24 hours |
+| High | High | Important | Fix within 1 week |
+| Medium | Low | Standard | Fix within 2 weeks |
+| Medium          | High                 | Planned    | Fix within 1 month  |
+| Low             | Any                  | Backlog    | Fix when convenient |
+
+**TechFlow ML Pipeline**: High business impact (compliance risk) + high technical complexity (custom Python pipeline without built-in lineage support) = Important (fix within 1 week)
+
+
+
+
+## Step 2: Check Data Sources
+
+Most lineage issues stem from ingestion configuration problems:
+
+### Ingestion Diagnostics Checklist
+
+
+
+**Source System Verification**:
+
+- [ ] **System connectivity**: Can DataHub reach the source system?
+- [ ] **Authentication**: Are credentials valid and permissions sufficient?
+- [ ] **Metadata availability**: Does the source system expose lineage information?
+- [ ] **Recent changes**: Have there been system updates or migrations?
+
+**Ingestion Configuration**:
+
+- [ ] **Recipe accuracy**: Is the ingestion recipe configured correctly?
+- [ ] **Scheduling**: Is the ingestion running on the expected schedule?
+- [ ] **Scope coverage**: Are all relevant databases/schemas included?
+- [ ] **Lineage extraction**: Is lineage extraction enabled in the recipe?
+
+**Execution Status**:
+
+- [ ] **Recent runs**: Has ingestion executed successfully recently?
+- [ ] **Error logs**: Are there any ingestion failures or warnings?
+- [ ] **Data volume**: Is the expected amount of metadata being ingested?
+- [ ] **Processing time**: Are ingestion jobs completing within expected timeframes?
+
+
+
+### Interactive Diagnostics Exercise
+
+
+
+**Your Challenge**: Diagnose the TechFlow ML pipeline lineage gap
+
+**Step 1 - Source System Check**:
+
+```
+ML Pipeline System: Python + MLflow + Snowflake
+Expected Metadata: Job definitions, data dependencies, model artifacts
+Current Status: ________________________________
+Issues Found: __________________________________
+```
+
+**Step 2 - Ingestion Configuration**:
+
+```
+Ingestion Recipe: ______________________________
+Last Successful Run: ___________________________
+Lineage Extraction Enabled: ____________________
+Scope Includes ML Systems: _____________________
+```
+
+**Step 3 - Gap Analysis**:
+
+```
+Root Cause Hypothesis: _________________________
+Confidence Level (1-10): _______________________
+Next Troubleshooting Step: _____________________
+```
+
+
+
+## Step 3: Validate Ingestion
+
+Deep-dive into ingestion mechanics to find the root cause:
+
+### Ingestion Debugging Techniques
+
+
+
+
+**Log Investigation Strategy**:
+
+**Error Pattern Recognition**:
+
+```bash
+# Common error patterns to search for
+grep -i "lineage" ingestion.log
+grep -i "connection" ingestion.log
+grep -i "timeout" ingestion.log
+grep -i "permission" ingestion.log
+grep -i "schema" ingestion.log
+```
+
+**Success Indicators**:
+
+```bash
+# Positive signals in logs
+grep "Successfully processed" ingestion.log
+grep "Lineage extracted" ingestion.log
+grep "Metadata ingested" ingestion.log
+```
+
+**TechFlow ML Pipeline Investigation**:
+
+```
+Expected Log Entry: "Successfully extracted lineage from ml_training_job"
+Actual Log Entry: "Warning: No lineage metadata found for Python scripts"
+Root Cause: Python ML scripts don't emit DataHub-compatible lineage
+```
+
+
+
+
+**Metadata Completeness Check**:
+
+**Dataset Metadata**:
+
+- **Schema information**: Are all columns and types captured?
+- **Ownership data**: Are dataset owners properly identified?
+- **Custom properties**: Are business-relevant attributes included?
+- **Platform details**: Is the source system correctly identified?
+
+**Lineage Metadata**:
+
+- **Job information**: Are transformation jobs captured as entities?
+- **Input/output mapping**: Are data dependencies clearly defined?
+- **Temporal information**: Are processing schedules and frequencies captured?
+- **Column-level lineage**: Are field-level transformations tracked?
+
+**Validation Queries**:
+
+```sql
+-- Check whether the ML pipeline dataset exists at all
+-- (metadata_aspect_v2 is DataHub's default MySQL aspect table)
+SELECT urn, aspect FROM metadata_aspect_v2
+WHERE urn LIKE '%ml_customer_model%';
+
+-- Verify lineage: upstreamLineage is stored on the downstream entity,
+-- so check whether the ML model records customer_segments as an upstream
+SELECT urn, metadata FROM metadata_aspect_v2
+WHERE aspect = 'upstreamLineage'
+  AND urn LIKE '%ml_customer_model%';
+```
+
+
+
+
+**Recipe Optimization**:
+
+**Lineage Extraction Settings**:
+
+```yaml
+# Enhanced lineage extraction configuration
+source:
+ type: "snowflake"
+ config:
+ # Enable comprehensive lineage extraction
+ include_table_lineage: true
+ include_view_lineage: true
+ include_column_lineage: true
+
+ # Capture custom SQL and stored procedures
+ include_usage_statistics: true
+ sql_parser_use_external_process: true
+
+ # Extended metadata capture
+ profiling:
+ enabled: true
+ include_field_null_count: true
+ include_field_min_value: true
+ include_field_max_value: true
+```
+
+**Custom Lineage Injection**:
+
+```python
+# For systems that don't auto-emit lineage
+from datahub.emitter.mce_builder import make_lineage_mce
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+
+# Create custom lineage for ML pipeline
+lineage_mce = make_lineage_mce(
+ upstream_urns=["urn:li:dataset:(urn:li:dataPlatform:snowflake,customer_segments,PROD)"],
+ downstream_urn="urn:li:dataset:(urn:li:dataPlatform:mlflow,ml_customer_model,PROD)"
+)
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+emitter.emit_mce(lineage_mce)
+```
+
+
+
+
+## Step 4: Handle Edge Cases
+
+Real-world data pipelines often include scenarios that standard ingestion can't capture:
+
+### Common Edge Cases and Solutions
+
+
+
+**Manual Data Processes**:
+
+- **Problem**: Excel files, manual data entry, ad-hoc scripts
+- **Solution**: Custom metadata emission or documentation-based lineage
+- **Implementation**: Create "virtual" datasets representing manual processes
+
+**External System Dependencies**:
+
+- **Problem**: Third-party APIs, vendor data feeds, external databases
+- **Solution**: Proxy datasets or external system connectors
+- **Implementation**: Document external dependencies as DataHub entities
+
+**Real-time Processing**:
+
+- **Problem**: Streaming pipelines, event-driven architectures, microservices
+- **Solution**: Event-based lineage capture or instrumentation
+- **Implementation**: Custom lineage emission from application code
+
+**Complex Transformations**:
+
+- **Problem**: Multi-step ETL, custom business logic, conditional processing
+- **Solution**: Job-level lineage with detailed transformation documentation
+- **Implementation**: Enhanced metadata with transformation descriptions
+
+
+
+### Edge Case Resolution Framework
+
+
+
+
+**Documentation-Based Lineage**:
+
+```python
+# Create lineage for manual Excel process
+from datahub.emitter.mce_builder import make_dataset_urn, make_lineage_mce
+
+# Define the manual process as a "dataset"
+manual_process_urn = make_dataset_urn(
+ platform="manual",
+ name="monthly_customer_review_excel",
+ env="PROD"
+)
+
+# Create lineage from automated data to manual process
+lineage_mce = make_lineage_mce(
+ upstream_urns=["urn:li:dataset:(urn:li:dataPlatform:snowflake,customer_segments,PROD)"],
+ downstream_urn=manual_process_urn
+)
+
+# Add custom properties to explain the manual process
+properties = {
+ "process_description": "Monthly customer review conducted by business team",
+ "frequency": "Monthly",
+ "owner": "customer_success_team",
+ "documentation_url": "https://wiki.company.com/customer-review-process"
+}
+
+# Emit lineage_mce with DatahubRestEmitter as in the earlier sketch; the
+# properties dict can be attached via a separate DatasetProperties aspect.
+```
+
+**Benefits**:
+
+- Complete lineage visibility including manual steps
+- Documentation of business processes in technical lineage
+- Compliance and audit trail for manual data handling
+
+
+
+
+**Proxy Dataset Approach**:
+
+```python
+# Create proxy for external API data source
+external_api_urn = make_dataset_urn(
+ platform="external_api",
+ name="customer_enrichment_service",
+ env="PROD"
+)
+
+# Document the external dependency
+external_properties = {
+ "api_endpoint": "https://api.customerdata.com/v2/enrichment",
+ "update_frequency": "Real-time",
+ "data_provider": "CustomerData Inc.",
+ "sla": "99.9% uptime",
+ "contact": "support@customerdata.com"
+}
+
+# Create lineage showing external data flow
+lineage_mce = make_lineage_mce(
+ upstream_urns=[external_api_urn],
+ downstream_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,enriched_customers,PROD)"
+)
+```
+
+**Benefits**:
+
+- Visibility into external data dependencies
+- Risk assessment for third-party data sources
+- Contact information for external data issues
+
+
+
+
+**Code-Level Lineage Emission**:
+
+```python
+# Instrument the ML training pipeline so it emits lineage as part of the job.
+# load_customer_segments/train_model/save_model are placeholders for your own
+# training code; the URNs below match the TechFlow example.
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import (
+    DatasetLineageTypeClass,
+    UpstreamClass,
+    UpstreamLineageClass,
+)
+
+def train_customer_model():
+    # Your ML training code here
+    input_data = load_customer_segments()
+    model = train_model(input_data)
+    save_model(model)
+
+    # Emit lineage metadata: the model dataset declares its upstream table
+    emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+
+    lineage = UpstreamLineageClass(
+        upstreams=[
+            UpstreamClass(
+                dataset="urn:li:dataset:(urn:li:dataPlatform:snowflake,customer_segments,PROD)",
+                type=DatasetLineageTypeClass.TRANSFORMED,
+            )
+        ]
+    )
+
+    model_urn = "urn:li:dataset:(urn:li:dataPlatform:mlflow,ml_customer_model,PROD)"
+    emitter.emit_mcp(
+        MetadataChangeProposalWrapper(entityUrn=model_urn, aspect=lineage)
+    )
+```
+
+**Benefits**:
+
+- Real-time lineage updates as code executes
+- Accurate capture of dynamic data dependencies
+- Integration with application deployment pipelines
+
+
+
+
+## Step 5: Implement Monitoring
+
+Proactive lineage quality management prevents future troubleshooting:
+
+### Lineage Quality Monitoring Framework
+
+
+
+**Quality Metrics**:
+
+- **Coverage**: Percentage of data assets with complete lineage
+- **Freshness**: How recently lineage information was updated
+- **Accuracy**: Validation of lineage against known data flows
+- **Completeness**: Presence of both upstream and downstream connections
+
+**Alert Conditions**:
+
+- **Missing lineage**: New datasets without any lineage connections
+- **Stale metadata**: Lineage not updated within expected timeframe
+- **Broken connections**: Previously connected systems showing gaps
+- **Ingestion failures**: Metadata extraction jobs failing repeatedly
+
+**Maintenance Tasks**:
+
+- **Regular validation**: Quarterly review of critical data lineage
+- **Configuration updates**: Adjust ingestion recipes as systems evolve
+- **Documentation sync**: Keep manual lineage documentation current
+- **Team training**: Ensure new team members understand lineage practices
+
+
+
+### Monitoring Implementation
+
+
+
+
+**Lineage Quality Dashboard**:
+
+```sql
+-- Lineage coverage metrics (dataset_lineage_summary and dataset_metadata are
+-- illustrative reporting views you would build from DataHub metadata)
+SELECT
+ platform,
+ COUNT(*) as total_datasets,
+ COUNT(CASE WHEN has_upstream_lineage THEN 1 END) as with_upstream,
+ COUNT(CASE WHEN has_downstream_lineage THEN 1 END) as with_downstream,
+ ROUND(100.0 * COUNT(CASE WHEN has_upstream_lineage THEN 1 END) / COUNT(*), 2) as upstream_coverage_pct
+FROM dataset_lineage_summary
+GROUP BY platform
+ORDER BY upstream_coverage_pct DESC;
+
+-- Stale lineage detection
+SELECT
+ dataset_urn,
+ last_lineage_update,
+ DATEDIFF(CURRENT_DATE, last_lineage_update) as days_since_update
+FROM dataset_metadata
+WHERE DATEDIFF(CURRENT_DATE, last_lineage_update) > 7
+ORDER BY days_since_update DESC;
+```
+
+**Automated Alerts**:
+
+```python
+# Lineage quality monitoring script. get_lineage_age, has_upstream_lineage,
+# and send_alert are placeholder helpers you would implement against DataHub's
+# API and your own alerting tool.
+def check_lineage_quality():
+    critical_datasets = [
+        "customer_segments",
+        "fct_users_created",
+        "ml_customer_model",
+    ]
+
+    for dataset in critical_datasets:
+        lineage_age = get_lineage_age(dataset)
+        if lineage_age > 7:  # days
+            send_alert(f"Stale lineage for {dataset}: {lineage_age} days old")
+
+        if not has_upstream_lineage(dataset):
+            send_alert(f"Missing upstream lineage for {dataset}")
+
+
+
+
+**Quarterly Lineage Review Process**:
+
+**Review Checklist**:
+
+- [ ] **Critical path validation**: Verify lineage for top 10 most important datasets
+- [ ] **New system integration**: Ensure recently added systems appear in lineage
+- [ ] **Accuracy spot checks**: Validate 5% random sample against known data flows
+- [ ] **Documentation updates**: Sync lineage with architecture documentation
+
+**Validation Template**:
+
+```
+Dataset: ___________________________________
+Expected Upstream Count: ____________________
+Actual Upstream Count: ______________________
+Expected Downstream Count: __________________
+Actual Downstream Count: ____________________
+Discrepancies Found: ________________________
+Action Required: ____________________________
+Validation Date: ____________________________
+Reviewer: ___________________________________
+```
+
+
+
+
+**Lineage Governance Framework**:
+
+**Roles and Responsibilities**:
+
+- **Data Engineers**: Ensure new pipelines emit proper lineage metadata
+- **Analytics Engineers**: Validate lineage for dbt models and transformations
+- **Data Platform Team**: Maintain ingestion infrastructure and monitoring
+- **Data Governance**: Review lineage completeness for compliance requirements
+
+**Process Integration**:
+
+- **Code Review**: Include lineage validation in data pipeline code reviews
+- **Deployment Gates**: Require lineage metadata before production deployment
+- **Incident Response**: Use lineage for root cause analysis and impact assessment
+- **Architecture Reviews**: Validate lineage against system design documents
+
+**Training and Documentation**:
+
+- **Onboarding**: Include lineage best practices in new team member training
+- **Playbooks**: Document troubleshooting procedures for common lineage issues
+- **Best Practices**: Maintain guidelines for lineage metadata emission
+- **Tool Training**: Regular sessions on DataHub lineage features and capabilities
+
+
+
+
+## Success Checkpoint
+
+
+
+**You've mastered lineage troubleshooting when you can:**
+
+**Diagnostic Skills**:
+
+- Systematically identify and categorize lineage gaps
+- Debug ingestion issues using logs and configuration analysis
+- Validate metadata completeness and accuracy
+- Prioritize troubleshooting efforts based on business impact
+
+**Technical Skills**:
+
+- Configure ingestion recipes for optimal lineage extraction
+- Implement custom lineage emission for edge cases
+- Handle manual processes and external system dependencies
+- Instrument applications for real-time lineage updates
+
+**Operational Skills**:
+
+- Establish monitoring and alerting for lineage quality
+- Create validation processes for ongoing lineage accuracy
+- Integrate lineage governance into team workflows
+- Train teams on lineage best practices and troubleshooting
+
+**Final Validation**:
+Identify a lineage gap in your organization and resolve it using the systematic troubleshooting framework you've learned.
+
+
+
+## Mission Accomplished: Lineage Mastery Complete!
+
+**Congratulations!** You've completed the entire Data Lineage & Impact Analysis series and achieved expert-level proficiency:
+
+**Reading Lineage Graphs**: Navigate any complexity with confidence
+**Performing Impact Analysis**: Systematically assess and communicate change risks
+**Lineage Troubleshooting**: Diagnose and resolve any lineage quality issue
+
+**Your New Capabilities**:
+
+- **Lead system migrations** with comprehensive impact analysis
+- **Troubleshoot data issues** using lineage-driven root cause analysis
+- **Improve data governance** through complete lineage visibility
+- **Mentor teams** on lineage best practices and troubleshooting techniques
+
+**Real-World Impact**: You're now equipped to handle the most complex data lineage challenges in production environments, from multi-system migrations to compliance audits to incident response.
+
+:::tip Mark Your Progress
+Check off "Lineage Troubleshooting" in the progress tracker above! You've completed the entire lineage mastery series!
+:::
+
+---
+
+**Ready for More?** Continue your DataHub expertise journey with:
+
+- **Data Governance Fundamentals** - Master ownership, classification, and business glossary
+- **Data Quality & Monitoring** - Learn assertions, health dashboards, and incident management
+- **Data Ingestion Mastery** - Deep dive into recipes, stateful ingestion, and profiling
diff --git a/docs/learn-datahub/overview.md b/docs/learn-datahub/overview.md
new file mode 100644
index 00000000000000..1854210e5d6859
--- /dev/null
+++ b/docs/learn-datahub/overview.md
@@ -0,0 +1,227 @@
+---
+title: "Learn DataHub"
+---
+
+# Learn DataHub
+
+Master DataHub through a comprehensive professional development journey. Follow a realistic business scenario as you progress from basic data discovery to advanced governance and compliance management.
+
+## Professional Data Management Journey
+
+**Your Role**: You're a data professional tasked with implementing enterprise-grade metadata management. This tutorial series follows realistic scenarios that data teams encounter when establishing DataHub in production environments.
+
+**The Business Context**: A growing technology company with data distributed across multiple platforms - Kafka for streaming, Hive for analytics, HDFS for storage. The organization needs to transition from ad-hoc data usage to systematic data governance and discovery.
+
+**Your Objective**: Implement DataHub to solve real data management challenges: discovery bottlenecks, compliance requirements, quality issues, and system integration complexity.
+
+---
+
+## Chapter 1: Foundation (30 minutes)
+
+### DataHub Quickstart
+
+**The Challenge**: You need to quickly assess the organization's data landscape and locate specific user engagement metrics for an executive presentation. The data exists across multiple systems, but there's no centralized metadata management.
+
+**Your Implementation**:
+
+- [Overview](quickstart/overview) - Understanding the business requirements
+- [Setup DataHub](quickstart/setup) (5 min) - Deploy the metadata platform locally
+- [First Ingestion](quickstart/first-ingestion) (10 min) - Connect multi-platform data sources
+- [Discovery Basics](quickstart/discovery-basics) (10 min) - Implement systematic data discovery
+- [Your First Lineage](quickstart/first-lineage) (5 min) - Analyze data dependencies and quality
+
+**Outcome**: Establish DataHub as the central metadata repository and demonstrate its value for data discovery and governance.
+
+---
+
+## Chapter 2: Scaling Discovery (45 minutes)
+
+### Data Discovery & Search
+
+**The Challenge**: Three months later, the organization has grown to 50+ datasets across 8 platforms. New team members spend days trying to find the right data, and analysts frequently use incorrect or outdated datasets for reports.
+
+**Business Impact**:
+
+- **Time Waste**: Data scientists spend 60% of their time searching for data instead of analyzing
+- **Inconsistent Metrics**: Different teams calculate customer metrics differently, leading to conflicting reports
+- **Compliance Risk**: Teams unknowingly use datasets containing PII without proper approvals
+
+**DataHub Solution**: Implement systematic data discovery that enables self-service analytics while maintaining governance controls.
+
+**Your Journey**:
+
+- **Advanced Search Techniques** (15 min) - Enable teams to find data using business terms, not technical names
+- **Understanding Dataset Profiles** (20 min) - Provide rich context so users choose the right data confidently
+- **Collaborative Discovery** (10 min) - Build institutional knowledge through documentation and Q&A
+
+**Organizational Outcome**: Reduce data discovery time from days to minutes, while ensuring teams use trusted, well-documented datasets.
+
+---
+
+## Chapter 3: Managing Dependencies (40 minutes)
+
+### Data Lineage & Impact Analysis
+
+**The Challenge**: The organization's customer analytics pipeline needs a major upgrade to support real-time personalization. However, this pipeline feeds 15+ downstream systems including customer-facing dashboards, ML models, and regulatory reports.
+
+**Business Impact**:
+
+- **Change Risk**: Modifying core data without understanding dependencies could break critical business processes
+- **Coordination Overhead**: Manual impact assessment requires weeks of meetings across multiple teams
+- **Incident Response**: When issues occur, root cause analysis takes hours without clear data flow visibility
+
+**DataHub Solution**: Use comprehensive lineage tracking to plan changes confidently and respond to incidents quickly.
+
+**Your Journey**:
+
+- **Reading Lineage Graphs** (15 min) - Navigate complex data flows spanning multiple systems and teams
+- **Performing Impact Analysis** (15 min) - Systematically assess risks and coordinate changes across stakeholders
+- **Lineage Troubleshooting** (10 min) - Ensure lineage accuracy for reliable decision-making
+
+**Organizational Outcome**: Execute complex data migrations with zero business disruption and reduce incident response time by 75%.
+
+---
+
+## Chapter 4: Establishing Governance (50 minutes)
+
+### Data Governance Fundamentals
+
+**The Challenge**: The organization is preparing for SOC 2 compliance and a potential acquisition. Auditors need clear data ownership, classification, and business definitions. Currently, critical datasets have unclear ownership and inconsistent business terminology.
+
+**Business Impact**:
+
+- **Compliance Gaps**: Inability to demonstrate data stewardship and access controls
+- **Business Confusion**: Same terms mean different things to different teams (e.g., "active customer")
+- **Accountability Issues**: When data quality problems occur, no clear owner to resolve them
+
+**DataHub Solution**: Implement systematic data governance that scales with organizational growth.
+
+**Your Journey**:
+
+- **Ownership & Stewardship** (15 min) - Establish clear accountability for every critical dataset
+- **Classification & Tagging** (20 min) - Organize data by sensitivity, domain, and business purpose
+- **Business Glossary Management** (15 min) - Create shared vocabulary that aligns technical and business teams
+
+**Organizational Outcome**: Pass compliance audits confidently and accelerate cross-team collaboration through shared understanding.
+
+---
+
+## Chapter 5: Ensuring Reliability (45 minutes)
+
+### Data Quality & Monitoring
+
+**The Challenge**: Organizational growth has led to data quality issues affecting customer experience. Revenue dashboards show inconsistent numbers, ML models receive corrupted training data, and customer support can't trust the data they see.
+
+**Business Impact**:
+
+- **Revenue Impact**: Incorrect pricing data led to $50K in lost revenue last quarter
+- **Customer Experience**: Personalization algorithms fail due to poor data quality
+- **Executive Confidence**: Leadership questions all data-driven decisions due to past inaccuracies
+
+**DataHub Solution**: Implement proactive data quality management that prevents issues before they impact business operations.
+
+**Your Journey**:
+
+- **Setting Up Data Assertions** (20 min) - Automated quality checks that catch issues immediately
+- **Data Health Dashboard** (15 min) - Centralized monitoring that provides early warning of problems
+- **Incident Management** (10 min) - Systematic response processes that minimize business impact
+
+**Organizational Outcome**: Achieve 99.9% data reliability and restore executive confidence in data-driven decisions.
+
+---
+
+## Chapter 6: Platform Mastery (60 minutes)
+
+### Data Ingestion Mastery
+
+**The Challenge**: The organization is acquiring two companies with different data architectures. You need to integrate 20+ new data sources while maintaining performance and ensuring consistent metadata quality across all systems.
+
+**Business Impact**:
+
+- **Integration Complexity**: Manual metadata management doesn't scale to hundreds of datasets
+- **Performance Degradation**: Naive ingestion approaches overwhelm DataHub and source systems
+- **Metadata Quality**: Inconsistent metadata leads to poor user experience and governance gaps
+
+**DataHub Solution**: Implement production-grade ingestion patterns that scale efficiently and maintain high metadata quality.
+
+**Your Journey**:
+
+- **Understanding Recipes** (20 min) - Configure ingestion for complex, heterogeneous environments
+- **Stateful Ingestion Patterns** (20 min) - Optimize for performance and minimize resource usage
+- **Data Profiling & Enrichment** (20 min) - Automatically generate rich metadata that enhances discoverability
+
+**Organizational Outcome**: Successfully integrate acquired companies' data with zero performance impact and improved metadata quality.
+
+---
+
+## Chapter 7: Compliance & Privacy (35 minutes)
+
+### Privacy & Compliance
+
+**The Challenge**: The organization operates in healthcare and finance sectors, requiring GDPR, HIPAA, and SOX compliance. Regulators need proof of data handling practices, and privacy teams need to track PII across all systems.
+
+**Business Impact**:
+
+- **Regulatory Risk**: Fines up to 4% of revenue for GDPR violations
+- **Audit Overhead**: Manual compliance reporting takes weeks of effort quarterly
+- **Privacy Breaches**: Inability to locate and protect sensitive data across systems
+
+**DataHub Solution**: Implement automated compliance workflows that provide continuous regulatory readiness.
+
+**Your Journey**:
+
+- **PII Detection & Classification** (15 min) - Automatically identify and classify sensitive data across all systems
+- **Compliance Forms & Workflows** (20 min) - Streamline regulatory reporting and audit preparation
+
+**Organizational Outcome**: Achieve continuous compliance readiness and reduce audit preparation time by 90%.
+
+---
+
+## Tutorial Structure
+
+Each tutorial follows a consistent, practical format:
+
+- **Learning Objectives**: Clear outcomes you'll achieve
+- **Time Estimates**: Realistic completion times
+- **Hands-on Exercises**: Real scenarios with sample data
+- **Success Checkpoints**: Verify your progress
+- **What's Next**: Logical progression to related topics
+
+## Learning Paths
+
+### Complete Professional Journey (Recommended)
+
+Follow the full narrative from startup to enterprise-scale data management:
+**Chapters 1-7** → Experience the complete organizational transformation
+
+### Role-Focused Paths
+
+**Data Analysts & Scientists**
+**Chapters 1-3** → Master discovery, search, and lineage analysis for confident data usage
+
+**Data Engineers & Platform Teams**
+**Chapters 1, 3, 5-6** → Focus on technical implementation, quality, and ingestion mastery
+
+**Data Governance & Compliance Teams**
+**Chapters 1, 4, 7** → Establish governance frameworks and compliance processes
+
+**Leadership & Strategy Teams**
+**Chapter overviews only** → Understand business value and organizational impact
+
+## Getting Help
+
+**During tutorials:**
+
+- Each page includes troubleshooting sections
+- Common issues and solutions are documented
+- Links to relevant documentation sections
+
+**Community support:**
+
+- [DataHub Slack Community](https://datahub.com/slack)
+- [Full Documentation](../)
+- [GitHub Issues](https://github.com/datahub-project/datahub/issues)
+
+---
+
+**Ready to start learning?** Begin with the [DataHub Quickstart](quickstart/overview) →
diff --git a/docs/learn-datahub/privacy/compliance-workflows.md b/docs/learn-datahub/privacy/compliance-workflows.md
new file mode 100644
index 00000000000000..89d4d63c9e5f07
--- /dev/null
+++ b/docs/learn-datahub/privacy/compliance-workflows.md
@@ -0,0 +1,55 @@
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import NextStepButton from '@site/src/components/NextStepButton';
+import ProcessFlow from '@site/src/components/ProcessFlow';
+
+# Compliance Workflows (11 minutes)
+
+
+
+## Objective
+
+Operationalize compliance for DSARs (access, deletion, portability) and regulatory reporting.
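+
+Before stepping through the workflows below, here is a minimal sketch of the DSAR "locate" step using DataHub's GraphQL search API. The endpoint, access token, and search term are assumptions to adapt to your deployment.
+
+```python
+# Hedged sketch: find datasets that may hold a data subject's records by
+# searching DataHub. Endpoint and token handling are deployment-specific.
+import requests
+
+DATAHUB_GRAPHQL = "http://localhost:8080/api/graphql"  # assumed GMS endpoint
+TOKEN = "<personal-access-token>"  # assumed; generate one under Settings > Access Tokens
+
+query = """
+query locateSubject($q: String!) {
+  searchAcrossEntities(input: { types: [DATASET], query: $q, start: 0, count: 25 }) {
+    searchResults { entity { urn type } }
+  }
+}
+"""
+
+resp = requests.post(
+    DATAHUB_GRAPHQL,
+    headers={"Authorization": f"Bearer {TOKEN}"},
+    json={"query": query, "variables": {"q": "jane.doe@example.com"}},
+)
+resp.raise_for_status()
+
+for result in resp.json()["data"]["searchAcrossEntities"]["searchResults"]:
+    print(result["entity"]["urn"])  # candidate datasets to review for the request
+```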
+
+## Core Workflows
+
+
+
+
+
+
+- Locate a subject’s data across systems
+- Standardize export, deletion, and portability steps
+
+
+
+
+- Generate audit artifacts: lineage, access logs, retention status
+- Track review and approval history
+
+
+
+
+
+Back to Privacy & Compliance Overview
+
diff --git a/docs/learn-datahub/privacy/overview.md b/docs/learn-datahub/privacy/overview.md
new file mode 100644
index 00000000000000..4c2402023b7c9c
--- /dev/null
+++ b/docs/learn-datahub/privacy/overview.md
@@ -0,0 +1,312 @@
+# Privacy & Compliance
+
+import ProcessFlow from '@site/src/components/ProcessFlow';
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import { HandsOnExercise } from '@site/src/components/TutorialExercise';
+import NextStepButton from '@site/src/components/NextStepButton';
+
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+
+
+## Professional Privacy Protection at Scale
+
+**Time Required**: 35 minutes | **Skill Level**: Advanced
+
+### Your Challenge: Comprehensive Privacy Management
+
+You're a **Privacy Engineering Lead** at a global technology company. Your organization processes personal data from millions of users across multiple jurisdictions, subject to GDPR, CCPA, and other privacy regulations. Current privacy management is fragmented and reactive:
+
+- **Manual PII discovery** that misses sensitive data in new systems
+- **Inconsistent privacy controls** across different data platforms
+- **Slow response** to data subject requests and regulatory inquiries
+- **Limited visibility** into personal data processing activities
+
+**The Business Impact**: A recent privacy audit revealed untracked personal data in 15 different systems, resulting in a $2.8M regulatory fine and significant remediation costs. Leadership demands a proactive, comprehensive privacy management approach.
+
+### What You'll Learn
+
+This tutorial series teaches you to implement enterprise-grade privacy protection using DataHub's privacy and compliance features:
+
+#### Chapter 1: PII Detection (12 minutes)
+
+**Business Challenge**: Hidden personal data creating compliance risks across the organization
+**Your Journey**:
+
+- Implement automated PII discovery across all data systems
+- Configure intelligent classification for different types of personal data
+- Set up continuous monitoring for new PII in data pipelines
+ **Organizational Outcome**: Complete visibility into personal data across your data landscape
+
+#### Chapter 2: Privacy Controls (12 minutes)
+
+**Business Challenge**: Inconsistent privacy protection and access controls for personal data
+**Your Journey**:
+
+- Implement data minimization and purpose limitation controls
+- Configure automated privacy impact assessments
+- Set up consent management and data retention policies
+ **Organizational Outcome**: Systematic privacy protection aligned with regulatory requirements
+
+#### Chapter 3: Compliance Workflows (11 minutes)
+
+**Business Challenge**: Manual compliance processes that can't scale with regulatory demands
+**Your Journey**:
+
+- Automate data subject request fulfillment (access, deletion, portability)
+- Implement regulatory reporting and audit trail generation
+- Set up cross-border data transfer compliance monitoring
+ **Organizational Outcome**: Efficient compliance operations that reduce regulatory risk and operational overhead
+
+### Interactive Learning Experience
+
+Each chapter includes:
+
+- **Real Privacy Scenarios**: Based on actual regulatory compliance challenges
+- **Hands-on Implementation**: Using DataHub's privacy management features
+- **Regulatory Alignment**: Mapping to GDPR, CCPA, and other privacy laws
+- **Audit Preparation**: Building evidence for regulatory compliance
+
+### Understanding Privacy Compliance Impact
+
+## Privacy Program Lifecycle
+
+
+
+## Representative Assets
+
+
+
+
+
+
+## Hands-On: Validate Your Privacy Posture
+
+
+
+
+Start: PII Detection
+
+
+Privacy violations carry severe consequences:
+
+- **GDPR Fines**: Up to 4% of global annual revenue or €20M (whichever is higher)
+- **CCPA Penalties**: Up to $7,500 per violation for intentional violations
+- **Reputational Damage**: Loss of customer trust and competitive advantage
+- **Operational Disruption**: Emergency remediation and system changes
+
+**Privacy-by-Design Benefits**:
+
+- **Regulatory Compliance**: Proactive adherence to privacy laws
+- **Risk Reduction**: Early identification and mitigation of privacy risks
+- **Operational Efficiency**: Automated compliance processes
+- **Customer Trust**: Transparent and responsible data handling
+
+### DataHub Privacy Features Overview
+
+DataHub provides comprehensive privacy management through:
+
+
+
+
+
+
+
+**Key Privacy Capabilities**:
+
+- **Automated PII Discovery**: ML-powered detection of personal data across all systems
+- **Privacy Controls**: Automated enforcement of data minimization and purpose limitation
+- **Compliance Automation**: Streamlined data subject request fulfillment
+- **Privacy Analytics**: Comprehensive reporting and audit trail generation
+- **Cross-Border Compliance**: Monitoring and controls for international data transfers
+
+### Privacy Regulatory Landscape
+
+**Major Privacy Regulations**:
+
+- **GDPR (EU)**: Comprehensive data protection with strict consent and rights requirements
+- **CCPA (California)**: Consumer privacy rights including access, deletion, and opt-out
+- **LGPD (Brazil)**: Brazilian data protection law similar to GDPR
+- **PIPEDA (Canada)**: Privacy protection for personal information in commercial activities
+- **Sector-Specific**: HIPAA (healthcare), FERPA (education), GLBA (financial services)
+
+**Common Privacy Requirements**:
+
+- **Lawful Basis**: Legal justification for processing personal data
+- **Data Minimization**: Collecting only necessary personal data
+- **Purpose Limitation**: Using data only for stated purposes
+- **Storage Limitation**: Retaining data only as long as necessary
+- **Individual Rights**: Access, rectification, erasure, portability, and objection
+
+### Prerequisites
+
+- Completed [Data Governance Fundamentals](../governance/overview.md)
+- Understanding of privacy regulations (GDPR, CCPA, etc.)
+- Access to DataHub instance with sample personal data
+- Familiarity with data classification and governance concepts
+- Basic knowledge of privacy engineering principles
+
+### Privacy Maturity Assessment
+
+
+
+### Success Metrics
+
+**Compliance Metrics**:
+
+- **PII Discovery Coverage**: Percentage of systems with automated PII detection
+- **Data Subject Request Response Time**: Speed of fulfilling privacy requests
+- **Privacy Violation Rate**: Number of privacy incidents and regulatory findings
+- **Audit Readiness**: Time required to respond to regulatory inquiries
+
+**Operational Metrics**:
+
+- **Privacy Assessment Automation**: Percentage of automated privacy impact assessments
+- **Consent Management Coverage**: Tracking of consent across data processing activities
+- **Cross-Border Transfer Compliance**: Adherence to international data transfer requirements
+- **Privacy Training Completion**: Staff awareness and competency in privacy practices
+
+### Ready to Begin?
+
+Start your privacy compliance journey by implementing automated PII detection that provides complete visibility into personal data across your organization.
+
+
diff --git a/docs/learn-datahub/privacy/pii-detection.md b/docs/learn-datahub/privacy/pii-detection.md
new file mode 100644
index 00000000000000..71029727b5980a
--- /dev/null
+++ b/docs/learn-datahub/privacy/pii-detection.md
@@ -0,0 +1,89 @@
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import NextStepButton from '@site/src/components/NextStepButton';
+import { HandsOnExercise } from '@site/src/components/TutorialExercise';
+
+# PII Detection (12 minutes)
+
+
+
+## Objective
+
+Identify personal data across platforms using automated PII detection and classification, establishing comprehensive visibility into privacy-relevant assets.
+
+## What You'll Learn
+
+- Configure and run PII detection
+- Interpret classification results in UI
+- Prioritize remediation using tags and terms
+
+## PII Discovery Workflow
+
+**Scan** → **Classify** → **Review** → **Tag** → **Monitor**
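+
+The **Tag** step can also be scripted. Below is a minimal sketch using the DataHub Python emitter (`pip install acryl-datahub`); the platform, dataset name, and `pii` tag are assumptions, and note that emitting `GlobalTags` this way replaces any existing dataset-level tags.
+
+```python
+# Hedged sketch: attach a "pii" tag to a dataset via the REST emitter.
+# Platform/name/env are placeholders; GlobalTags overwrites existing dataset tags.
+from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+
+dataset_urn = make_dataset_urn(platform="hive", name="customer_profiles", env="PROD")
+pii_tag = TagAssociationClass(tag=make_tag_urn("pii"))
+
+emitter.emit(
+    MetadataChangeProposalWrapper(
+        entityUrn=dataset_urn,
+        aspect=GlobalTagsClass(tags=[pii_tag]),
+    )
+)
+```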
+
+
+
+
+1. Navigate to a dataset with personal data (e.g., `customer_profiles`).
+2. Open the Schema tab to review detected fields.
+3. Confirm sensitive columns (e.g., `email`, `phone`).
+
+
+
+
+1. Apply appropriate sensitivity tags and glossary terms.
+2. Ensure PII classifications are consistent across similar datasets.
+
+
+
+
+## Example Asset
+
+
+
+
+
+## Hands-On Exercise
+
+
+
+
+Next: Privacy Controls
+
diff --git a/docs/learn-datahub/privacy/privacy-controls.md b/docs/learn-datahub/privacy/privacy-controls.md
new file mode 100644
index 00000000000000..4faf0984be544d
--- /dev/null
+++ b/docs/learn-datahub/privacy/privacy-controls.md
@@ -0,0 +1,83 @@
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import NextStepButton from '@site/src/components/NextStepButton';
+import ProcessFlow from '@site/src/components/ProcessFlow';
+import { HandsOnExercise } from '@site/src/components/TutorialExercise';
+
+# Privacy Controls (12 minutes)
+
+
+
+## Objective
+
+Implement practical privacy protections including minimization, access controls, and retention.
+
+
+
+## Control Implementation
+
+
+
+
+- Remove unnecessary personal fields
+- Apply field-level masking for sensitive attributes
+
+
+
+
+- Restrict PII access to approved roles
+- Require purpose-based approvals for sensitive data
+
+
+
+
+- Define retention policies per data category
+- Configure automated deletion after retention window
+
+
+
+
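+To make the retention control above concrete, here is a minimal, self-contained sketch of evaluating retention windows per data category. The categories, windows, and the delete step are assumptions for illustration only.
+
+```python
+# Hedged sketch: flag records whose age exceeds an assumed per-category
+# retention window; actual deletion or archival is left to your platform.
+from datetime import datetime, timedelta, timezone
+
+RETENTION_WINDOWS = {
+    "marketing_profile": timedelta(days=365),
+    "support_ticket": timedelta(days=730),
+    "payment_record": timedelta(days=2555),  # roughly 7 years
+}
+
+def records_past_retention(records):
+    """Yield records older than the retention window for their category."""
+    now = datetime.now(timezone.utc)
+    for record in records:
+        window = RETENTION_WINDOWS.get(record["category"])
+        if window and now - record["last_updated"] > window:
+            yield record
+
+sample = [
+    {"id": 1, "category": "marketing_profile",
+     "last_updated": datetime(2022, 1, 1, tzinfo=timezone.utc)},
+]
+
+for record in records_past_retention(sample):
+    print(f"Delete or archive record {record['id']} ({record['category']})")
+```
+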
+## Hands-On Exercise
+
+
+
+
+Next: Compliance Workflows
+
diff --git a/docs/learn-datahub/quality/data-assertions.md b/docs/learn-datahub/quality/data-assertions.md
new file mode 100644
index 00000000000000..fd76648cbcdba0
--- /dev/null
+++ b/docs/learn-datahub/quality/data-assertions.md
@@ -0,0 +1,367 @@
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+
+# Data Assertions
+
+
+
+## Building Automated Data Quality Checks
+
+**Time Required**: 15 minutes
+
+### The Assertion Challenge
+
+Your data pipelines are processing customer transactions, but you're discovering quality issues after they've already impacted business operations:
+
+- **Missing customer IDs** causing failed order processing
+- **Negative transaction amounts** appearing in financial reports
+- **Duplicate records** inflating customer metrics
+- **Stale data** making real-time dashboards unreliable
+
+**Real-World Impact**: Last week, a batch of transactions with null customer IDs caused the customer service system to crash, resulting in 4 hours of downtime and frustrated customers.
+
+### Understanding DataHub Assertions
+
+Assertions are automated quality checks that continuously validate your data against business rules:
+
+
+
+
+
+**Assertion Types**:
+
+- **Completeness**: Ensure required fields are not null or empty
+- **Uniqueness**: Validate primary keys and unique constraints
+- **Range Validation**: Check numeric values fall within expected bounds
+- **Freshness**: Verify data is updated within acceptable time windows
+- **Referential Integrity**: Ensure foreign key relationships are valid
+- **Custom Rules**: Implement business-specific validation logic
+
+### Exercise 1: Create Completeness Assertions
+
+Ensure critical fields always contain valid data:
+
+#### Step 1: Navigate to Assertions
+
+1. **Open DataHub** and search for "fct_users_created"
+2. **Click on the dataset** to open its profile page
+3. **Go to the "Quality" tab** and click "Add Assertion"
+4. **Select "Column Assertion"** to validate specific fields
+
+#### Step 2: Create Customer ID Completeness Check
+
+**Assertion Configuration**:
+
+- **Name**: "Customer ID Required"
+- **Description**: "All records must have a valid customer_id"
+- **Column**: `customer_id`
+- **Type**: "Not Null"
+- **Severity**: "Error" (blocks downstream processing)
+- **Schedule**: "Every 15 minutes"
+
+**SQL Logic**:
+
+```sql
+SELECT COUNT(*) as null_count
+FROM fct_users_created
+WHERE customer_id IS NULL
+ OR customer_id = ''
+```
+
+**Success Criteria**: `null_count = 0`
+
+#### Step 3: Add Email Validation
+
+**Assertion Configuration**:
+
+- **Name**: "Valid Email Format"
+- **Description**: "Email addresses must follow standard format"
+- **Column**: `email`
+- **Type**: "Custom SQL"
+- **Severity**: "Warning"
+
+**SQL Logic**:
+
+```sql
+SELECT COUNT(*) as invalid_emails
+FROM fct_users_created
+WHERE email IS NOT NULL
+ AND email NOT REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
+```
+
+**Success Criteria**: `invalid_emails = 0`
+
+### Exercise 2: Implement Range Validations
+
+Validate that numeric values fall within business-acceptable ranges:
+
+#### Step 1: Transaction Amount Validation
+
+For financial data, create bounds checking:
+
+**Assertion Configuration**:
+
+- **Name**: "Valid Transaction Amount"
+- **Description**: "Transaction amounts must be positive and reasonable"
+- **Column**: `transaction_amount`
+- **Type**: "Range Check"
+- **Min Value**: 0.01 (no zero or negative transactions)
+- **Max Value**: 100000.00 (flag unusually large transactions)
+- **Severity**: "Error"
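+
+Outside the UI, the same bounds check is a simple filter over query results. A minimal sketch (the inline rows stand in for however you fetch the table):
+
+```python
+# Hedged sketch of the range check above; bounds mirror the assertion config,
+# and the inline rows are placeholders for real query results.
+MIN_AMOUNT = 0.01
+MAX_AMOUNT = 100_000.00
+
+def out_of_range(rows):
+    """Return transactions whose amount falls outside the accepted range."""
+    return [
+        row for row in rows
+        if not (MIN_AMOUNT <= row["transaction_amount"] <= MAX_AMOUNT)
+    ]
+
+rows = [
+    {"transaction_id": 1, "transaction_amount": -5.00},
+    {"transaction_id": 2, "transaction_amount": 49.99},
+]
+
+violations = out_of_range(rows)
+print(f"{len(violations)} transaction(s) outside the accepted range")  # 1
+```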
+
+#### Step 2: Date Range Validation
+
+Ensure dates are realistic and current:
+
+**Assertion Configuration**:
+
+- **Name**: "Recent Transaction Date"
+- **Description**: "Transaction dates should be within the last 2 years"
+- **Column**: `transaction_date`
+- **Type**: "Custom SQL"
+- **Severity**: "Warning"
+
+**SQL Logic**:
+
+```sql
+SELECT COUNT(*) as invalid_dates
+FROM customer_transactions
+WHERE transaction_date < CURRENT_DATE - INTERVAL '2 years'
+ OR transaction_date > CURRENT_DATE + INTERVAL '1 day'
+```
+
+### Exercise 3: Create Uniqueness Assertions
+
+Prevent duplicate records that can skew analytics:
+
+#### Step 1: Primary Key Uniqueness
+
+**Assertion Configuration**:
+
+- **Name**: "Unique Transaction ID"
+- **Description**: "Each transaction must have a unique identifier"
+- **Column**: `transaction_id`
+- **Type**: "Uniqueness"
+- **Severity**: "Error"
+- **Action**: "Block pipeline on failure"
+
+#### Step 2: Business Key Uniqueness
+
+For composite business keys:
+
+**Assertion Configuration**:
+
+- **Name**: "Unique Customer-Date Combination"
+- **Description**: "One transaction per customer per day (business rule)"
+- **Type**: "Custom SQL"
+- **Severity**: "Warning"
+
+**SQL Logic**:
+
+```sql
+SELECT customer_id, DATE(transaction_date) as txn_date, COUNT(*) as duplicate_count
+FROM customer_transactions
+GROUP BY customer_id, DATE(transaction_date)
+HAVING COUNT(*) > 1
+```
+
+### Exercise 4: Implement Freshness Checks
+
+Ensure data is updated according to business requirements:
+
+#### Step 1: Data Freshness Assertion
+
+**Assertion Configuration**:
+
+- **Name**: "Customer Data Freshness"
+- **Description**: "Customer data must be updated within 4 hours"
+- **Type**: "Freshness"
+- **Column**: `last_updated_timestamp`
+- **Max Age**: "4 hours"
+- **Severity**: "Error"
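+
+Conceptually, this freshness check reduces to comparing the newest `last_updated_timestamp` against the allowed age. A minimal sketch (the timestamp value is illustrative):
+
+```python
+# Hedged sketch of the 4-hour freshness check; the "latest" value is illustrative.
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+MAX_AGE = timedelta(hours=4)
+
+def is_fresh(latest_update: datetime, now: Optional[datetime] = None) -> bool:
+    now = now or datetime.now(timezone.utc)
+    return now - latest_update <= MAX_AGE
+
+latest = datetime.now(timezone.utc) - timedelta(hours=5)  # illustrative value
+if not is_fresh(latest):
+    print("Freshness assertion failed: customer data is older than 4 hours")
+```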
+
+#### Step 2: Partition Freshness
+
+For partitioned tables:
+
+**SQL Logic**:
+
+```sql
+SELECT MAX(partition_date) as latest_partition
+FROM customer_transactions
+WHERE partition_date >= CURRENT_DATE - INTERVAL '1 day'
+```
+
+**Success Criteria**: `latest_partition >= CURRENT_DATE`
+
+### Exercise 5: Custom Business Rule Assertions
+
+Implement organization-specific validation logic:
+
+#### Step 1: Customer Lifecycle Validation
+
+**Business Rule**: "Customers must have a registration date before their first transaction"
+
+**Assertion Configuration**:
+
+- **Name**: "Valid Customer Lifecycle"
+- **Description**: "Registration must precede first transaction"
+- **Type**: "Custom SQL"
+- **Severity**: "Error"
+
+**SQL Logic**:
+
+```sql
+SELECT COUNT(*) as lifecycle_violations
+FROM customer_transactions ct
+JOIN customer_profiles cp ON ct.customer_id = cp.customer_id
+WHERE ct.transaction_date < cp.registration_date
+```
+
+#### Step 2: Revenue Recognition Rules
+
+**Business Rule**: "Subscription revenue must be recognized monthly"
+
+**SQL Logic**:
+
+```sql
+SELECT COUNT(*) as recognition_errors
+FROM revenue_transactions
+WHERE product_type = 'subscription'
+ AND recognition_method != 'monthly'
+```
+
+### Understanding Assertion Results
+
+DataHub provides comprehensive assertion monitoring:
+
+**Assertion Status Indicators**:
+
+- **Passing**: All validation rules met
+- **Warning**: Minor issues detected, investigate soon
+- **Failing**: Critical issues found, immediate attention required
+- **Paused**: Assertion temporarily disabled
+- **Running**: Currently executing validation
+
+**Assertion History**:
+
+- Track assertion results over time
+- Identify patterns in quality issues
+- Measure quality improvement trends
+- Generate compliance reports
+
+### Best Practices for Data Assertions
+
+#### 1. Start with Critical Business Rules
+
+Focus on assertions that protect:
+
+- Revenue calculations
+- Customer data integrity
+- Regulatory compliance requirements
+- Downstream system dependencies
+
+#### 2. Use Appropriate Severity Levels
+
+- **Error**: Critical issues that must block processing
+- **Warning**: Issues that need investigation but don't stop pipelines
+- **Info**: Monitoring checks for trend analysis
+
+#### 3. Optimize Assertion Performance
+
+- Use efficient SQL queries
+- Leverage table statistics when possible
+- Schedule assertions based on data update frequency
+- Consider sampling for large datasets
+
+#### 4. Provide Clear Context
+
+- Write descriptive assertion names and descriptions
+- Document business rationale for each rule
+- Include remediation guidance
+- Link to relevant business stakeholders
+
+### Measuring Assertion Effectiveness
+
+Track these key metrics:
+
+- **Assertion Coverage**: Percentage of critical columns with assertions
+- **Pass Rate**: Percentage of assertions passing over time
+- **Detection Speed**: Time from data issue to assertion failure
+- **False Positive Rate**: Assertions failing due to rule issues
+- **Business Impact Prevention**: Issues caught before affecting operations
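+
+Two of these metrics are simple ratios over assertion results. A minimal sketch (the result records and the set of critical columns are illustrative placeholders):
+
+```python
+# Hedged sketch: compute assertion coverage and pass rate from result records.
+results = [
+    {"assertion": "customer_id_not_null", "column": "customer_id", "status": "PASS"},
+    {"assertion": "valid_email_format", "column": "email", "status": "FAIL"},
+    {"assertion": "unique_transaction_id", "column": "transaction_id", "status": "PASS"},
+]
+critical_columns = {"customer_id", "email", "transaction_id", "transaction_amount"}
+
+covered = {r["column"] for r in results}
+coverage = len(covered & critical_columns) / len(critical_columns)
+pass_rate = sum(r["status"] == "PASS" for r in results) / len(results)
+
+print(f"Assertion coverage: {coverage:.0%}")  # 75%
+print(f"Pass rate: {pass_rate:.0%}")          # 67%
+```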
+
+### Advanced Assertion Techniques
+
+#### 1. Statistical Assertions
+
+Monitor data distributions and detect anomalies:
+
+```sql
+-- Detect unusual spikes in transaction volume
+SELECT COUNT(*) as daily_transactions
+FROM customer_transactions
+WHERE DATE(transaction_date) = CURRENT_DATE
+HAVING COUNT(*) > (
+ SELECT AVG(daily_count) * 2
+ FROM daily_transaction_stats
+ WHERE date >= CURRENT_DATE - INTERVAL '30 days'
+)
+```
+
+#### 2. Cross-Dataset Assertions
+
+Validate consistency across related datasets:
+
+```sql
+-- Compare distinct customer counts between systems
+SELECT ABS(
+  (SELECT COUNT(DISTINCT customer_id) FROM crm_customers) -
+  (SELECT COUNT(DISTINCT customer_id) FROM billing_customers)
+) as customer_count_diff
+```
+
+**Success Criteria**: `customer_count_diff <= 10` (a small variance between systems is acceptable)
+
+#### 3. Time-Series Assertions
+
+Monitor trends and seasonal patterns:
+
+```sql
+-- Detect unusual drops in daily active users vs. the same day last week
+WITH daily_dau AS (
+  SELECT date, COUNT(DISTINCT user_id) as dau
+  FROM user_activity
+  WHERE date >= CURRENT_DATE - INTERVAL '7 days'
+  GROUP BY date
+)
+SELECT today.dau as current_dau, week_ago.dau as dau_week_ago
+FROM daily_dau today
+JOIN daily_dau week_ago
+  ON week_ago.date = today.date - INTERVAL '7 days'
+WHERE today.date = CURRENT_DATE
+  AND today.dau < week_ago.dau * 0.8 -- 20% drop threshold
+```
+
+### Next Steps
+
+With automated assertions in place, you're ready to build comprehensive quality monitoring dashboards that provide real-time visibility into your data health.
+
+
diff --git a/docs/learn-datahub/quality/incident-management.md b/docs/learn-datahub/quality/incident-management.md
new file mode 100644
index 00000000000000..e500e031aa2656
--- /dev/null
+++ b/docs/learn-datahub/quality/incident-management.md
@@ -0,0 +1,363 @@
+# Incident Management
+
+
+
+## Rapid Response to Data Quality Issues
+
+**Time Required**: 10 minutes
+
+### The Incident Response Challenge
+
+Your quality monitoring is detecting issues, but your response process is still chaotic:
+
+- **Delayed notifications** mean issues impact business before teams respond
+- **Unclear ownership** leads to finger-pointing instead of resolution
+- **Manual escalation** processes that don't scale with your data growth
+- **No systematic learning** from incidents to prevent recurrence
+
+**Real-World Impact**: A data quality issue in the customer segmentation pipeline caused the marketing team to send promotional emails to churned customers, resulting in negative brand impact and a 2-day emergency response to identify and fix the root cause.
+
+### Understanding Incident Management
+
+Systematic incident management transforms chaotic fire-fighting into structured, efficient response:
+
+
+
+
+
+**Incident Management Components**:
+
+- **Automated Detection**: Intelligent alerting based on quality thresholds
+- **Structured Response**: Standardized workflows for different incident types
+- **SLA Management**: Time-bound response and resolution commitments
+- **Impact Assessment**: Business impact evaluation and prioritization
+- **Post-Incident Review**: Learning and improvement processes
+
+### Exercise 1: Set Up Incident Detection
+
+Configure intelligent alerting that triggers appropriate response levels:
+
+#### Step 1: Define Incident Severity Levels
+
+**Severity Classification**:
+
+- **Critical (P0)**: Complete data unavailability or major accuracy issues affecting revenue/customers
+- **High (P1)**: Significant quality degradation affecting business operations
+- **Medium (P2)**: Quality issues affecting specific use cases or reports
+- **Low (P3)**: Minor quality issues with workarounds available
+
+#### Step 2: Configure Automated Detection Rules
+
+**Critical Incident Triggers**:
+
+```sql
+-- Critical: Customer data pipeline failure
+SELECT COUNT(*) as missing_records
+FROM customer_daily_summary
+WHERE date = CURRENT_DATE
+HAVING COUNT(*) = 0; -- No records for today = P0 incident
+
+-- Critical: Financial data accuracy issue
+SELECT COUNT(*) as revenue_discrepancy
+FROM revenue_reconciliation
+WHERE ABS(system_a_total - system_b_total) > 10000 -- $10K+ discrepancy
+ AND reconciliation_date = CURRENT_DATE;
+```
+
+**High Priority Triggers**:
+
+```sql
+-- High: Significant data freshness delay
+SELECT MAX(last_updated) as latest_update
+FROM critical_datasets
+WHERE dataset_name = 'customer_transactions'
+HAVING latest_update < CURRENT_TIMESTAMP - INTERVAL '2 hours';
+
+-- High: Assertion failure rate spike
+SELECT failure_rate
+FROM (
+ SELECT COUNT(CASE WHEN status = 'FAIL' THEN 1 END) * 100.0 / COUNT(*) as failure_rate
+ FROM assertion_results
+ WHERE created_at >= CURRENT_TIMESTAMP - INTERVAL '1 hour'
+) as recent_results
+WHERE failure_rate > 15; -- >15% failure rate = High priority
+```
+
+### Exercise 2: Create Response Workflows
+
+Build structured response processes for different incident types:
+
+#### Step 1: Critical Incident Response Workflow
+
+**P0 Incident Response (Target: 15 minutes)**:
+
+1. **Immediate Actions (0-5 minutes)**:
+
+ - Automated page to on-call engineer
+ - Create incident ticket with severity P0
+ - Notify stakeholders via Slack #data-incidents
+ - Activate incident bridge/war room
+
+2. **Assessment Phase (5-15 minutes)**:
+
+ - Confirm incident scope and business impact
+ - Identify affected systems and downstream dependencies
+ - Assign incident commander
+ - Begin impact mitigation
+
+3. **Resolution Phase (15+ minutes)**:
+ - Implement immediate fixes or workarounds
+ - Monitor for resolution confirmation
+ - Communicate status updates every 30 minutes
+ - Document actions taken
+
+#### Step 2: Automated Incident Creation
+
+**Incident Ticket Template**:
+
+```
+Title: [P0] Customer Transaction Pipeline Failure - [Timestamp]
+
+INCIDENT DETAILS:
+- Severity: P0 (Critical)
+- Detected: [Automated Detection System]
+- Affected System: Customer Transaction Pipeline
+- Business Impact: Customer-facing applications unable to process payments
+
+TECHNICAL DETAILS:
+- Failed Assertion: "Customer ID Completeness Check"
+- Error Rate: 100% (0/1000 records passing)
+- Last Successful Run: [Timestamp]
+- Affected Records: ~50,000 transactions
+
+IMMEDIATE ACTIONS REQUIRED:
+1. Investigate data source connectivity
+2. Check upstream system status
+3. Implement emergency data bypass if needed
+4. Notify customer service team of potential impact
+
+STAKEHOLDERS:
+- Incident Commander: [On-call Engineer]
+- Technical Owner: payments.team@company.com
+- Business Owner: customer.success@company.com
+- Executive Sponsor: [VP Engineering] (for P0 incidents)
+```
+
+### Exercise 3: Implement Escalation Procedures
+
+Create automatic escalation when response targets are missed:
+
+#### Step 1: Time-Based Escalation
+
+**Escalation Timeline**:
+
+- **15 minutes**: No acknowledgment → Escalate to backup on-call
+- **30 minutes**: No progress update → Notify engineering manager
+- **1 hour**: Unresolved P0 → Escalate to VP Engineering
+- **2 hours**: Unresolved P0 → Executive notification
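+
+A minimal sketch of how this timeline can be encoded in an alerting job is shown below; the thresholds mirror the list above, and `notify()` is a placeholder for your paging or chat integration.
+
+```python
+# Hedged sketch of time-based escalation; a real job would also track
+# acknowledgment and progress updates, not just incident age.
+from datetime import datetime, timedelta, timezone
+
+ESCALATION_STEPS = [
+    (timedelta(minutes=15), "backup on-call engineer"),
+    (timedelta(minutes=30), "engineering manager"),
+    (timedelta(hours=1), "VP Engineering"),
+    (timedelta(hours=2), "executive sponsor"),
+]
+
+def notify(target: str, incident_id: str) -> None:
+    print(f"[{incident_id}] escalating to {target}")  # placeholder integration
+
+def escalate(incident_id: str, opened_at: datetime) -> None:
+    """Notify the highest escalation level reached for an unresolved incident."""
+    age = datetime.now(timezone.utc) - opened_at
+    reached = [target for threshold, target in ESCALATION_STEPS if age >= threshold]
+    if reached:
+        notify(reached[-1], incident_id)
+```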
+
+#### Step 2: Impact-Based Escalation
+
+**Business Impact Escalation**:
+
+```
+Revenue Impact > $100K/hour → Immediate C-level notification
+Customer-facing system down → Product team involvement
+Regulatory data affected → Compliance team notification
+Security implications → Security team involvement
+```
+
+### Exercise 4: Set Up Communication Protocols
+
+Ensure stakeholders receive appropriate information at the right time:
+
+#### Step 1: Stakeholder Communication Matrix
+
+**Communication Channels by Severity**:
+
+- **P0**: Slack #data-incidents + Email + Phone/Page
+- **P1**: Slack #data-quality + Email
+- **P2**: Slack #data-quality + Daily summary email
+- **P3**: Weekly quality report
+
+#### Step 2: Status Update Templates
+
+**Incident Status Update Template**:
+
+```
+INCIDENT UPDATE - [Incident ID] - [Time]
+
+STATUS: [Investigating/Mitigating/Resolved]
+IMPACT: [Brief business impact description]
+PROGRESS: [What has been done since last update]
+NEXT STEPS: [Immediate actions planned]
+ETA: [Expected resolution time]
+WORKAROUND: [Temporary solutions available]
+
+Technical Details: [Link to detailed technical updates]
+Questions: Contact [Incident Commander] in #data-incidents
+```
+
+### Exercise 5: Implement Post-Incident Reviews
+
+Learn from incidents to prevent recurrence:
+
+#### Step 1: Post-Incident Review Process
+
+**Review Timeline**:
+
+- **P0/P1**: Within 48 hours of resolution
+- **P2**: Within 1 week of resolution
+- **P3**: Monthly batch review
+
+**Review Agenda**:
+
+1. **Incident Timeline**: Detailed chronology of events
+2. **Root Cause Analysis**: Technical and process factors
+3. **Response Effectiveness**: What worked well and what didn't
+4. **Action Items**: Specific improvements to prevent recurrence
+5. **Process Updates**: Changes to monitoring, alerting, or procedures
+
+#### Step 2: Root Cause Analysis Framework
+
+**5 Whys Analysis Example**:
+
+```
+Problem: Customer segmentation data contained churned customers
+
+Why 1: Why did churned customers appear in active segments?
+→ The churn detection job failed to update customer status
+
+Why 2: Why did the churn detection job fail?
+→ The upstream CRM system had a schema change
+
+Why 3: Why didn't we detect the schema change?
+→ We don't have schema change monitoring on the CRM system
+
+Why 4: Why don't we have schema change monitoring?
+→ It wasn't considered critical for this data source
+
+Why 5: Why wasn't it considered critical?
+→ We lack a systematic approach to assessing data source criticality
+
+ROOT CAUSE: Missing systematic data source risk assessment
+ACTION ITEM: Implement data source criticality framework and monitoring
+```
+
+### Understanding Incident Metrics
+
+**Response Metrics**:
+
+- **Mean Time to Detection (MTTD)**: Time from issue occurrence to detection
+- **Mean Time to Acknowledgment (MTTA)**: Time from detection to human response
+- **Mean Time to Resolution (MTTR)**: Time from detection to full resolution
+- **Escalation Rate**: Percentage of incidents requiring escalation
+
+**Business Impact Metrics**:
+
+- **Revenue Impact**: Financial cost of data quality incidents
+- **Customer Impact**: Number of customers affected by incidents
+- **SLA Compliance**: Adherence to response time commitments
+- **Repeat Incidents**: Percentage of incidents that are recurring issues
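+
+The response metrics fall out directly from incident timestamps. A minimal sketch (the incident record is an illustrative placeholder):
+
+```python
+# Hedged sketch: compute MTTD, MTTA, and MTTR from incident timestamps.
+from datetime import datetime, timedelta
+
+incidents = [
+    {
+        "occurred_at": datetime(2024, 1, 5, 9, 0),
+        "detected_at": datetime(2024, 1, 5, 9, 12),
+        "acknowledged_at": datetime(2024, 1, 5, 9, 20),
+        "resolved_at": datetime(2024, 1, 5, 11, 0),
+    },
+]
+
+def mean_delta(pairs):
+    deltas = [end - start for start, end in pairs]
+    return sum(deltas, timedelta()) / len(deltas)
+
+mttd = mean_delta((i["occurred_at"], i["detected_at"]) for i in incidents)
+mtta = mean_delta((i["detected_at"], i["acknowledged_at"]) for i in incidents)
+mttr = mean_delta((i["detected_at"], i["resolved_at"]) for i in incidents)
+
+print(f"MTTD={mttd}  MTTA={mtta}  MTTR={mttr}")
+```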
+
+### Advanced Incident Management
+
+#### 1. Predictive Incident Detection
+
+Use machine learning to predict incidents before they occur:
+
+```sql
+-- Identify leading indicators of quality incidents
+WITH quality_trends AS (
+ SELECT
+ dataset_name,
+ date,
+ quality_score,
+ LAG(quality_score, 1) OVER (PARTITION BY dataset_name ORDER BY date) as prev_score,
+ LAG(quality_score, 7) OVER (PARTITION BY dataset_name ORDER BY date) as week_ago_score
+ FROM daily_quality_scores
+)
+SELECT dataset_name, quality_score, incident_risk
+FROM (
+  SELECT
+    dataset_name,
+    date,
+    quality_score,
+    CASE
+      WHEN quality_score < prev_score * 0.95 AND quality_score < week_ago_score * 0.90
+      THEN 'HIGH_RISK'
+      WHEN quality_score < prev_score * 0.98 AND quality_score < week_ago_score * 0.95
+      THEN 'MEDIUM_RISK'
+      ELSE 'LOW_RISK'
+    END as incident_risk
+  FROM quality_trends
+) scored
+WHERE date = CURRENT_DATE
+  AND incident_risk != 'LOW_RISK';
+```
+
+#### 2. Automated Remediation
+
+Implement self-healing responses for common issues:
+
+- **Data Refresh**: Automatically retry failed data loads
+- **Fallback Data**: Switch to backup data sources during outages
+- **Circuit Breakers**: Temporarily disable problematic data flows
+- **Auto-Scaling**: Increase resources during processing spikes
+
+#### 3. Cross-Team Coordination
+
+Integrate with broader incident management:
+
+- **ServiceNow Integration**: Link data incidents to IT service management
+- **PagerDuty Coordination**: Align with infrastructure incident response
+- **Slack Workflows**: Automate cross-team communication
+- **Jira Integration**: Track incident resolution as development work
+
+### Incident Management Best Practices
+
+#### 1. Prepare for Success
+
+- **Runbooks**: Document common incident types and responses
+- **Training**: Regular incident response drills and training
+- **Tools**: Ensure all responders have access to necessary systems
+- **Communication**: Pre-established channels and contact lists
+
+#### 2. Focus on Resolution
+
+- **Triage Effectively**: Prioritize based on business impact, not technical complexity
+- **Communicate Clearly**: Regular updates reduce stakeholder anxiety
+- **Document Everything**: Detailed logs enable effective post-incident analysis
+- **Learn Continuously**: Every incident is an opportunity to improve
+
+#### 3. Build Resilience
+
+- **Redundancy**: Multiple detection methods and backup systems
+- **Graceful Degradation**: Systems that fail safely with reduced functionality
+- **Quick Recovery**: Automated recovery procedures where possible
+- **Continuous Improvement**: Regular review and enhancement of processes
+
+### Next Steps
+
+With robust incident management in place, you're ready to implement quality automation that prevents issues before they occur and reduces the need for manual intervention.
+
+
diff --git a/docs/learn-datahub/quality/overview.md b/docs/learn-datahub/quality/overview.md
new file mode 100644
index 00000000000000..52f3ddc90f38b6
--- /dev/null
+++ b/docs/learn-datahub/quality/overview.md
@@ -0,0 +1,212 @@
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Data Quality & Monitoring
+
+
+
+## Professional Data Quality Management
+
+**Time Required**: 45 minutes | **Skill Level**: Intermediate
+
+### Your Challenge: Ensuring Data Reliability at Scale
+
+You're a **Data Platform Engineer** at a fast-growing company. Your data pipelines process millions of records daily, feeding critical business dashboards, ML models, and customer-facing applications. However, data quality issues are becoming frequent:
+
+- **Executive dashboards** showing incorrect revenue numbers
+- **ML models** making poor predictions due to data drift
+- **Customer applications** failing due to missing or malformed data
+- **Compliance reports** containing inaccurate information
+
+**The Business Impact**: A recent data quality incident caused the executive team to make a $5M investment decision based on incorrect customer churn metrics, highlighting the critical need for proactive data quality management.
+
+### What You'll Learn
+
+This tutorial series teaches you to implement comprehensive data quality monitoring using DataHub's quality management features:
+
+#### Chapter 1: Data Assertions (15 minutes)
+
+**Business Challenge**: No early warning system for data quality problems
+**Your Journey**:
+
+- Create automated data quality checks (completeness, uniqueness, range validation)
+- Set up custom business rule assertions
+- Configure assertion scheduling and execution
+ **Organizational Outcome**: Proactive detection of data quality issues before they impact business
+
+#### Chapter 2: Quality Monitoring (12 minutes)
+
+**Business Challenge**: Reactive approach to data quality management
+**Your Journey**:
+
+- Build comprehensive quality dashboards
+- Set up real-time quality monitoring
+- Create quality scorecards for different data domains
+ **Organizational Outcome**: Continuous visibility into data health across the organization
+
+#### Chapter 3: Incident Management (10 minutes)
+
+**Business Challenge**: Slow response to data quality incidents
+**Your Journey**:
+
+- Implement automated incident detection and alerting
+- Set up escalation procedures for critical quality failures
+- Create incident response workflows
+ **Organizational Outcome**: Rapid resolution of data quality issues with minimal business impact
+
+#### Chapter 4: Quality Automation (8 minutes)
+
+**Business Challenge**: Manual quality processes that don't scale
+**Your Journey**:
+
+- Automate quality validation in data pipelines
+- Set up quality gates for data promotion
+- Implement self-healing data quality processes
+ **Organizational Outcome**: Scalable quality management that prevents issues rather than just detecting them
+
+### Interactive Learning Experience
+
+Each chapter includes:
+
+- **Real Quality Scenarios**: Based on actual production data quality challenges
+- **Hands-on Exercises**: Using DataHub's sample data with realistic quality issues
+- **Best Practice Implementation**: Industry-standard approaches to data quality
+- **Measurable Outcomes**: Clear metrics for quality improvement
+
+### Understanding Data Quality Impact
+
+Poor data quality costs organizations an average of **$15 million annually** through:
+
+- **Operational Inefficiency**: Teams spending 40% of their time cleaning data
+- **Poor Decision Making**: Executives losing trust in data-driven insights
+- **Customer Experience**: Applications failing due to data issues
+- **Compliance Risk**: Regulatory penalties for inaccurate reporting
+
+### DataHub Quality Features Overview
+
+DataHub provides comprehensive quality management through:
+
+
+
+
+
+
+
+**Key Quality Capabilities**:
+
+- **Automated Assertions**: Continuous validation of data quality rules
+- **Quality Dashboards**: Real-time visibility into data health
+- **Intelligent Alerting**: Smart notifications based on quality thresholds
+- **Trend Analysis**: Historical quality metrics and improvement tracking
+- **Pipeline Integration**: Quality gates in data processing workflows
+
+### Prerequisites
+
+- Completed [DataHub Quickstart](../quickstart/overview.md)
+- Basic understanding of data pipelines and SQL
+- Access to DataHub instance with sample data
+- Familiarity with data quality concepts
+
+### Quality Management Maturity Levels
+
+
+
+### Ready to Begin?
+
+Start your data quality journey by implementing automated assertions that catch quality issues before they impact your business.
+
+
diff --git a/docs/learn-datahub/quality/quality-automation.md b/docs/learn-datahub/quality/quality-automation.md
new file mode 100644
index 00000000000000..2a4909c9c983dc
--- /dev/null
+++ b/docs/learn-datahub/quality/quality-automation.md
@@ -0,0 +1,572 @@
+# Quality Automation
+
+
+
+## Preventing Issues Through Automation
+
+**Time Required**: 8 minutes
+
+### The Automation Challenge
+
+Your incident management is working well, but you're still fighting fires instead of preventing them:
+
+- **Reactive quality management** that catches issues after they occur
+- **Manual quality gates** that slow down data pipeline deployments
+- **Inconsistent quality standards** across different teams and projects
+- **Quality debt** accumulating as teams prioritize speed over reliability
+
+**Real-World Impact**: Your data engineering team spends 40% of their time on quality-related issues that could be prevented through better automation, reducing their capacity for strategic data platform improvements.
+
+### Understanding Quality Automation
+
+Quality automation shifts from reactive incident response to proactive issue prevention:
+
+**Automation Layers**:
+
+- **Pipeline Integration**: Quality checks embedded in data processing workflows
+- **Quality Gates**: Automated approval/rejection of data based on quality criteria
+- **Self-Healing**: Automatic remediation of common quality issues
+- **Continuous Improvement**: ML-driven optimization of quality processes
+- **Preventive Monitoring**: Early detection of quality degradation patterns
+
+### Exercise 1: Implement Pipeline Quality Gates
+
+Embed quality validation directly into your data pipelines:
+
+#### Step 1: Pre-Processing Quality Gates
+
+**Data Ingestion Quality Gate**:
+
+```python
+# Example: Airflow DAG with quality gates
+from airflow import DAG
+from airflow.operators.python import PythonOperator
+from datahub_quality import QualityGate  # illustrative quality-gate helper used in this example
+
+def validate_source_data(**context):
+ """Validate incoming data before processing"""
+ quality_gate = QualityGate(
+ dataset="raw_customer_data",
+ checks=[
+ "completeness_check",
+ "schema_validation",
+ "freshness_check"
+ ]
+ )
+
+ result = quality_gate.execute()
+ if not result.passed:
+ raise ValueError(f"Quality gate failed: {result.failures}")
+
+ return result.quality_score
+
+# DAG definition
+dag = DAG('customer_data_pipeline')
+
+# Quality gate before processing
+quality_check = PythonOperator(
+ task_id='validate_source_quality',
+ python_callable=validate_source_data,
+ dag=dag
+)
+
+# Data processing only runs if quality passes
+process_data = PythonOperator(
+ task_id='process_customer_data',
+ python_callable=process_data_function,
+ dag=dag
+)
+
+quality_check >> process_data # Quality gate blocks processing
+```
+
+#### Step 2: Post-Processing Quality Gates
+
+**Output Validation Gate**:
+
+```python
+def validate_output_quality(**context):
+ """Validate processed data before publishing"""
+ quality_checks = [
+ {
+ "name": "record_count_validation",
+ "query": """
+ SELECT COUNT(*) as record_count
+ FROM processed_customer_data
+ WHERE processing_date = CURRENT_DATE
+ """,
+ "expected_min": 10000, # Expect at least 10K records
+ "expected_max": 1000000 # But not more than 1M
+ },
+ {
+ "name": "revenue_reconciliation",
+ "query": """
+ SELECT ABS(
+ (SELECT SUM(amount) FROM processed_transactions) -
+ (SELECT SUM(amount) FROM source_transactions)
+ ) as revenue_diff
+ """,
+ "expected_max": 100 # Revenue difference < $100
+ }
+ ]
+
+ for check in quality_checks:
+ result = execute_quality_check(check)
+ if not result.passed:
+ # Block publication and alert stakeholders
+ send_quality_alert(check, result)
+ raise QualityGateFailure(f"Failed: {check['name']}")
+```
+
+### Exercise 2: Set Up Automated Data Validation
+
+Create comprehensive validation that runs automatically:
+
+#### Step 1: Schema Evolution Validation
+
+**Automated Schema Change Detection**:
+
+```sql
+-- Detect breaking schema changes
+WITH schema_changes AS (
+ SELECT
+ table_name,
+ column_name,
+ data_type,
+ is_nullable,
+ LAG(data_type) OVER (PARTITION BY table_name, column_name ORDER BY schema_version) as prev_type,
+ LAG(is_nullable) OVER (PARTITION BY table_name, column_name ORDER BY schema_version) as prev_nullable
+ FROM schema_history
+ WHERE schema_date >= CURRENT_DATE - INTERVAL '7 days'
+)
+SELECT
+ table_name,
+ column_name,
+ 'BREAKING_CHANGE' as change_type,
+ CASE
+ WHEN data_type != prev_type THEN 'Data type changed'
+ WHEN is_nullable = 'NO' AND prev_nullable = 'YES' THEN 'Column became non-nullable'
+ END as change_description
+FROM schema_changes
+WHERE (data_type != prev_type OR (is_nullable = 'NO' AND prev_nullable = 'YES'))
+ AND prev_type IS NOT NULL;
+```
+
+**Automated Response**:
+
+- Block deployment if breaking changes detected
+- Require explicit approval from data owners
+- Generate impact analysis for downstream consumers
+- Create migration tasks for affected systems
+
+#### Step 2: Business Rule Validation
+
+**Automated Business Logic Checks**:
+
+```python
+class BusinessRuleValidator:
+ def __init__(self, dataset_name):
+ self.dataset = dataset_name
+ self.rules = self.load_business_rules()
+
+ def validate_customer_lifecycle(self):
+ """Ensure customer data follows business logic"""
+ violations = []
+
+ # Rule: Registration date must precede first purchase
+ query = """
+ SELECT customer_id, registration_date, first_purchase_date
+ FROM customer_summary
+ WHERE first_purchase_date < registration_date
+ """
+
+ results = execute_query(query)
+ if results:
+ violations.append({
+ "rule": "customer_lifecycle_order",
+ "violations": len(results),
+ "severity": "ERROR"
+ })
+
+ return violations
+
+ def validate_financial_consistency(self):
+ """Ensure financial calculations are consistent"""
+ # Rule: Order total must equal sum of line items
+ query = """
+ SELECT
+ order_id,
+ order_total,
+ calculated_total,
+ ABS(order_total - calculated_total) as difference
+ FROM (
+ SELECT
+ o.order_id,
+ o.total_amount as order_total,
+ SUM(li.quantity * li.unit_price) as calculated_total
+ FROM orders o
+ JOIN line_items li ON o.order_id = li.order_id
+ WHERE o.order_date = CURRENT_DATE
+ GROUP BY o.order_id, o.total_amount
+ )
+ WHERE ABS(order_total - calculated_total) > 0.01
+ """
+
+ return self.check_rule(query, "financial_consistency")
+```
+
+### Exercise 3: Implement Self-Healing Mechanisms
+
+Create systems that automatically fix common quality issues:
+
+#### Step 1: Automated Data Repair
+
+**Common Data Fixes**:
+
+```python
+class DataRepairEngine:
+ def __init__(self):
+ self.repair_strategies = {
+ "missing_values": self.handle_missing_values,
+ "duplicate_records": self.handle_duplicates,
+ "format_inconsistencies": self.standardize_formats,
+ "referential_integrity": self.fix_foreign_keys
+ }
+
+ def handle_missing_values(self, dataset, column, strategy="default"):
+ """Automatically handle missing values"""
+ strategies = {
+ "default": f"UPDATE {dataset} SET {column} = 'UNKNOWN' WHERE {column} IS NULL",
+ "previous": f"""
+ UPDATE {dataset} SET {column} = (
+ SELECT {column} FROM {dataset} t2
+ WHERE t2.id < {dataset}.id AND t2.{column} IS NOT NULL
+ ORDER BY t2.id DESC LIMIT 1
+ ) WHERE {column} IS NULL
+ """,
+ "statistical": f"""
+ UPDATE {dataset} SET {column} = (
+ SELECT AVG({column}) FROM {dataset} WHERE {column} IS NOT NULL
+ ) WHERE {column} IS NULL
+ """
+ }
+
+ return strategies.get(strategy, strategies["default"])
+
+ def handle_duplicates(self, dataset, key_columns):
+ """Remove duplicate records automatically"""
+ return f"""
+ DELETE FROM {dataset}
+ WHERE id NOT IN (
+ SELECT MIN(id)
+ FROM {dataset}
+ GROUP BY {', '.join(key_columns)}
+ )
+ """
+```
+
+#### Step 2: Automated Pipeline Recovery
+
+**Pipeline Self-Healing**:
+
+```python
+import time
+
+class PipelineRecoveryManager:
+ def __init__(self):
+ self.recovery_strategies = [
+ self.retry_with_backoff,
+ self.switch_to_backup_source,
+ self.use_cached_data,
+ self.trigger_manual_intervention
+ ]
+
+ def retry_with_backoff(self, pipeline_id, error):
+ """Retry failed pipeline with exponential backoff"""
+ max_retries = 3
+ base_delay = 60 # seconds
+
+ for attempt in range(max_retries):
+ delay = base_delay * (2 ** attempt)
+ time.sleep(delay)
+
+ try:
+ result = execute_pipeline(pipeline_id)
+ if result.success:
+ log_recovery_success(pipeline_id, attempt + 1)
+ return result
+ except Exception as e:
+ log_retry_attempt(pipeline_id, attempt + 1, str(e))
+
+ return self.switch_to_backup_source(pipeline_id, error)
+
+ def switch_to_backup_source(self, pipeline_id, error):
+ """Switch to backup data source during outages"""
+ backup_config = get_backup_configuration(pipeline_id)
+ if backup_config:
+ try:
+ result = execute_pipeline_with_backup(pipeline_id, backup_config)
+ alert_backup_usage(pipeline_id, backup_config)
+ return result
+ except Exception as e:
+ log_backup_failure(pipeline_id, str(e))
+
+ return self.use_cached_data(pipeline_id, error)
+```
+
+### Exercise 4: Create Continuous Quality Improvement
+
+Use machine learning to continuously optimize quality processes:
+
+#### Step 1: Quality Pattern Analysis
+
+**ML-Driven Quality Insights**:
+
+```python
+class QualityMLAnalyzer:
+ def __init__(self):
+ self.model = load_quality_prediction_model()
+
+ def predict_quality_issues(self, dataset_features):
+ """Predict potential quality issues before they occur"""
+ features = [
+ dataset_features['record_count_trend'],
+ dataset_features['schema_change_frequency'],
+ dataset_features['source_system_health'],
+ dataset_features['processing_complexity'],
+ dataset_features['historical_failure_rate']
+ ]
+
+ risk_score = self.model.predict_proba([features])[0][1]
+
+ if risk_score > 0.8:
+ return {
+ "risk_level": "HIGH",
+ "recommended_actions": [
+ "Increase monitoring frequency",
+ "Add additional quality checks",
+ "Schedule proactive maintenance"
+ ]
+ }
+ elif risk_score > 0.6:
+ return {
+ "risk_level": "MEDIUM",
+ "recommended_actions": [
+ "Review recent changes",
+ "Validate upstream dependencies"
+ ]
+ }
+
+ return {"risk_level": "LOW", "recommended_actions": []}
+
+ def optimize_assertion_thresholds(self, assertion_history):
+ """Automatically tune assertion thresholds to reduce false positives"""
+ optimal_thresholds = {}
+
+ for assertion_id, history in assertion_history.items():
+ # Analyze false positive rate vs detection effectiveness
+ false_positive_rate = calculate_false_positive_rate(history)
+ detection_effectiveness = calculate_detection_rate(history)
+
+ # Find optimal threshold that minimizes false positives while maintaining detection
+ optimal_threshold = find_optimal_threshold(
+ false_positive_rate,
+ detection_effectiveness,
+ target_fp_rate=0.05 # 5% false positive target
+ )
+
+ optimal_thresholds[assertion_id] = optimal_threshold
+
+ return optimal_thresholds
+```
+
+#### Step 2: Automated Quality Recommendations
+
+**Intelligent Quality Suggestions**:
+
+```python
+class QualityRecommendationEngine:
+ def generate_recommendations(self, dataset_profile):
+ """Generate quality improvement recommendations"""
+ recommendations = []
+
+ # Analyze data patterns
+ if dataset_profile['null_percentage'] > 10:
+ recommendations.append({
+ "type": "DATA_COMPLETENESS",
+ "priority": "HIGH",
+ "description": f"High null rate ({dataset_profile['null_percentage']}%) detected",
+ "suggested_actions": [
+ "Add completeness assertions",
+ "Investigate upstream data source",
+ "Implement default value strategy"
+ ]
+ })
+
+ # Analyze quality trends
+ if dataset_profile['quality_trend'] == 'DECLINING':
+ recommendations.append({
+ "type": "QUALITY_DEGRADATION",
+ "priority": "MEDIUM",
+ "description": "Quality scores declining over past 30 days",
+ "suggested_actions": [
+ "Review recent pipeline changes",
+ "Increase assertion frequency",
+ "Schedule data source health check"
+ ]
+ })
+
+ return recommendations
+```
+
+### Exercise 5: Implement Quality-Driven Development
+
+Integrate quality into the development lifecycle:
+
+#### Step 1: Quality-First Pipeline Development
+
+**Quality-Driven Development Process**:
+
+1. **Quality Requirements**: Define quality criteria before development
+2. **Quality by Design**: Build quality checks into pipeline architecture
+3. **Quality Testing**: Test quality scenarios in development environments
+4. **Quality Gates**: Automated quality validation in CI/CD pipelines
+5. **Quality Monitoring**: Continuous quality tracking in production
+
+#### Step 2: Automated Quality Testing
+
+**Quality Test Framework**:
+
+```python
+class QualityTestSuite:
+ def __init__(self, pipeline_config):
+ self.pipeline = pipeline_config
+ self.test_data = load_test_datasets()
+
+ def test_data_completeness(self):
+ """Test that pipeline handles incomplete data correctly"""
+ # Inject missing values into test data
+ test_data_with_nulls = inject_nulls(self.test_data, percentage=20)
+
+ result = run_pipeline_with_data(self.pipeline, test_data_with_nulls)
+
+ assert result.completeness_score >= 0.95, "Pipeline should handle missing data"
+ assert result.error_count == 0, "No processing errors expected"
+
+ def test_schema_evolution(self):
+ """Test pipeline resilience to schema changes"""
+ # Test with added columns
+ extended_schema = add_columns(self.test_data.schema, ["new_column"])
+ result = run_pipeline_with_schema(self.pipeline, extended_schema)
+ assert result.success, "Pipeline should handle new columns gracefully"
+
+ # Test with removed columns (should fail gracefully)
+ reduced_schema = remove_columns(self.test_data.schema, ["optional_column"])
+ result = run_pipeline_with_schema(self.pipeline, reduced_schema)
+ assert result.handled_gracefully, "Pipeline should detect missing columns"
+```
+
+### Understanding Automation ROI
+
+**Quality Automation Benefits**:
+
+- **Reduced Manual Effort**: 60-80% reduction in manual quality management tasks
+- **Faster Issue Detection**: Issues caught in minutes instead of hours/days
+- **Improved Reliability**: 90%+ reduction in quality-related production incidents
+- **Increased Confidence**: Teams can deploy changes with confidence in quality
+- **Cost Savings**: Significant reduction in quality-related operational costs
+
+**Measuring Automation Success**:
+
+- **Automation Coverage**: Percentage of quality processes automated
+- **Prevention Rate**: Issues prevented vs. issues detected after occurrence
+- **Time to Resolution**: Speed improvement in quality issue resolution
+- **False Positive Rate**: Accuracy of automated quality detection
+- **Developer Productivity**: Time saved on manual quality tasks
+
+### Advanced Automation Techniques
+
+#### 1. Federated Quality Management
+
+Distribute quality management across teams while maintaining standards:
+
+- **Team-Specific Rules**: Allow teams to define domain-specific quality criteria
+- **Central Governance**: Maintain organization-wide quality standards
+- **Automated Compliance**: Ensure local rules align with global policies (see the sketch after this list)
+- **Quality Metrics Aggregation**: Roll up team metrics to organizational dashboards
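+
+To make the automated compliance idea concrete, here is a small, self-contained sketch that flags team-level rules looser than the organization-wide minimums. The rule structure is illustrative, not a DataHub schema.
+
+```python
+# Organization-wide minimums (illustrative values)
+GLOBAL_POLICY = {
+    "min_completeness": 0.95,
+    "max_null_rate": 0.05,
+}
+
+def validate_team_rules(team_rules):
+    """Return violations where team-level rules are looser than global policy."""
+    violations = []
+    if team_rules.get("min_completeness", 1.0) < GLOBAL_POLICY["min_completeness"]:
+        violations.append("completeness threshold below organization minimum")
+    if team_rules.get("max_null_rate", 0.0) > GLOBAL_POLICY["max_null_rate"]:
+        violations.append("null-rate tolerance above organization maximum")
+    return violations
+
+# Example: a team proposing a looser completeness threshold
+print(validate_team_rules({"min_completeness": 0.90, "max_null_rate": 0.05}))
+# -> ['completeness threshold below organization minimum']
+```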
+
+#### 2. Real-Time Quality Streaming
+
+Implement quality validation in streaming data pipelines:
+
+```python
+# Streaming quality validation (e.g., inside a Kafka consumer or stream processor)
+class StreamingQualityProcessor:
+ def process_record(self, record):
+ """Validate each record in real-time"""
+ quality_result = validate_record(record)
+
+ if quality_result.passed:
+ return record # Forward to downstream
+ else:
+ # Route to dead letter queue for investigation
+ send_to_dlq(record, quality_result.errors)
+ emit_quality_metric("validation_failure", 1)
+ return None
+```
+
+### Quality Automation Best Practices
+
+#### 1. Start Simple, Scale Gradually
+
+- Begin with high-impact, low-complexity automation
+- Prove value with pilot projects before organization-wide rollout
+- Build automation incrementally based on lessons learned
+
+#### 2. Balance Automation and Human Oversight
+
+- Automate routine quality checks and responses
+- Maintain human decision-making for complex quality issues
+- Provide override mechanisms for exceptional cases
+
+#### 3. Design for Maintainability
+
+- Create modular, reusable quality components
+- Document automation logic and decision criteria
+- Plan for automation updates as business rules evolve
+
+### Congratulations!
+
+You've successfully implemented a comprehensive data quality management framework using DataHub. Your organization now has:
+
+- **Automated Quality Checks**: Proactive detection of quality issues
+- **Real-time Monitoring**: Continuous visibility into data health
+- **Rapid Incident Response**: Structured processes for quality issues
+- **Preventive Automation**: Systems that prevent issues before they occur
+
+### Next Steps in Your Quality Journey
+
+1. **Expand Coverage**: Apply quality automation to additional data domains
+2. **Advanced Analytics**: Implement ML-driven quality optimization
+3. **Cross-Platform Integration**: Extend quality management across your data ecosystem
+4. **Culture Development**: Build a quality-first mindset across data teams
+
+Your data quality foundation is now ready to support reliable, trustworthy data at scale.
+
+## Continue Learning
+
+Ready to explore more DataHub capabilities? Check out these related tutorials:
+
+- [Data Ingestion Mastery](../ingestion/overview.md) - Advanced data integration techniques
+- [Privacy & Compliance](../privacy/overview.md) - Comprehensive privacy protection
+- [Data Governance Fundamentals](../governance/overview.md) - Review governance best practices
+
+
diff --git a/docs/learn-datahub/quality/quality-monitoring.md b/docs/learn-datahub/quality/quality-monitoring.md
new file mode 100644
index 00000000000000..017456b17ce795
--- /dev/null
+++ b/docs/learn-datahub/quality/quality-monitoring.md
@@ -0,0 +1,387 @@
+import DataHubEntityCard from '@site/src/components/DataHubEntityCard';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import NextStepButton from '@site/src/components/NextStepButton';
+import DataHubLineageNode, { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode';
+
+# Quality Monitoring
+
+
+
+## Building Comprehensive Quality Dashboards
+
+**Time Required**: 12 minutes
+
+### The Monitoring Challenge
+
+You've implemented data assertions, but you need visibility into quality trends across your entire data landscape:
+
+- **Scattered quality information** across different systems and teams
+- **Reactive approach** - discovering issues only when stakeholders complain
+- **No quality trends** to identify deteriorating data sources
+- **Lack of accountability** for quality improvements
+
+**Real-World Impact**: Your CEO asked for a "data quality report" for the board meeting, but it took your team 3 days to manually gather quality metrics from various sources, and the information was already outdated by presentation time.
+
+### Understanding Quality Monitoring
+
+Quality monitoring provides continuous visibility into data health across your organization:
+
+
+
+**Monitoring Capabilities**:
+
+- **Real-time Dashboards**: Live quality metrics across all data assets
+- **Trend Analysis**: Historical quality patterns and improvement tracking
+- **Quality Scorecards**: Domain-specific quality assessments
+- **Proactive Alerting**: Early warning system for quality degradation
+- **Executive Reporting**: Summary views for leadership and stakeholders
+
+### Exercise 1: Create Quality Dashboards
+
+Build comprehensive dashboards for different stakeholder needs:
+
+#### Step 1: Executive Quality Dashboard
+
+Create a high-level view for leadership:
+
+1. **Navigate to Analytics** → **Quality Dashboards**
+2. **Create New Dashboard**: "Executive Data Quality Overview"
+3. **Add Key Metrics**:
+ - Overall quality score (percentage of passing assertions)
+ - Critical data assets health status
+ - Quality trend over last 90 days
+ - Top quality issues by business impact
+
+**Executive Dashboard Preview**:
+
+
+
+
+
+
+
+**Quality Metrics Summary**:
+
+- **Overall Quality Score**: 94.2% ↑ (+2.1% vs last month)
+- **Critical Assets**: Customer Data (98.5%), Financial Data (89.2% - needs attention)
+- **Trending Issues**: Payment processing delays, email validation failures
+
+#### Step 2: Operational Quality Dashboard
+
+Create detailed views for data teams:
+
+**Dashboard Configuration**:
+
+- **Name**: "Data Engineering Quality Operations"
+- **Refresh**: Every 5 minutes
+- **Scope**: All production datasets
+
+**Key Sections**:
+
+1. **Real-time Assertion Status**
+2. **Pipeline Quality Health**
+3. **Data Freshness Monitoring**
+4. **Quality Issue Queue**
+
+### Exercise 2: Set Up Quality Scorecards
+
+Create domain-specific quality assessments:
+
+#### Step 1: Customer Domain Scorecard
+
+**Scorecard Configuration**:
+
+- **Domain**: Customer Data
+- **Assets**: Customer profiles, transactions, interactions
+- **Quality Dimensions**:
+ - Completeness (weight: 30%)
+ - Accuracy (weight: 25%)
+ - Consistency (weight: 20%)
+ - Timeliness (weight: 15%)
+ - Validity (weight: 10%)
+
+**Scoring Logic**:
+
+```
+Customer Domain Quality Score =
+ (Completeness × 0.30) +
+ (Accuracy × 0.25) +
+ (Consistency × 0.20) +
+ (Timeliness × 0.15) +
+ (Validity × 0.10)
+```
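+
+In code, the same weighted score is a simple dot product. A minimal sketch, assuming each dimension is scored on a 0-100 scale:
+
+```python
+# Dimension weights from the scorecard above
+WEIGHTS = {
+    "completeness": 0.30,
+    "accuracy": 0.25,
+    "consistency": 0.20,
+    "timeliness": 0.15,
+    "validity": 0.10,
+}
+
+def domain_quality_score(dimension_scores):
+    """Weighted average of dimension scores (each on a 0-100 scale)."""
+    return sum(WEIGHTS[dim] * dimension_scores[dim] for dim in WEIGHTS)
+
+# Example: a customer-domain snapshot (illustrative numbers)
+print(domain_quality_score({
+    "completeness": 97.0,
+    "accuracy": 94.5,
+    "consistency": 92.0,
+    "timeliness": 90.0,
+    "validity": 99.0,
+}))  # -> 94.525
+```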
+
+#### Step 2: Financial Domain Scorecard
+
+**Enhanced Requirements for Financial Data**:
+
+- **Regulatory Compliance**: SOX, GAAP adherence
+- **Audit Trail**: Complete lineage and change tracking
+- **Precision**: Exact decimal calculations
+- **Reconciliation**: Cross-system balance validation
+
+### Exercise 3: Implement Trend Analysis
+
+Monitor quality patterns over time:
+
+#### Step 1: Quality Trend Monitoring
+
+**Trend Metrics to Track**:
+
+- Daily assertion pass rates
+- Weekly quality score changes
+- Monthly quality improvement goals
+- Quarterly compliance assessments
+
+**Trend Analysis Queries**:
+
+```sql
+-- Daily quality trend
+SELECT
+ date,
+ COUNT(CASE WHEN status = 'PASS' THEN 1 END) * 100.0 / COUNT(*) as pass_rate,
+ COUNT(*) as total_assertions
+FROM assertion_results
+WHERE date >= CURRENT_DATE - INTERVAL '30 days'
+GROUP BY date
+ORDER BY date;
+
+-- Quality improvement by domain
+SELECT
+ domain,
+ AVG(CASE WHEN date >= CURRENT_DATE - INTERVAL '7 days' THEN quality_score END) as current_week,
+ AVG(CASE WHEN date >= CURRENT_DATE - INTERVAL '14 days'
+ AND date < CURRENT_DATE - INTERVAL '7 days' THEN quality_score END) as previous_week
+FROM domain_quality_scores
+GROUP BY domain;
+```
+
+#### Step 2: Seasonal Pattern Detection
+
+Identify recurring quality patterns:
+
+- **End-of-month** data processing spikes
+- **Holiday periods** with reduced data volumes
+- **Business cycle** impacts on data quality
+- **System maintenance** windows affecting freshness
+
+### Exercise 4: Create Quality Alerts
+
+Set up intelligent alerting for quality issues:
+
+#### Step 1: Threshold-Based Alerts
+
+**Alert Configuration**:
+
+- **Critical Alert**: Overall quality drops below 90%
+- **Warning Alert**: Domain quality drops below 95%
+- **Info Alert**: New quality issues detected
+
+**Alert Channels**:
+
+- Slack integration for immediate team notification
+- Email summaries for daily quality reports
+- PagerDuty integration for critical production issues
+- Jira ticket creation for tracking resolution
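+
+Putting the thresholds and channels above together, here is a minimal routing sketch. The channel names are illustrative, and in practice each route would call the corresponding Slack, PagerDuty, or Jira API:
+
+```python
+def classify_alert(overall_score, domain_scores):
+    """Map quality scores (percentages) to an alert level using the thresholds above."""
+    if overall_score < 90:
+        return "CRITICAL"
+    if any(score < 95 for score in domain_scores.values()):
+        return "WARNING"
+    return "INFO"
+
+# Illustrative routing table: replace with real integrations
+ROUTES = {
+    "CRITICAL": ["slack:#data-quality", "pagerduty:data-oncall"],
+    "WARNING": ["slack:#data-quality", "email:daily-quality-summary"],
+    "INFO": ["email:daily-quality-summary"],
+}
+
+level = classify_alert(93.1, {"customer": 96.1, "financial": 89.2})
+print(level, ROUTES[level])  # WARNING ['slack:#data-quality', 'email:daily-quality-summary']
+```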
+
+#### Step 2: Anomaly Detection Alerts
+
+**Statistical Alerting**:
+
+```sql
+-- Detect unusual assertion failure rates
+WITH daily_stats AS (
+ SELECT
+ date,
+ COUNT(CASE WHEN status = 'FAIL' THEN 1 END) as failures,
+ COUNT(*) as total
+ FROM assertion_results
+ WHERE date >= CURRENT_DATE - INTERVAL '30 days'
+ GROUP BY date
+),
+baseline AS (
+ SELECT
+ AVG(failures * 100.0 / total) as avg_failure_rate,
+ STDDEV(failures * 100.0 / total) as stddev_failure_rate
+ FROM daily_stats
+ WHERE date < CURRENT_DATE
+)
+SELECT
+ ds.date,
+ ds.failures * 100.0 / ds.total as current_failure_rate,
+ b.avg_failure_rate + (2 * b.stddev_failure_rate) as alert_threshold
+FROM daily_stats ds, baseline b
+WHERE ds.date = CURRENT_DATE
+ AND ds.failures * 100.0 / ds.total > b.avg_failure_rate + (2 * b.stddev_failure_rate);
+```
+
+### Exercise 5: Build Quality Reports
+
+Create automated reporting for stakeholders:
+
+#### Step 1: Daily Quality Summary
+
+**Automated Daily Report**:
+
+- Overall quality status
+- New issues discovered
+- Issues resolved
+- Quality trends
+- Upcoming maintenance impacts
+
+**Report Template**:
+
+```
+Daily Data Quality Report - [Date]
+
+OVERALL STATUS
+Quality Score: 94.2% (↑ 0.3% from yesterday)
+Critical Issues: 2 (down from 5)
+New Issues: 1
+Resolved Issues: 4
+
+DOMAIN BREAKDOWN
+Customer Data: 96.1% (Good)
+Financial Data: 89.2% (Warning - investigating payment delays)
+Product Data: 95.8% (Good)
+Marketing Data: 94.5% (Good)
+
+ATTENTION REQUIRED
+1. Payment processing latency (Financial) - ETA: 2PM
+2. Customer email validation (CRM) - In progress
+
+TRENDS
+- 7-day average: 93.8% (improving)
+- Month-to-date: 94.1% (on track for 95% goal)
+```
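+
+If you generate this summary automatically, the per-domain status labels can be derived directly from the scores. A small sketch with made-up numbers:
+
+```python
+def status_label(score, target=95.0):
+    """Label a domain score the way the report above does."""
+    return "Good" if score >= target else "Warning - needs attention"
+
+domains = {"Customer Data": 96.1, "Financial Data": 89.2, "Product Data": 95.8}
+for name, score in domains.items():
+    print(f"{name}: {score}% ({status_label(score)})")
+# Customer Data: 96.1% (Good)
+# Financial Data: 89.2% (Warning - needs attention)
+# Product Data: 95.8% (Good)
+```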
+
+#### Step 2: Executive Monthly Report
+
+**Strategic Quality Report**:
+
+- Quality ROI and business impact
+- Quality initiative progress
+- Resource allocation recommendations
+- Compliance status updates
+
+### Understanding Quality Metrics
+
+**Key Performance Indicators (KPIs)**:
+
+**Operational Metrics**:
+
+- **Assertion Pass Rate**: Percentage of quality checks passing
+- **Mean Time to Detection (MTTD)**: Speed of quality issue identification
+- **Mean Time to Resolution (MTTR)**: Speed of quality issue fixes
+- **Data Freshness**: Timeliness of data updates
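+
+MTTD and MTTR are plain averages over incident timestamps. A minimal sketch with made-up incidents:
+
+```python
+from datetime import datetime
+from statistics import mean
+
+# Illustrative incidents: when the issue occurred, was detected, and was resolved
+incidents = [
+    {"occurred": datetime(2024, 1, 3, 8, 0),  "detected": datetime(2024, 1, 3, 8, 20), "resolved": datetime(2024, 1, 3, 11, 0)},
+    {"occurred": datetime(2024, 1, 9, 14, 0), "detected": datetime(2024, 1, 9, 14, 6), "resolved": datetime(2024, 1, 9, 16, 30)},
+]
+
+mttd_minutes = mean((i["detected"] - i["occurred"]).total_seconds() / 60 for i in incidents)
+mttr_minutes = mean((i["resolved"] - i["detected"]).total_seconds() / 60 for i in incidents)
+print(f"MTTD: {mttd_minutes:.0f} min, MTTR: {mttr_minutes:.0f} min")  # MTTD: 13 min, MTTR: 152 min
+```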
+
+**Business Metrics**:
+
+- **Quality-Related Incidents**: Business disruptions due to data issues
+- **Stakeholder Satisfaction**: User confidence in data quality
+- **Compliance Score**: Adherence to regulatory requirements
+- **Quality ROI**: Business value of quality improvements
+
+### Advanced Monitoring Techniques
+
+#### 1. Machine Learning-Enhanced Monitoring
+
+Use ML to improve quality detection:
+
+- **Anomaly Detection**: Identify unusual data patterns
+- **Predictive Quality**: Forecast potential quality issues
+- **Root Cause Analysis**: Automatically identify issue sources
+- **Quality Recommendations**: Suggest improvement actions
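+
+Even before reaching for real ML, a simple trend extrapolation captures the "predictive quality" idea: fit a line to recent daily scores and estimate when they will cross your alert threshold. A self-contained sketch with illustrative numbers:
+
+```python
+def days_until_threshold(daily_scores, threshold=90.0):
+    """Fit a straight line to recent daily scores and estimate when it crosses the threshold."""
+    n = len(daily_scores)
+    xs = list(range(n))
+    x_mean = sum(xs) / n
+    y_mean = sum(daily_scores) / n
+    slope = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, daily_scores)) / sum(
+        (x - x_mean) ** 2 for x in xs
+    )
+    if slope >= 0:
+        return None  # flat or improving: no projected breach
+    return (daily_scores[-1] - threshold) / -slope
+
+# Scores slipping ~0.4 points/day: roughly 8-9 days until the 90% threshold
+print(days_until_threshold([95.0, 94.6, 94.1, 93.8, 93.4]))  # -> 8.5
+```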
+
+#### 2. Real-Time Quality Streaming
+
+Monitor quality in real-time data streams:
+
+```sql
+-- Real-time quality monitoring
+SELECT
+ window_start,
+ COUNT(*) as records_processed,
+ COUNT(CASE WHEN quality_check = 'PASS' THEN 1 END) as quality_records,
+ COUNT(CASE WHEN quality_check = 'FAIL' THEN 1 END) as quality_failures
+FROM streaming_quality_results
+WHERE window_start >= CURRENT_TIMESTAMP - INTERVAL '1 hour'
+GROUP BY window_start
+ORDER BY window_start DESC;
+```
+
+#### 3. Cross-System Quality Correlation
+
+Monitor quality across integrated systems:
+
+- Correlate quality issues with system performance
+- Identify upstream causes of quality problems
+- Track quality impact propagation
+- Coordinate quality improvements across teams
+
+### Quality Monitoring Best Practices
+
+#### 1. Design for Different Audiences
+
+- **Executives**: High-level trends and business impact
+- **Data Teams**: Detailed technical metrics and alerts
+- **Business Users**: Domain-specific quality insights
+- **Compliance**: Regulatory adherence tracking
+
+#### 2. Balance Detail and Usability
+
+- Start with key metrics, add detail as needed
+- Use visual indicators for quick status assessment
+- Provide drill-down capabilities for investigation
+- Include contextual information and recommendations
+
+#### 3. Ensure Actionability
+
+- Link quality metrics to specific improvement actions
+- Provide clear ownership and escalation paths
+- Include remediation guidance and documentation
+- Track improvement progress over time
+
+### Next Steps
+
+With comprehensive quality monitoring in place, you're ready to implement incident management processes that ensure rapid response to quality issues.
+
+
diff --git a/docs/learn-datahub/quickstart/discovery-basics.md b/docs/learn-datahub/quickstart/discovery-basics.md
new file mode 100644
index 00000000000000..b3f3049c04f0c7
--- /dev/null
+++ b/docs/learn-datahub/quickstart/discovery-basics.md
@@ -0,0 +1,426 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import NextStepButton from '@site/src/components/NextStepButton';
+import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard';
+import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode';
+import { SearchExercise } from '@site/src/components/TutorialExercise';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Step 3: Discovery Basics (10 minutes)
+
+
+
+**Discovery Implementation**: With enterprise metadata now available in DataHub, you need to demonstrate systematic data discovery capabilities. This step focuses on enabling analysts to efficiently locate and understand relevant datasets.
+
+**Business Requirement**: Locate user engagement metrics to support executive reporting and strategic decision-making. The data exists within the analytics infrastructure but requires systematic discovery.
+
+**Your Objective**: Implement and demonstrate DataHub's discovery features to enable self-service data access across the organization.
+
+## What You'll Master
+
+By the end of this step, you'll be able to:
+
+- **Find specific datasets** using strategic search techniques
+- **Navigate enterprise data architecture** across multiple platforms
+- **Understand data relationships** through schema exploration
+- **Identify relevant data** for business requirements
+
+## Enterprise Data Discovery Framework
+
+This tutorial demonstrates systematic data discovery techniques used in professional data environments. These methods apply to any enterprise data catalog and are essential for effective data analysis.
+
+### Professional Data Discovery Approach
+
+**Requirements Analysis** → **Strategic Search** → **Asset Evaluation** → **Context Gathering** → **Access Planning**
+
+| Step | Focus | Key Actions |
+| ------------------------- | ------------------ | ---------------------------------------------- |
+| **Requirements Analysis** | Define objectives | Understand business questions and data needs |
+| **Strategic Search** | Target discovery | Use business terms and domain knowledge |
+| **Asset Evaluation** | Quality assessment | Review schemas, documentation, and freshness |
+| **Context Gathering** | Understand usage | Check lineage, owners, and related assets |
+| **Access Planning** | Prepare for use | Verify permissions and connection requirements |
+
+## Method 1: Strategic Search - Finding User Metrics
+
+**Your First Lead**: The business requirement focuses on "user" metrics. Let's start there and see what data is available.
+
+### Strategic Search: User-Related Datasets
+
+1. **Open DataHub** at [http://localhost:9002](http://localhost:9002)
+
+2. **Search for user data**:
+
+ ```
+ Search: "user"
+ ```
+
+3. **Analyze your results** - you should discover these datasets:
+
+
+
+
+
+
+**Search Results Analysis**: This search successfully identified both datasets required for user metrics analysis, demonstrating the effectiveness of targeted search strategies in enterprise data discovery.
+
+:::tip Real-World Search Strategy
+Notice how searching for "user" found tables with "users" in the name? DataHub's search is smart - it finds variations and related terms automatically. This is exactly how you'd search in production systems.
+:::
+
+### Advanced Search Techniques
+
+
+
+
+
+
+**Why this works**: This enterprise follows standard naming conventions (`fct_` for fact tables, descriptive names for events).
+
+
+
+
+
+**Filter by enterprise platforms:**
+
+- **Hive**: Click to see only warehouse tables (`fct_users_created`, `fct_users_deleted`, `logging_events`)
+- **Kafka**: Real-time streaming data (`SampleKafkaDataset`)
+- **HDFS**: Data lake storage (`SampleHdfsDataset`)
+
+**Pro Tip**: For user analytics, focus on the **Hive** platform first for processed data!
+
+
+
+
+**Advanced search operators:**
+
+```
+# Find all fact tables
+name:fct_*
+
+# Find user-related data
+user OR users
+
+# Exclude test data
+user NOT test NOT sample
+```
+
+**Learn More**: Check out the [complete search documentation](../../how/search.md) for all available operators and techniques.
+
+
+
+
+## Method 2: Browse by Organization
+
+Sometimes browsing is more effective than searching, especially when exploring unfamiliar data.
+
+### Browse by Platform
+
+1. **Click "Browse" in the top navigation**
+
+2. **Explore by data platform:**
+
+ - **Demo Data**: Sample retail datasets
+ - **PostgreSQL**: Operational databases
+ - **Snowflake**: Data warehouse tables
+ - **dbt**: Transformed analytics models
+
+3. **Drill down into a platform:**
+ - Click on "Demo Data"
+ - You'll see all datasets from that platform
+ - Notice the hierarchical organization
+
+### Browse by Domain (if configured)
+
+If your organization uses domains:
+
+1. **Look for domain groupings** like:
+
+ - Marketing Analytics
+ - Customer Operations
+ - Financial Reporting
+ - Product Analytics
+
+2. **Each domain contains** related datasets regardless of platform
+
+## Method 3: Explore Dataset Details
+
+Let's dive deep into a specific dataset to understand what information DataHub provides.
+
+### Find the Customer Dataset
+
+1. **Search for "customer"** or browse to find a customer-related table
+
+2. **Click on a dataset** (e.g., "customers" or "user_profiles")
+
+3. **Explore the dataset page** - you'll see several tabs:
+
+### Schema Tab - Understanding Your Data
+
+The Schema tab shows the structure of your dataset:
+
+**Column Information:**
+
+- **Name**: The column identifier
+- **Type**: Data type (string, integer, timestamp, etc.)
+- **Description**: Business meaning (if available)
+- **Nullable**: Whether the field can be empty
+
+**Key things to look for:**
+
+- Primary keys (usually ID fields)
+- Foreign keys (relationships to other tables)
+- Date fields (for time-based analysis)
+- Categorical fields (for grouping/segmentation)
+- Numeric fields (for calculations/metrics)
+
+### Properties Tab - Metadata & Context
+
+**Dataset Properties:**
+
+- **Owner**: Who's responsible for this data
+- **Created**: When the dataset was first created
+- **Last Modified**: When data was last updated
+- **Tags**: Classification labels
+- **Custom Properties**: Business-specific metadata
+
+**Platform Details:**
+
+- **Database/Schema**: Where the data lives
+- **Table Type**: Table, view, or materialized view
+- **Row Count**: Approximate number of records
+
+### Documentation Tab - Business Context
+
+Look for:
+
+- **Dataset description**: What this data represents
+- **Column descriptions**: Business meaning of each field
+- **Usage notes**: How this data should be used
+- **Data quality notes**: Known issues or limitations
+
+## Understanding Data Relationships
+
+### Related Datasets
+
+At the bottom of any dataset page, look for:
+
+**"Frequently Co-occurring"**: Datasets often used together
+**"Similar Datasets"**: Tables with similar schemas
+**"Related by Lineage"**: Connected through data pipelines
+
+### Column-Level Relationships
+
+In the Schema tab:
+
+- **Foreign key indicators** show relationships to other tables
+- **Similar columns** across datasets are highlighted
+- **Column lineage** shows data transformation history
+
+## Practical Exercise: Customer Analysis Scenario
+
+Let's complete the original task - finding customer segmentation data:
+
+### Step 1: Search Strategy
+
+```
+1. Search for "customer segment"
+2. Filter results to "Datasets" only
+3. Look for tables with names like:
+ - customer_segments
+ - user_cohorts
+ - customer_analytics
+```
+
+### Step 2: Evaluate Options
+
+For each potential dataset, check:
+
+- **Schema**: Does it have the fields you need?
+- **Freshness**: Is the data recent enough?
+- **Owner**: Can you contact them with questions?
+- **Documentation**: Is the business logic clear?
+
+### Step 3: Understand the Data
+
+Click into the most promising dataset and review:
+
+- **Column definitions**: What does each field mean?
+- **Sample data**: What do actual values look like?
+- **Lineage**: Where does this data come from?
+
+## Discovery Best Practices
+
+### For Data Consumers
+
+1. **Start broad, then narrow**: Begin with keyword searches, then use filters
+2. **Check multiple sources**: The same business concept might exist in different systems
+3. **Read the documentation**: Don't assume column meanings from names alone
+4. **Contact owners**: When in doubt, reach out to dataset owners
+5. **Bookmark frequently used datasets**: Save time on repeat searches
+
+### For Data Producers
+
+1. **Add clear descriptions**: Help others understand your data
+2. **Tag appropriately**: Use consistent classification schemes
+3. **Document business logic**: Explain calculations and transformations
+4. **Keep metadata current**: Update descriptions when data changes
+
+## How the Key Datasets Connect
+
+Now that you've discovered the key datasets, let's see how they connect in the data pipeline:
+
+### User Metrics Data Pipeline
+
+
+
+**Data Flow Analysis**:
+
+- **Source**: `logging_events` captures real-time user interactions
+- **Processing**: `user_analytics` job transforms raw events into structured metrics
+- **Output**: `fct_users_created` and `fct_users_deleted` provide business-ready analytics
+
+This lineage view shows you the complete data journey - from raw user events through processing to the final analytics tables. Understanding these relationships is crucial for data quality and impact analysis.
+
+## Common Discovery Patterns
+
+
+
+
+**Scenario**: "I need to understand what customer data we have"
+
+**Approach**:
+
+1. Search broadly: "customer"
+2. Browse by platform to see all sources
+3. Compare schemas across datasets
+4. Identify the most comprehensive source
+
+
+
+
+**Scenario**: "I need customer email addresses for a campaign"
+
+**Approach**:
+
+1. Search specifically: "email"
+2. Filter to datasets only
+3. Check column details for email fields
+4. Verify data freshness and quality
+
+
+
+
+**Scenario**: "What would break if I change this table?"
+
+**Approach**:
+
+1. Navigate to the dataset
+2. Check the Lineage tab
+3. Identify downstream consumers
+4. Contact owners of dependent systems
+
+
+
+
+## Success Checkpoint
+
+**You've successfully completed Step 3 when you can:**
+
+- Find datasets using both search and browse methods
+- Understand what information is available in dataset pages
+- Read and interpret schema information
+- Identify dataset relationships and dependencies
+
+**What you've learned:**
+
+- Multiple ways to discover data in DataHub
+- How to evaluate datasets for your analysis needs
+- Where to find business context and documentation
+- How to understand data relationships
+
+
+Next: Explore Data Lineage
+
diff --git a/docs/learn-datahub/quickstart/first-ingestion.md b/docs/learn-datahub/quickstart/first-ingestion.md
new file mode 100644
index 00000000000000..f8c84f5fe72cd8
--- /dev/null
+++ b/docs/learn-datahub/quickstart/first-ingestion.md
@@ -0,0 +1,493 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import NextStepButton from '@site/src/components/NextStepButton';
+import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard';
+import OSDetectionTabs from '@site/src/components/OSDetectionTabs';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Step 2: First Data Ingestion (10 minutes)
+
+
+
+**The Implementation Challenge**: You have an empty DataHub instance that needs to be populated with enterprise metadata. Before analysts can discover and use data effectively, you must establish connections to the organization's data systems.
+
+**Your Objective**: Connect multiple data platforms to DataHub and ingest comprehensive metadata that enables self-service data discovery across the organization.
+
+## What You'll Accomplish
+
+By the end of this step, you'll have:
+
+- **Enterprise analytics data** from multiple systems ingested into DataHub
+- **Multi-platform connectivity** established (Kafka streams, Hive warehouse, HDFS storage)
+- **Comprehensive metadata** including schemas, lineage, and business context
+- **Self-service foundation** enabling analysts to discover and understand data independently
+
+## Understanding Data Ingestion
+
+DataHub ingestion connects to your data systems and extracts comprehensive metadata through a standardized process:
+
+### Metadata Ingestion Workflow
+
+**1. Connection** → **2. Discovery** → **3. Extraction** → **4. Transformation** → **5. Loading**
+
+| Phase | Description | What Happens |
+| ------------------ | -------------------- | --------------------------------------------------------------------- |
+| **Connection** | Secure system access | DataHub establishes authenticated connections to source systems |
+| **Discovery** | Schema scanning | Identifies databases, tables, views, and data structures |
+| **Extraction** | Metadata collection | Pulls schema definitions, statistics, and lineage information |
+| **Transformation** | Standardization | Converts metadata into DataHub's unified format |
+| **Loading** | Storage & indexing | Stores metadata in DataHub's knowledge graph for search and discovery |
+
+**What gets ingested:**
+
+- **Schema information**: Table and column definitions
+- **Data statistics**: Row counts, data types, sample values
+- **Lineage**: How data flows between systems
+- **Usage patterns**: Query history and access patterns (when available)
+
+## Connecting Enterprise Data Systems
+
+**The Situation**: This tutorial uses a representative enterprise data architecture with data scattered across multiple systems - just like most real companies. Let's get it all connected to DataHub.
+
+**What You're About to Ingest**: This enterprise data architecture includes:
+
+
+
+
+
+
+```cmd
+# Connect sample data ecosystem to DataHub
+datahub docker ingest-sample-data
+
+# If datahub command not found:
+python -m datahub docker ingest-sample-data
+```
+
+
+
+
+```bash
+# Connect sample data ecosystem to DataHub
+datahub docker ingest-sample-data
+
+# If datahub command not found:
+python3 -m datahub docker ingest-sample-data
+```
+
+
+
+
+```bash
+# Connect sample data ecosystem to DataHub
+datahub docker ingest-sample-data
+
+# If datahub command not found:
+python3 -m datahub docker ingest-sample-data
+
+# If permission issues:
+sudo datahub docker ingest-sample-data
+```
+
+
+
+
+**Enterprise Data Landscape:**
+
+
+
+| System | Platform | What's Inside | Business Purpose |
+| -------------------- | -------- | ------------------------------------------------- | ------------------------------------- |
+| **Real-time Events** | Kafka | `SampleKafkaDataset` - Live user activity streams | Track user behavior as it happens |
+| **Data Warehouse** | Hive | `fct_users_created`, `fct_users_deleted` | Monthly user metrics for analytics |
+| **Event Logs** | Hive | `logging_events` - Detailed activity logs | Source data for user analytics |
+| **Data Lake** | HDFS | `SampleHdfsDataset` - Raw data storage | Historical data backup and processing |
+
+
+
+
+
+**Your Mission**: This ingestion will give you access to the complete enterprise data ecosystem. Pay special attention to the `fct_users_created` and `fct_users_deleted` tables - these contain the user metrics data.
+
+:::tip Real-World Context
+This mirrors what you'd find at most tech companies: streaming data (Kafka), processed analytics (Hive), and data lake storage (HDFS). You're learning with realistic, production-like data architecture!
+:::
+
+**Watch the Magic Happen**: As the ingestion runs, you'll see DataHub discovering these key datasets:
+
+
+
+
+
+
+DataHub automatically extracts:
+
+- **Table schemas** with column definitions and data types
+- **Data lineage** showing how tables connect across platforms
+- **Ownership information** (John Doe owns most of this sample data)
+- **Documentation** and business context
+
+**What happens during ingestion:**
+
+```
+Starting ingestion...
+Extracting metadata from demo source...
+Found 12 datasets
+Found 156 columns
+Found 8 lineage relationships
+Found 3 dashboards
+Found 2 data pipelines
+Ingestion completed successfully!
+```
+
+## Option 2: Connect a Real Database (Advanced)
+
+If you want to connect your own database, here's how to create an ingestion recipe:
+
+
+
+
+Create a file called `postgres-recipe.yml`:
+
+```yaml
+source:
+ type: postgres
+ config:
+ host_port: localhost:5432
+ database: retail_db
+ username: postgres
+ password: password
+ # Optional: specific schemas to ingest
+ schema_pattern:
+ allow: ["public", "analytics"]
+
+sink:
+ type: datahub-rest
+ config:
+ server: http://localhost:8080
+```
+
+Run the ingestion:
+
+```bash
+datahub ingest -c postgres-recipe.yml
+```
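+
+Prefer to trigger this from Python (for example, from Airflow or CI)? The same recipe can be run through the ingestion framework's `Pipeline` API; this is a sketch and assumes a recent `acryl-datahub` release that exposes `Pipeline` as shown:
+
+```python
+from datahub.ingestion.run.pipeline import Pipeline
+
+# Same recipe as above, expressed as a Python dict
+pipeline = Pipeline.create(
+    {
+        "source": {
+            "type": "postgres",
+            "config": {
+                "host_port": "localhost:5432",
+                "database": "retail_db",
+                "username": "postgres",
+                "password": "password",
+            },
+        },
+        "sink": {
+            "type": "datahub-rest",
+            "config": {"server": "http://localhost:8080"},
+        },
+    }
+)
+pipeline.run()
+pipeline.raise_from_status()  # fail loudly if the run reported errors
+```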
+
+
+
+
+Create a file called `mysql-recipe.yml`:
+
+```yaml
+source:
+ type: mysql
+ config:
+ host_port: localhost:3306
+ database: retail_db
+ username: root
+ password: password
+
+sink:
+ type: datahub-rest
+ config:
+ server: http://localhost:8080
+```
+
+Run the ingestion:
+
+```bash
+datahub ingest -c mysql-recipe.yml
+```
+
+
+
+
+For CSV files in a directory:
+
+```yaml
+source:
+ type: csv-enricher
+ config:
+ # Path to your CSV files
+ filename: "/path/to/csv/files/*.csv"
+
+sink:
+ type: datahub-rest
+ config:
+ server: http://localhost:8080
+```
+
+Run the ingestion:
+
+```bash
+datahub ingest -c csv-recipe.yml
+```
+
+
+
+
+## Mission Status: Did We Connect Enterprise Data?
+
+**The Moment of Truth**: Let's see if you successfully connected the enterprise data systems to DataHub.
+
+### 1. Check Your Ingestion Results
+
+Look for these success indicators in your terminal:
+
+```
+Ingestion completed successfully
+Processed 5 datasets (SampleKafkaDataset, fct_users_created, fct_users_deleted, logging_events, SampleHdfsDataset)
+Processed 15+ columns across all tables
+Discovered lineage relationships between tables
+No errors encountered
+```
+
+**Success Indicator**: If you see "Ingestion completed successfully", you have successfully connected a multi-platform data architecture to DataHub.
+
+### 2. Explore Enterprise Data in DataHub
+
+1. **Refresh DataHub** at [http://localhost:9002](http://localhost:9002)
+
+2. **Check the home page transformation**:
+
+ - Dataset count jumped from 0 to 5+ datasets
+ - Recent activity shows "SampleKafkaDataset", "fct_users_created", etc.
+ - You can see the enterprise data platforms: Kafka, Hive, HDFS
+
+3. **Quick victory lap** - click "Browse" in the top navigation:
+ - **Hive platform**: You should see `fct_users_created` and `fct_users_deleted` (the user metrics datasets)
+ - **Kafka platform**: Real-time streaming data (`SampleKafkaDataset`)
+ - **HDFS platform**: Data lake storage (`SampleHdfsDataset`)
+
+**Pro Tip**: Notice how DataHub automatically organized everything by platform? This is how you'll navigate complex data ecosystems in real companies.
+
+**Your Ingested Enterprise Data Assets:**
+
+
+
+
+
+
+
+
+
+### 3. Your First Dataset Deep-Dive: Exploring User Metrics Data
+
+**Time to investigate!** Let's look at the user metrics data. Click on `fct_users_created` (you'll find it under the Hive platform).
+
+**What You'll Discover**:
+
+**Schema Tab** - The data structure:
+
+- `user_id`: The key field for tracking individual users
+- `created_date`: When each user was created (perfect for monthly analysis!)
+- You'll see this is a proper fact table with clean, analytics-ready data
+
+**Properties Tab** - Business context:
+
+- **Owner**: John Doe (jdoe@linkedin.com) - now you know who to contact with questions!
+- **Platform**: Hive (enterprise data warehouse)
+- **Custom Properties**: You might see metadata like `prop1: fakeprop` - this is where business teams add context
+
+**Lineage Tab** - The data story:
+
+- **Upstream**: This table is built from `logging_events` (the raw event data)
+- **Downstream**: You'll see connections to other analytics tables
+- This shows you the complete data pipeline from raw events to business metrics
+
+**Mission Progress**: You've just found the user metrics data! The `fct_users_created` table has user creation data with timestamps - perfect for monthly analysis.
+
+:::tip Real-World Learning
+This exploration pattern is exactly what you'd do at any company: find the table, understand its structure, identify the owner, and trace its lineage. You're learning production data analysis skills!
+:::
+
+**Want to Learn More?** Check out the [full dataset documentation](/docs/generated/metamodel/entities/dataset.md) to understand all the metadata DataHub captures.
+
+## Understanding the Ingestion Process
+
+Let's break down what just happened:
+
+### 1. Connection & Discovery
+
+```
+DataHub Connector → Data Source
+├── Authenticates using provided credentials
+├── Discovers available schemas/databases
+└── Lists all tables and views
+```
+
+### 2. Metadata Extraction
+
+```
+For each table/view:
+├── Extract schema (columns, types, constraints)
+├── Collect statistics (row counts, data distribution)
+├── Identify relationships (foreign keys, joins)
+└── Gather usage information (if available)
+```
+
+### 3. Lineage Detection
+
+```
+DataHub analyzes:
+├── SQL queries in views and stored procedures
+├── ETL pipeline definitions
+├── Data transformation logic
+└── Cross-system data flows
+```
+
+### 4. Storage & Indexing
+
+```
+Metadata is stored in:
+├── MySQL (primary metadata storage)
+├── OpenSearch (search index)
+└── Kafka (real-time event stream)
+```
+
+## Ingestion Best Practices
+
+**For production environments:**
+
+1. **Start small**: Begin with a few important datasets
+2. **Use scheduling**: Set up regular ingestion to keep metadata fresh
+3. **Monitor performance**: Large databases may need configuration tuning
+4. **Secure credentials**: Use environment variables or secret management
+5. **Test first**: Always test ingestion recipes in development
+
+## Troubleshooting Common Issues
+
+
+
+
+**Error:** `Failed to connect to database`
+
+**Common causes:**
+
+- Incorrect host/port
+- Wrong credentials
+- Database not accessible from Docker container
+- Firewall blocking connection
+
+**Solutions:**
+
+```bash
+# Test connection manually
+telnet your-db-host 5432
+```
+
+For local databases, use `host.docker.internal` instead of `localhost` in your recipe:
+
+```yaml
+host_port: host.docker.internal:5432
+```
+
+
+
+
+**Error:** `Ingestion completed but no datasets found`
+
+**Common causes:**
+
+- Schema/database doesn't exist
+- User lacks permissions
+- Pattern filters too restrictive
+
+**Solutions:**
+
+Check that the ingestion user has read permissions on the source database:
+
+```sql
+GRANT SELECT ON ALL TABLES IN SCHEMA public TO datahub_user;
+```
+
+Broaden the patterns in your recipe:
+
+```yaml
+schema_pattern:
+  allow: [".*"] # Allow all schemas
+```
+
+
+
+
+**Issue:** Ingestion taking very long
+
+**Solutions:**
+
+```yaml
+# Disable profiling for large tables
+profiling:
+ enabled: false
+
+# Limit table discovery
+table_pattern:
+ allow: ["important_table_.*"]
+```
+
+
+
+
+## Implementation Checkpoint: Verify Success
+
+**You've successfully completed the metadata ingestion when:**
+
+- **Enterprise data is live**: 5+ datasets visible in DataHub (Kafka, Hive, HDFS platforms)
+- **Analytics tables discovered**: You can see `fct_users_created` and `fct_users_deleted` in the Hive platform
+- **Data exploration complete**: You've clicked into a dataset and seen schema, properties, and lineage
+- **Owner identified**: You know John Doe owns the user analytics data
+
+**Implementation Success**: You've successfully connected a multi-platform data architecture to DataHub, establishing comprehensive metadata visibility across the organization's data ecosystem.
+
+**What you've accomplished:**
+
+- **Enterprise integration**: Connected Kafka streams, Hive warehouse, and HDFS storage systems
+- **Automated metadata discovery**: Extracted schemas, lineage, and ownership information
+- **Business enablement**: Created the foundation for self-service data discovery
+- **Production-ready skills**: Implemented the same processes used in enterprise environments
+
+**Next Phase**: With metadata ingestion complete, you can now enable systematic data discovery and analysis across the organization.
+
+
+Next: Discover and Explore Your Data
+
diff --git a/docs/learn-datahub/quickstart/first-lineage.md b/docs/learn-datahub/quickstart/first-lineage.md
new file mode 100644
index 00000000000000..36ef1f1d794662
--- /dev/null
+++ b/docs/learn-datahub/quickstart/first-lineage.md
@@ -0,0 +1,605 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Step 4: Your First Lineage (5 minutes)
+
+
+
+**The Final Piece**: You've located the user metrics data (`fct_users_created` and `fct_users_deleted`), but before delivering the analysis, you need to understand something crucial: _Where does this data come from?_ Is it reliable? What happens if something breaks upstream?
+
+**Your Objective**: Use DataHub's lineage features to trace the data pipeline and understand how the organization's user metrics are created. This knowledge will make you confident in your analysis and help you spot potential issues.
+
+## What You'll Accomplish
+
+By the end of this step, you'll be able to:
+
+- Navigate lineage graphs to understand data flow
+- Distinguish between upstream and downstream dependencies
+- Use lineage for impact analysis and troubleshooting
+- Understand column-level lineage relationships
+
+## Understanding Data Lineage
+
+Data lineage provides a comprehensive view of data flow throughout your organization, tracking data from its origin through all transformations to final consumption points.
+
+**Enterprise Lineage Components:**
+
+- **Source Systems**: Original data repositories and databases
+- **Transformation Layers**: ETL processes, data pipelines, and business logic
+- **Intermediate Storage**: Staging areas, data warehouses, and data lakes
+- **Consumption Points**: Reports, dashboards, and analytical applications
+- **Data Dependencies**: Relationships between datasets and processes
+
+**Why lineage matters:**
+
+- **Impact Analysis**: "What breaks if I change this table?"
+- **Root Cause Analysis**: "Why is this dashboard showing wrong numbers?"
+- **Data Governance**: "Where does this sensitive data flow?"
+- **Compliance**: "Can we trace this data back to its source?"
+
+## Tracing Enterprise Data Pipelines
+
+### Method 1: Following User Metrics Data Trail
+
+Let's trace the lineage of user analytics data:
+
+1. **Navigate to `fct_users_created`** (the table you found in discovery)
+
+2. **Click the "Lineage" tab** to see the data story
+
+3. **Analyze the enterprise data flow:**
+ - **Upstream (left)**: `logging_events` - This is where user creation events are captured
+ - **Current dataset (center)**: `fct_users_created` - The processed analytics table for user metrics
+ - **Downstream (right)**: Any dashboards or reports that use this data
+
+**What This Tells You**: The user creation data flows from raw events (`logging_events`) through processing into the analytics table (`fct_users_created`). This is a clean, reliable pipeline!
+
+### Method 2: Global Lineage View
+
+1. **From any dataset page**, click the **"View Lineage"** button
+
+2. **This opens the full lineage explorer** with:
+ - Interactive graph visualization
+ - Zoom and pan controls
+ - Filter options
+ - Multi-hop lineage traversal
+
+## Reading Lineage Graphs
+
+Let's understand the visual elements:
+
+### Node Types
+
+
+
+
+**Tables/Views** (rectangular nodes):
+
+- **Database tables**: Raw operational data
+- **Analytics views**: Transformed/aggregated data
+- **Materialized views**: Pre-computed results
+- **Files**: CSV, Parquet, JSON data files
+
+
+
+
+**Data Jobs** (circular nodes):
+
+- **ETL jobs**: Extract, Transform, Load processes
+- **dbt models**: Data transformation logic
+- **Python scripts**: Custom data processing
+- **Airflow DAGs**: Workflow orchestration
+
+
+
+
+**Consuming Applications** (diamond nodes):
+
+- **BI Dashboards**: Looker, Tableau, PowerBI
+- **ML Models**: Training and inference pipelines
+- **Applications**: Customer-facing features
+- **Reports**: Automated business reports
+
+
+
+
+### Connection Types
+
+- **Solid lines**: Direct data dependencies
+- **Dashed lines**: Indirect or inferred relationships
+- **Colored lines**: Different types of transformations
+
+## Practical Lineage Scenarios
+
+### Scenario 1: Impact Analysis
+
+**Question**: "I need to update the customer table schema. What will be affected?"
+
+
+
+**Steps to Analyze Impact**:
+
+1. **Navigate to the `customers` table** in DataHub
+2. **Click the Lineage tab** to see the full dependency graph
+3. **Look at downstream dependencies** (right side of the lineage view)
+4. **Identify all affected systems**:
+ - Analytics tables that read from customers
+ - Dashboards that display customer data
+ - ML models that use customer features
+ - Reports that include customer metrics
+
+**Impact Assessment**: Any schema change to the `customers` table will potentially affect 8 downstream systems, requiring coordinated updates and testing.
+
+### Scenario 2: Root Cause Analysis
+
+**Question**: "The customer dashboard shows wrong numbers. Where's the problem?"
+
+
+
+**Debugging Steps**:
+
+1. **Start at the `customer_dashboard`** (the problem location)
+2. **Trace upstream dependencies** (left side of lineage view)
+3. **Check each step systematically**:
+ - **ETL Job**: Did it run successfully? Check logs for failures
+ - **Customer Metrics**: Is the data fresh? Look at last update timestamp
+ - **Raw Customers**: Is source data being updated correctly?
+
+**Root Cause Investigation Priority**:
+
+1. **Check ETL Job first** - Most common failure point
+2. **Verify data freshness** - Look for stale or missing data
+3. **Validate transformations** - Ensure business logic is correct
+4. **Confirm source data quality** - Check for upstream issues
+
+**Common Issues Found**:
+
+- ETL job failed silently due to schema changes
+- Data pipeline running but processing stale data
+- Transformation logic changed without proper testing
+- Source system connectivity problems
+
+### Scenario 3: Data Governance
+
+**Question**: "This table contains PII. Where does this sensitive data flow?"
+
+
+
+**Governance Investigation Steps**:
+
+1. **Find the PII source** (e.g., `customer_profiles` table)
+2. **Examine all downstream paths** using DataHub lineage
+3. **Identify systems receiving sensitive data**:
+
+ - CRM systems (legitimate business use)
+ - Marketing platforms (verify consent)
+ - Analytics systems (should be anonymized)
+ - Third-party integrations (compliance risk)
+
+4. **Verify proper controls**:
+ - Access permissions and role-based security
+ - Data anonymization where required
+ - Consent management for marketing use
+ - Audit trails for compliance reporting
+
+**Compliance Checklist**:
+
+- ✅ **Anonymized Analytics**: PII removed, GDPR compliant
+- ✅ **CRM System**: Legitimate business purpose, access controlled
+- ⚠️ **Marketing Campaigns**: Verify consent and opt-in status
+- ✅ **Compliance Audit**: Full access tracking enabled
+
+**Action Items**: Review marketing system access to ensure proper consent management and consider additional anonymization.
+
+## Column-Level Lineage
+
+For detailed analysis, DataHub can show how individual columns flow through transformations:
+
+### Viewing Column Lineage
+
+1. **In the Schema tab** of any dataset
+2. **Click on a specific column**
+3. **Select "View Column Lineage"**
+
+This shows:
+
+- **Source columns**: Which upstream columns contribute to this field
+- **Transformation logic**: How the column is calculated or derived
+- **Downstream usage**: Where this column is used in other systems
+
+### Example: Customer Segment Column
+
+```sql
+-- Source: customers.customer_type + orders.total_spent
+-- Transformation:
+CASE
+ WHEN total_spent > 1000 THEN 'Premium'
+ WHEN total_spent > 500 THEN 'Standard'
+ ELSE 'Basic'
+END as customer_segment
+
+-- Used in: marketing_campaigns, customer_dashboard, ml_features
+```
+
+## Lineage Best Practices
+
+### For Data Consumers
+
+1. **Always check lineage** before using unfamiliar data
+2. **Trace to the source** to understand data freshness and quality
+3. **Identify alternatives** by looking at similar downstream datasets
+4. **Contact upstream owners** when you need data changes
+
+### For Data Producers
+
+1. **Document transformations** so lineage is meaningful
+2. **Use consistent naming** to make lineage easier to follow
+3. **Tag critical paths** to highlight important data flows
+4. **Monitor downstream usage** to understand impact of changes
+
+## Advanced Lineage Features
+
+### Multi-Hop Lineage
+
+**View end-to-end data journeys:**
+
+- Set lineage depth to 3+ hops
+- Trace from raw source to final application
+- Understand complete data supply chains
+
+### Lineage Filtering
+
+**Focus on specific aspects:**
+
+- Filter by entity type (datasets only, pipelines only)
+- Filter by platform (show only Snowflake → dbt flow)
+- Filter by time (show recent lineage changes)
+
+### Lineage Search
+
+**Find specific relationships:**
+
+- "Show me all paths from customers to dashboards"
+- "Find datasets that depend on this API"
+- "Trace this column through all transformations"
+
+## Troubleshooting Lineage Issues
+
+
+
+
+**Issue**: Expected lineage connections don't appear
+
+**Common causes**:
+
+- Ingestion didn't capture SQL parsing
+- Complex transformations not detected
+- Cross-platform connections not configured
+
+**Solutions**:
+
+- Enable SQL parsing in ingestion config
+- Add manual lineage for complex cases
+- Check cross-platform lineage settings
+
+
+
+
+**Issue**: Lineage shows wrong relationships
+
+**Common causes**:
+
+- Temporary tables confusing lineage detection
+- Dynamic SQL not parsed correctly
+- Naming conflicts between systems
+
+**Solutions**:
+
+- Review and correct automatic lineage
+- Add manual lineage overrides
+- Use more specific naming conventions
+
+
+
+
+**Issue**: Lineage graphs load slowly
+
+**Common causes**:
+
+- Very deep lineage (many hops)
+- Large number of connected entities
+- Complex transformation logic
+
+**Solutions**:
+
+- Limit lineage depth
+- Use filters to focus on relevant paths
+- Break down complex transformations
+
+
+
+
+## Tutorial Objectives Achieved
+
+**You've successfully completed your DataHub journey when you can:**
+
+- **Navigate lineage confidently**: Trace enterprise data from `logging_events` to `fct_users_created`
+- **Understand data reliability**: Know that user metrics come from a clean, traceable pipeline
+- **Identify data owners**: You know John Doe owns the user analytics pipeline
+- **Assess data quality**: The lineage shows a proper fact table structure
+
+**Your Achievement**: In 30 minutes, you've mastered essential DataHub skills! You can now:
+
+- **Deploy DataHub** and connect multi-platform data architectures
+- **Find specific datasets** using strategic search techniques
+- **Understand data pipelines** through lineage analysis
+- **Deliver confident analysis** backed by metadata insights
+
+**Analysis Ready**: You now have everything needed to answer business questions about user creation vs. deletion metrics, plus the confidence that comes from understanding the complete data pipeline.
+
+:::tip Mark Your Progress
+Check off "Your First Lineage" in the progress tracker above! You've completed the entire DataHub Quickstart.
+:::
+
+## Tutorial Complete
+
+You've completed the **DataHub in 30 Minutes** tutorial! You now have hands-on experience with DataHub's core capabilities:
+
+- **Deployed DataHub** locally and learned its core architecture
+- **Ingested metadata** from data sources
+- **Discovered datasets** using search and browse features
+- **Traced data lineage** to understand dependencies
+
+## What's Next?
+
+Now that you understand DataHub fundamentals, explore these advanced topics:
+
+### Immediate Next Steps
+
+- **[Data Discovery & Search](../discovery/overview.md)** - Master advanced search techniques and filters
+- **[Data Lineage & Impact Analysis](../lineage/overview.md)** - Deep dive into lineage analysis and troubleshooting
+- **Data Governance Fundamentals** - Learn about ownership, classification, and business glossaries
+
+### For Your Organization
+
+- **Plan your DataHub deployment** for production use
+- **Identify key data sources** to ingest first
+- **Establish governance processes** for metadata management
+- **Train your team** on DataHub best practices
+
+### Get Help & Stay Connected
+
+- **[Join DataHub Slack](https://datahub.com/slack)** - Connect with the community
+- **[Read the full documentation](../../)** - Comprehensive guides and references
+- **[Watch DataHub tutorials](https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w)** - Video walkthroughs
+- **[Report issues](https://github.com/datahub-project/datahub/issues)** - Help improve DataHub
+
+**Happy data discovering!**
diff --git a/docs/learn-datahub/quickstart/overview.md b/docs/learn-datahub/quickstart/overview.md
new file mode 100644
index 00000000000000..110df154862e38
--- /dev/null
+++ b/docs/learn-datahub/quickstart/overview.md
@@ -0,0 +1,155 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+
+# Chapter 1: DataHub Foundation (30 minutes)
+
+
+
+:::tip Professional Development Journey
+This tutorial follows realistic challenges that data professionals face when implementing metadata management in production environments.
+:::
+
+## The Business Challenge
+
+**Your Role**: You're a data professional tasked with implementing metadata management for a growing technology organization. Data is distributed across multiple systems without centralized discovery or governance.
+
+**The Business Challenge**: Executive leadership requires user engagement metrics for strategic decision-making. The data exists across various systems, but there's no efficient way to locate, validate, and understand data relationships.
+
+**What You'll Accomplish**:
+
+- **Deploy DataHub** as the central metadata management platform
+- **Connect enterprise data systems** across streaming, analytics, and storage platforms
+- **Implement systematic data discovery** to reduce time-to-insight
+- **Establish data lineage tracking** for quality assurance and impact analysis
+
+**Business Outcome**: Enable self-service data discovery while establishing enterprise-grade metadata governance.
+
+## Tutorial Structure
+
+This tutorial is designed to be completed in sequence.
+
+**Total Time: 30 minutes**
+
+## Prerequisites
+
+Before starting, ensure you have:
+
+- **Docker Desktop** installed and running
+- **Python 3.9+** installed
+- **Basic familiarity** with databases and data concepts
+- **30 minutes** of uninterrupted time
+
+## The Business Scenario
+
+**Organizational Context**: You're implementing DataHub for a technology company experiencing rapid growth. Data teams are struggling with:
+
+- **Discovery bottlenecks**: Analysts spend 60% of their time finding relevant data
+- **Quality uncertainty**: No systematic way to validate data reliability
+- **Compliance gaps**: Difficulty tracking data lineage for regulatory requirements
+- **Knowledge silos**: Critical data knowledge trapped with individual team members
+
+**Your Implementation Goal**: Establish DataHub as the central metadata platform to solve these enterprise challenges.
+
+**Enterprise Data Architecture**: You'll work with a realistic multi-platform data ecosystem:
+
+- **Analytics Layer**: User behavior metrics and business KPIs
+- **Streaming Platform**: Real-time event data from Kafka
+- **Data Warehouse**: Processed analytical datasets in Hive
+- **Data Lake**: Raw data storage in HDFS
+
+**Why This Matters**: This architecture represents common enterprise patterns where data teams need centralized metadata management to maintain productivity and compliance.
+
+### DataHub Integration Architecture
+
+DataHub acts as the central metadata hub connecting your entire data ecosystem:
+
+
+
+**Key Integration Points**:
+
+- **Automated Discovery**: DataHub connectors extract metadata from your existing systems
+- **Unified View**: All metadata is standardized and searchable through a single interface
+- **Real-time Updates**: Changes in source systems are reflected immediately in DataHub
+- **API Access**: Programmatic access enables integration with your existing workflows
+
+## Learning Outcomes
+
+After completing this tutorial, you'll be able to:
+
+- **Deploy DataHub** in a local development environment
+- **Connect data sources** and understand ingestion concepts
+- **Find datasets** using DataHub's search and discovery features
+- **Read data lineage** to understand data dependencies
+- **Navigate the DataHub UI** confidently for daily data work
+
+## What's Next?
+
+This tutorial provides the foundation for more advanced DataHub concepts. After completion, consider exploring:
+
+- **[Data Discovery & Search](../discovery/overview.md)** - Master advanced search techniques
+- **[Data Lineage & Impact Analysis](../lineage/overview.md)** - Deep dive into lineage and impact analysis
+- **Data Governance Fundamentals** - Learn ownership, classification, and glossaries
+
+## Need Help?
+
+If you encounter issues during this tutorial:
+
+- Check the [Troubleshooting Guide](../../troubleshooting/quickstart.md)
+- Visit the [DataHub Slack Community](https://datahub.com/slack)
+- Review the [Full Quickstart Documentation](../../quickstart.md)
+
+---
+
+**Ready to get started?** Let's begin with [Setting up DataHub](setup.md) →
diff --git a/docs/learn-datahub/quickstart/setup.md b/docs/learn-datahub/quickstart/setup.md
new file mode 100644
index 00000000000000..b59f287ee94697
--- /dev/null
+++ b/docs/learn-datahub/quickstart/setup.md
@@ -0,0 +1,459 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import NextStepButton from '@site/src/components/NextStepButton';
+import TutorialProgress from '@site/src/components/TutorialProgress';
+import OSDetectionTabs from '@site/src/components/OSDetectionTabs';
+
+# Step 1: Setup DataHub (5 minutes)
+
+
+
+In this step, you'll deploy DataHub locally using Docker. This gives you a complete DataHub environment running on your machine for learning and experimentation.
+
+## What You'll Accomplish
+
+By the end of this step, you'll have:
+
+- DataHub running locally at `http://localhost:9002`
+- Understanding of DataHub's core components
+- Access to the DataHub web interface
+
+## Prerequisites Check
+
+Before we begin, verify you have the required software:
+
+
+
+
+**Run these commands to verify your setup:**
+
+
+
+```bash
+# Check Docker
+docker --version
+docker compose version   # or: docker-compose --version (older standalone installs)
+
+# Check Python
+python3 --version
+
+# Check Docker is running
+docker ps
+```
+
+**Expected output:**
+
+- Docker version 20.10+
+- Docker Compose version 2.0+
+- Python 3.9+
+- `docker ps` should run without errors
+
+
+
+:::tip Success Indicator
+If all commands run without errors, you're ready to proceed!
+:::
+
+
+
+
+If you're missing any prerequisites, follow the OS-specific instructions below:
+
+
+
+
+**Install Docker Desktop:**
+
+1. Download [Docker Desktop for Windows](https://www.docker.com/products/docker-desktop/)
+2. Run the installer and follow the setup wizard
+3. Restart your computer when prompted
+4. Launch Docker Desktop from the Start menu
+
+**Install Python 3.9+:**
+
+1. Download from [python.org](https://www.python.org/downloads/windows/)
+2. **Important**: Check "Add Python to PATH" during installation
+3. Verify installation: Open Command Prompt and run `python --version`
+
+**System Requirements:**
+
+- Windows 10 64-bit: Pro, Enterprise, or Education (Build 16299 or later)
+- WSL 2 feature enabled (Docker Desktop will help set this up)
+- 2 CPUs minimum, 8GB RAM minimum, 12GB free disk space
+
+
+
+
+**Install Docker Desktop:**
+
+1. Download [Docker Desktop for Mac](https://www.docker.com/products/docker-desktop/)
+2. Drag Docker.app to your Applications folder
+3. Launch Docker Desktop from Applications
+4. Follow the setup assistant
+
+**Install Python 3.9+:**
+
+```bash
+# Using Homebrew (recommended)
+brew install python@3.9
+
+# Or download from python.org
+# Visit: https://www.python.org/downloads/macos/
+```
+
+**System Requirements:**
+
+- macOS 10.15 or newer
+- Apple chip (M1/M2) or Intel chip
+- 2 CPUs minimum, 8GB RAM minimum, 12GB free disk space
+
+
+
+
+**Install Docker:**
+
+```bash
+# Ubuntu/Debian (package names vary by release; Docker's official
+# apt repository is an alternative if a package is unavailable)
+sudo apt-get update
+sudo apt-get install docker.io docker-compose-plugin
+sudo systemctl start docker
+sudo systemctl enable docker
+
+# CentOS/RHEL/Fedora
+sudo yum install docker docker-compose
+sudo systemctl start docker
+sudo systemctl enable docker
+
+# Add your user to docker group (logout/login required)
+sudo usermod -aG docker $USER
+```
+
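+A quick smoke test after installing (optional; requires the docker group change or `sudo` to have taken effect):
+
+```bash
+# Pulls and runs a tiny test image to confirm the Docker daemon is reachable
+docker run hello-world
+```
+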
+**Install Python 3.9+:**
+
+```bash
+# Ubuntu/Debian
+sudo apt-get install python3 python3-pip
+
+# CentOS/RHEL/Fedora
+sudo yum install python3 python3-pip
+
+# Verify installation
+python3 --version
+```
+
+**System Requirements:**
+
+- 64-bit Linux distribution
+- Kernel version 3.10 or higher
+- 2 CPUs minimum, 8GB RAM minimum, 12GB free disk space
+
+
+
+
+**Common Resource Requirements:**
+
+- 2 CPUs minimum
+- 8GB RAM minimum (see the quick check below)
+- 12GB free disk space
+
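+One quick way to sanity-check the memory Docker can actually use (a small sketch; `docker info` reports the total in bytes):
+
+```bash
+# Roughly 8 GB is ~8589934592 bytes; a much smaller number means Docker
+# needs more memory allocated (Docker Desktop: Settings > Resources).
+docker info --format '{{.MemTotal}}'
+```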
+
+
+
+## Install DataHub CLI
+
+The DataHub CLI is your primary tool for managing DataHub deployments and ingestion.
+
+
+
+
+```cmd
+# Install the DataHub CLI (Command Prompt or PowerShell)
+python -m pip install --upgrade pip wheel setuptools
+python -m pip install --upgrade acryl-datahub
+
+# Verify installation
+datahub version
+```
+
+**Troubleshooting:**
+
+- If `python` command not found, try `py` instead
+- If `datahub` command not found, use `python -m datahub version`
+- Ensure Python was added to PATH during installation
+
+
+
+
+```bash
+# Install the DataHub CLI
+python3 -m pip install --upgrade pip wheel setuptools
+python3 -m pip install --upgrade acryl-datahub
+
+# Verify installation
+datahub version
+```
+
+**Troubleshooting:**
+
+- If `datahub` command not found, use `python3 -m datahub version`
+- On M1/M2 Macs, you might need to install Rosetta 2 for some dependencies
+
+
+
+
+```bash
+# Install the DataHub CLI
+python3 -m pip install --upgrade pip wheel setuptools
+python3 -m pip install --upgrade acryl-datahub
+
+# Verify installation
+datahub version
+
+# Alternative: Install with user flag if permission issues
+python3 -m pip install --user --upgrade acryl-datahub
+```
+
+**Troubleshooting:**
+
+- If `datahub` command not found, use `python3 -m datahub version`
+- Add `~/.local/bin` to PATH if using `--user` flag
+- Use `sudo` only if installing system-wide (not recommended)
+
+
+
+
+**Expected output:**
+
+```
+DataHub CLI version: 0.13.x
+Python version: 3.x.x
+```
+
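+If you prefer to keep the CLI out of your system Python, a virtual environment works as well (optional; shown for macOS/Linux):
+
+```bash
+# Create and activate an isolated environment, then install the CLI into it
+python3 -m venv datahub-env
+source datahub-env/bin/activate
+pip install --upgrade pip acryl-datahub
+datahub version
+```
+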
+## Deploy DataHub
+
+Now let's start DataHub using the quickstart deployment:
+
+
+
+
+```cmd
+# Deploy DataHub locally (Command Prompt)
+datahub docker quickstart
+
+# If datahub command not found, use:
+python -m datahub docker quickstart
+```
+
+**Windows-specific notes:**
+
+- Ensure Docker Desktop is running before executing
+- The process may take longer on Windows due to WSL 2 overhead
+- If you encounter permission issues, run Command Prompt as Administrator
+
+
+
+
+```bash
+# Deploy DataHub locally
+datahub docker quickstart
+
+# If datahub command not found, use:
+python3 -m datahub docker quickstart
+```
+
+**macOS-specific notes:**
+
+- Ensure Docker Desktop is running and has sufficient resources allocated
+- On M1/M2 Macs, some images may need to be built for ARM architecture
+- Grant Docker Desktop access to your file system when prompted
+
+
+
+
+```bash
+# Deploy DataHub locally
+datahub docker quickstart
+
+# If datahub command not found, use:
+python3 -m datahub docker quickstart
+
+# If permission issues with Docker:
+sudo datahub docker quickstart
+```
+
+**Linux-specific notes:**
+
+- Ensure Docker service is running: `sudo systemctl status docker`
+- If using sudo, DataHub files will be owned by root
+- Consider adding your user to the docker group to avoid sudo
+
+
+
+
+This command will:
+
+1. **Download** the DataHub Docker Compose configuration
+2. **Pull** all required Docker images (this may take a few minutes)
+3. **Start** all DataHub services
+
+**What's happening behind the scenes:**
+
+### DataHub Deployment Process
+
+The `datahub docker quickstart` command orchestrates a complete DataHub deployment:
+
+**Phase 1: Environment Preparation**
+
+- Validates Docker installation and system requirements
+- Checks available ports (9002 for frontend, 8080 for backend)
+- Prepares configuration files and networking
+
+**Phase 2: Infrastructure Setup**
+
+- Downloads the latest docker-compose configuration
+- Pulls required Docker images:
+ - `acryldata/datahub-gms` (Backend services)
+ - `acryldata/datahub-frontend-react` (Web interface)
+ - `mysql:8` (Metadata storage)
+ - `opensearchproject/opensearch` (Search index)
+ - `confluentinc/cp-kafka` (Message queue)
+
+**Phase 3: Service Orchestration**
+
+- Starts core infrastructure (MySQL, OpenSearch, Kafka)
+- Initializes DataHub backend services (GMS)
+- Launches the web frontend
+- Configures DataHub Actions for automation
+
+**Expected Timeline**: Initial deployment takes 3-5 minutes depending on your internet connection and system performance.
+
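+While you wait, you can confirm the images listed above have been pulled (a small sketch; image names and tags may differ slightly between DataHub versions):
+
+```bash
+# List the locally available images used by the quickstart
+docker images | grep -E 'datahub|opensearch|kafka|mysql'
+```
+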
+## Verify Deployment
+
+When deployment completes successfully, you should see:
+
+```
+DataHub is now running
+Ingest some demo data using `datahub docker ingest-sample-data`,
+or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend.
+```
+
+**Let's verify everything is working:**
+
+1. **Check running containers:**
+
+ ```bash
+ docker ps
+ ```
+
+ You should see 6-8 containers running with names like:
+
+ - `datahub-frontend-quickstart-1`
+ - `datahub-datahub-gms-quickstart-1`
+ - `datahub-mysql-1`
+ - `datahub-opensearch-1`
+
+2. **Access the DataHub UI:**
+
+ - Open your browser to [http://localhost:9002](http://localhost:9002)
+ - You should see the DataHub login page
+
+3. **Sign in with default credentials:**
+ ```
+ Username: datahub
+ Password: datahub
+ ```
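+
+As an optional extra check, recent CLI versions also include a built-in health check for the quickstart deployment (treat the exact output as version-dependent):
+
+```bash
+# Ask the DataHub CLI to verify the quickstart containers are up and healthy
+datahub docker check
+```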
+
+## Understanding DataHub Architecture
+
+Now that DataHub is running, let's understand what you've deployed:
+
+| Component | Purpose | Port |
+| -------------------- | --------------------------------- | ---- |
+| **DataHub Frontend** | Web UI for users | 9002 |
+| **DataHub GMS** | Metadata API and business logic | 8080 |
+| **MySQL** | Stores metadata and configuration | 3306 |
+| **OpenSearch** | Powers search and discovery | 9200 |
+| **Kafka** | Handles real-time metadata events | 9092 |
+| **DataHub Actions** | Automation and workflows | - |
+
+**Data Flow:**
+
+1. **Metadata ingestion** → GMS API → MySQL (storage) + OpenSearch (search) (see the quick look below)
+2. **User searches** → Frontend → GMS → OpenSearch → Results
+3. **Real-time updates** → Kafka → Actions → UI notifications
+
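+To see a piece of this flow concretely, you can peek at the search indexes the ingestion path writes to. This is a sketch that assumes OpenSearch is exposed on port 9200 as in the table above and that no credentials are required (the quickstart default); index names vary by version:
+
+```bash
+# List the indices DataHub has created in OpenSearch
+curl -s 'http://localhost:9200/_cat/indices?v'
+```
+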
+## Troubleshooting
+
+**Common issues and solutions:**
+
+
+
+
+**Error:** `Port already in use`
+
+**Solution:**
+
+```bash
+# Check what's using the port
+lsof -i :9002
+
+# Stop conflicting services or use different ports
+datahub docker quickstart --port 9003
+```
+
+
+
+
+**Error:** `Container fails to start` or `Out of memory`
+
+**Solution:**
+
+1. Increase Docker Desktop memory to 8GB+
+2. Close other applications
+3. Restart Docker Desktop
+
+
+
+
+**Issue:** Services taking a long time to start
+
+**This is normal for first-time setup:**
+
+- Image downloads: 5-10 minutes
+- Service initialization: 2-3 minutes
+- Total first-time setup: 10-15 minutes
+
+
+
+
+## Success Checkpoint
+
+**You've successfully completed Step 1 when:**
+
+- DataHub UI loads at http://localhost:9002
+- You can sign in with datahub/datahub credentials
+- You see the empty DataHub home page
+- All Docker containers are running properly
+
+**What you've learned:**
+
+- How to deploy DataHub locally using Docker
+- DataHub's core architecture components
+- How to verify a successful deployment
+
+
+Next: Ingest Your First Dataset
+