From 5c10a0998a1a7bdf4c2da50a98b83c270d8b6164 Mon Sep 17 00:00:00 2001 From: Jonny Dixon Date: Wed, 29 Oct 2025 13:58:02 +0000 Subject: [PATCH 01/10] initial commit --- .github/workflows/sync-tutorial-styles.yml | 98 ++ docs-website/build.gradle | 44 +- docs-website/generateDocsDir.ts | 1 + docs-website/package.json | 4 +- docs-website/scripts/sync-datahub-styles.js | 224 +++++ docs-website/sidebars.js | 189 ++++ .../components/ArchitectureDiagram/index.jsx | 110 +++ .../ArchitectureDiagram/styles.module.css | 216 +++++ .../components/DataHubEntityCard/index.jsx | 303 ++++++ .../DataHubEntityCard/styles.module.css | 503 ++++++++++ .../components/DataHubLineageNode/index.jsx | 708 ++++++++++++++ .../DataHubLineageNode/styles.module.css | 864 ++++++++++++++++++ .../components/InteractiveDiagram/index.jsx | 195 ++++ .../InteractiveDiagram/styles.module.css | 222 +++++ .../src/components/NextStepButton/index.jsx | 47 + .../NextStepButton/styles.module.css | 65 ++ .../src/components/OSDetectionTabs/index.jsx | 88 ++ .../OSDetectionTabs/styles.module.css | 77 ++ .../src/components/ProcessFlow/index.jsx | 148 +++ .../components/ProcessFlow/styles.module.css | 270 ++++++ .../src/components/StepCompletion/index.jsx | 50 + .../StepCompletion/styles.module.css | 76 ++ .../src/components/TutorialExercise/index.jsx | 137 +++ .../TutorialExercise/styles.module.css | 284 ++++++ .../src/components/TutorialProgress/index.jsx | 203 ++++ .../TutorialProgress/styles.module.css | 379 ++++++++ docs-website/src/css/mermaid-custom.css | 166 ++++ .../src/pages/datahub-components-demo.md | 165 ++++ docs-website/yarn.lock | 360 ++++++++ .../discovery/advanced-search.md | 592 ++++++++++++ .../discovery/collaborative-discovery.md | 543 +++++++++++ .../discovery/dataset-profiles.md | 512 +++++++++++ docs/learn-datahub/discovery/overview.md | 260 ++++++ .../governance/business-glossary.md | 306 +++++++ .../governance/data-classification.md | 271 ++++++ .../governance/governance-policies.md | 332 +++++++ docs/learn-datahub/governance/overview.md | 189 ++++ .../governance/ownership-management.md | 190 ++++ docs/learn-datahub/ingestion/overview.md | 195 ++++ docs/learn-datahub/lineage/impact-analysis.md | 642 +++++++++++++ docs/learn-datahub/lineage/overview.md | 231 +++++ docs/learn-datahub/lineage/reading-lineage.md | 427 +++++++++ docs/learn-datahub/lineage/troubleshooting.md | 738 +++++++++++++++ docs/learn-datahub/overview.md | 227 +++++ docs/learn-datahub/privacy/overview.md | 184 ++++ docs/learn-datahub/quality/data-assertions.md | 367 ++++++++ .../quality/incident-management.md | 363 ++++++++ docs/learn-datahub/quality/overview.md | 155 ++++ .../quality/quality-automation.md | 572 ++++++++++++ .../quality/quality-monitoring.md | 384 ++++++++ .../quickstart/discovery-basics.md | 412 +++++++++ .../quickstart/first-ingestion.md | 449 +++++++++ .../learn-datahub/quickstart/first-lineage.md | 362 ++++++++ docs/learn-datahub/quickstart/overview.md | 137 +++ docs/learn-datahub/quickstart/setup.md | 460 ++++++++++ 55 files changed, 15692 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/sync-tutorial-styles.yml create mode 100644 docs-website/scripts/sync-datahub-styles.js create mode 100644 docs-website/src/components/ArchitectureDiagram/index.jsx create mode 100644 docs-website/src/components/ArchitectureDiagram/styles.module.css create mode 100644 docs-website/src/components/DataHubEntityCard/index.jsx create mode 100644 docs-website/src/components/DataHubEntityCard/styles.module.css create mode 100644 
docs-website/src/components/DataHubLineageNode/index.jsx create mode 100644 docs-website/src/components/DataHubLineageNode/styles.module.css create mode 100644 docs-website/src/components/InteractiveDiagram/index.jsx create mode 100644 docs-website/src/components/InteractiveDiagram/styles.module.css create mode 100644 docs-website/src/components/NextStepButton/index.jsx create mode 100644 docs-website/src/components/NextStepButton/styles.module.css create mode 100644 docs-website/src/components/OSDetectionTabs/index.jsx create mode 100644 docs-website/src/components/OSDetectionTabs/styles.module.css create mode 100644 docs-website/src/components/ProcessFlow/index.jsx create mode 100644 docs-website/src/components/ProcessFlow/styles.module.css create mode 100644 docs-website/src/components/StepCompletion/index.jsx create mode 100644 docs-website/src/components/StepCompletion/styles.module.css create mode 100644 docs-website/src/components/TutorialExercise/index.jsx create mode 100644 docs-website/src/components/TutorialExercise/styles.module.css create mode 100644 docs-website/src/components/TutorialProgress/index.jsx create mode 100644 docs-website/src/components/TutorialProgress/styles.module.css create mode 100644 docs-website/src/css/mermaid-custom.css create mode 100644 docs-website/src/pages/datahub-components-demo.md create mode 100644 docs/learn-datahub/discovery/advanced-search.md create mode 100644 docs/learn-datahub/discovery/collaborative-discovery.md create mode 100644 docs/learn-datahub/discovery/dataset-profiles.md create mode 100644 docs/learn-datahub/discovery/overview.md create mode 100644 docs/learn-datahub/governance/business-glossary.md create mode 100644 docs/learn-datahub/governance/data-classification.md create mode 100644 docs/learn-datahub/governance/governance-policies.md create mode 100644 docs/learn-datahub/governance/overview.md create mode 100644 docs/learn-datahub/governance/ownership-management.md create mode 100644 docs/learn-datahub/ingestion/overview.md create mode 100644 docs/learn-datahub/lineage/impact-analysis.md create mode 100644 docs/learn-datahub/lineage/overview.md create mode 100644 docs/learn-datahub/lineage/reading-lineage.md create mode 100644 docs/learn-datahub/lineage/troubleshooting.md create mode 100644 docs/learn-datahub/overview.md create mode 100644 docs/learn-datahub/privacy/overview.md create mode 100644 docs/learn-datahub/quality/data-assertions.md create mode 100644 docs/learn-datahub/quality/incident-management.md create mode 100644 docs/learn-datahub/quality/overview.md create mode 100644 docs/learn-datahub/quality/quality-automation.md create mode 100644 docs/learn-datahub/quality/quality-monitoring.md create mode 100644 docs/learn-datahub/quickstart/discovery-basics.md create mode 100644 docs/learn-datahub/quickstart/first-ingestion.md create mode 100644 docs/learn-datahub/quickstart/first-lineage.md create mode 100644 docs/learn-datahub/quickstart/overview.md create mode 100644 docs/learn-datahub/quickstart/setup.md diff --git a/.github/workflows/sync-tutorial-styles.yml b/.github/workflows/sync-tutorial-styles.yml new file mode 100644 index 00000000000000..04d006836bbd80 --- /dev/null +++ b/.github/workflows/sync-tutorial-styles.yml @@ -0,0 +1,98 @@ +name: Sync Tutorial Component Styles + +on: + # Run when DataHub web-react styles are updated + push: + paths: + - "datahub-web-react/src/alchemy-components/theme/**" + - "datahub-web-react/src/alchemy-components/theme/foundations/colors.ts" + - 
"datahub-web-react/src/alchemy-components/theme/semantic-tokens.ts" + + # Allow manual triggering + workflow_dispatch: + + # Run weekly to catch any missed updates + schedule: + - cron: "0 2 * * 1" # Every Monday at 2 AM UTC + +jobs: + sync-styles: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: "18" + cache: "yarn" + cache-dependency-path: docs-website/yarn.lock + + - name: Install dependencies + working-directory: docs-website + run: yarn install --frozen-lockfile + + - name: Sync DataHub styles + working-directory: docs-website + run: yarn sync-datahub-styles + + - name: Check for changes + id: changes + run: | + if git diff --quiet; then + echo "changed=false" >> $GITHUB_OUTPUT + else + echo "changed=true" >> $GITHUB_OUTPUT + fi + + - name: Commit and push changes + if: steps.changes.outputs.changed == 'true' + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + git add docs-website/src/components/*/styles.module.css + git commit -m "🎨 Auto-sync tutorial component styles with DataHub UI + + - Updated design tokens from DataHub web-react + - Synced colors, spacing, and styling + - Ensures tutorial components match actual DataHub UI" + git push + + - name: Create PR comment (if applicable) + if: steps.changes.outputs.changed == 'true' && github.event_name == 'push' + uses: actions/github-script@v7 + with: + script: | + const { owner, repo } = context.repo; + const sha = context.sha; + + // Find associated PR + const prs = await github.rest.pulls.list({ + owner, + repo, + state: 'open', + head: `${owner}:${context.ref.replace('refs/heads/', '')}` + }); + + if (prs.data.length > 0) { + const pr = prs.data[0]; + await github.rest.issues.createComment({ + owner, + repo, + issue_number: pr.number, + body: `🎨 **Tutorial styles auto-updated!** + + The tutorial components have been automatically updated to match the latest DataHub UI styling changes. 
+ + **What changed:** + - Design tokens synced from \`datahub-web-react\` + - Component colors and styling updated + - Tutorial components now match production DataHub UI + + This ensures users see consistent styling between tutorials and the actual DataHub interface.` + }); + } diff --git a/docs-website/build.gradle b/docs-website/build.gradle index ee3cffe2e4cc71..d67f69e6b58415 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -110,7 +110,38 @@ task yarnInstall(type: YarnTask) { outputs.dir('node_modules') } -task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, +// Sync tutorial component styles with DataHub UI at build time +task syncDataHubStyles(type: YarnTask, dependsOn: [yarnInstall]) { + description = 'Sync tutorial component styles with actual DataHub UI design tokens' + + // Input: DataHub design token files + inputs.files( + file('../datahub-web-react/src/alchemy-components/theme/foundations/colors.ts'), + file('../datahub-web-react/src/alchemy-components/theme/semantic-tokens.ts'), + file('scripts/sync-datahub-styles.js') + ) + + // Output: Component style files + outputs.files( + file('src/components/DataHubEntityCard/styles.module.css'), + file('src/components/DataHubLineageNode/styles.module.css') + ) + + // Cache the sync result for performance + outputs.cacheIf { true } + + args = ['run', 'sync-datahub-styles'] + + doFirst { + logger.info('🎨 Syncing tutorial component styles with DataHub UI...') + } + + doLast { + logger.info('✅ Tutorial component styles synced successfully') + } +} + +task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, syncDataHubStyles, generateGraphQLSchema, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen', ]) { @@ -142,8 +173,9 @@ task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall]) { args = ['run', 'lint-fix'] } -task serve(type: YarnTask, dependsOn: [yarnInstall] ) { - args = ['run', 'serve'] +// Development server with hot reloads (recommended for development) +task dev(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate, downloadHistoricalVersions]) { + args = ['run', 'start'] } @@ -167,6 +199,12 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnLint, yarnGenerate, downloadHisto args = ['run', 'build'] } + +// Serve built site (requires build first) +task serve(type: YarnTask, dependsOn: [yarnBuild] ) { + args = ['run', 'serve'] +} + task yarnClear(type: YarnTask) { args = ['run','clear'] } diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index dbae4b5ee1e77f..b695ff60f66501 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -199,6 +199,7 @@ const allowed_broken_links = [ "docs/how/graph-onboarding.md", "docs/how/search-onboarding.md", "docs/how/build-metadata-service.md", + "docs/learn-datahub/overview.md", ]; function markdown_guess_title( diff --git a/docs-website/package.json b/docs-website/package.json index 5501cfc2a5d718..aad7485c55ef90 100644 --- a/docs-website/package.json +++ b/docs-website/package.json @@ -20,7 +20,8 @@ "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js", "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js", "_list-link-check-files": "find ./genDocs -name '*.md' -not \\( -path './genDocs/python-sdk/*' -o -path './genDocs/releases.md' \\)", - "check-links": "yarn run -s _list-link-check-files -print0 | xargs -0 -n1 -t markdown-link-check -q -c markdown-link-check-config.json" + "check-links": "yarn run -s 
_list-link-check-files -print0 | xargs -0 -n1 -t markdown-link-check -q -c markdown-link-check-config.json", + "sync-datahub-styles": "node scripts/sync-datahub-styles.js" }, "dependencies": { "@ant-design/icons": "^4.7.0", @@ -48,6 +49,7 @@ "react": "^18.2.0", "react-dom": "18.2.0", "react-use-draggable-scroll": "^0.4.7", + "reactflow": "^11.11.4", "sass": "^1.43.2", "swc-loader": "^0.2.6", "swiper": "^11.1.4", diff --git a/docs-website/scripts/sync-datahub-styles.js b/docs-website/scripts/sync-datahub-styles.js new file mode 100644 index 00000000000000..dd5291a6bf0070 --- /dev/null +++ b/docs-website/scripts/sync-datahub-styles.js @@ -0,0 +1,224 @@ +#!/usr/bin/env node + +/** + * Sync DataHub Styles Script + * + * This script automatically extracts design tokens from the DataHub web-react + * codebase and updates the tutorial component styles to match the actual UI. + * + * Usage: node scripts/sync-datahub-styles.js + */ + +const fs = require('fs'); +const path = require('path'); + +// Paths +const DATAHUB_COLORS_PATH = '../../datahub-web-react/src/alchemy-components/theme/foundations/colors.ts'; +const DATAHUB_SEMANTIC_TOKENS_PATH = '../../datahub-web-react/src/alchemy-components/theme/semantic-tokens.ts'; +const DOCS_COMPONENTS_DIR = './src/components'; + +/** + * Extract color values from DataHub's colors.ts file + */ +function extractDataHubColors() { + try { + const colorsFile = fs.readFileSync(path.resolve(__dirname, DATAHUB_COLORS_PATH), 'utf8'); + + // Extract color definitions using regex + const colorMatches = colorsFile.match(/(\w+):\s*{([^}]+)}/g) || []; + const singleColorMatches = colorsFile.match(/(\w+):\s*'([^']+)'/g) || []; + + const colors = {}; + + // Parse nested color objects (e.g., gray: { 100: '#EBECF0', ... }) + colorMatches.forEach(match => { + const [, colorName, colorValues] = match.match(/(\w+):\s*{([^}]+)}/); + const values = {}; + + const valueMatches = colorValues.match(/(\d+):\s*'([^']+)'/g) || []; + valueMatches.forEach(valueMatch => { + const [, key, value] = valueMatch.match(/(\d+):\s*'([^']+)'/); + values[key] = value; + }); + + colors[colorName] = values; + }); + + // Parse single color values (e.g., white: '#FFFFFF') + singleColorMatches.forEach(match => { + const [, colorName, colorValue] = match.match(/(\w+):\s*'([^']+)'/); + colors[colorName] = colorValue; + }); + + return colors; + } catch (error) { + console.warn('Could not read DataHub colors file:', error.message); + return null; + } +} + +/** + * Extract semantic tokens from DataHub's semantic-tokens.ts file + */ +function extractSemanticTokens() { + try { + const semanticFile = fs.readFileSync(path.resolve(__dirname, DATAHUB_SEMANTIC_TOKENS_PATH), 'utf8'); + + // Extract semantic token mappings + const tokenMatches = semanticFile.match(/'([^']+)':\s*colors\.([^,\s]+)/g) || []; + const tokens = {}; + + tokenMatches.forEach(match => { + const [, tokenName, colorPath] = match.match(/'([^']+)':\s*colors\.([^,\s]+)/); + tokens[tokenName] = colorPath; + }); + + return tokens; + } catch (error) { + console.warn('Could not read DataHub semantic tokens file:', error.message); + return null; + } +} + +/** + * Generate CSS variables from DataHub colors + */ +function generateCSSVariables(colors, semanticTokens) { + if (!colors) return ''; + + let cssVars = `/* Auto-generated DataHub Design Tokens */\n:root {\n`; + + // Core color mappings based on DataHub's actual usage + const colorMappings = { + 'datahub-primary': colors.primary?.[500] || colors.violet?.[500] || '#533FD1', + 'datahub-primary-dark': 
colors.primary?.[600] || colors.violet?.[600] || '#4C39BE', + 'datahub-primary-light': colors.primary?.[400] || colors.violet?.[400] || '#7565DA', + 'datahub-primary-lightest': colors.primary?.[0] || colors.violet?.[0] || '#F1F3FD', + 'datahub-gray-100': colors.gray?.[100] || '#EBECF0', + 'datahub-gray-600': colors.gray?.[600] || '#374066', + 'datahub-gray-1700': colors.gray?.[1700] || '#5F6685', + 'datahub-gray-1800': colors.gray?.[1800] || '#8088A3', + 'datahub-gray-1500': colors.gray?.[1500] || '#F9FAFC', + 'datahub-white': colors.white || '#FFFFFF', + 'datahub-success': colors.green?.[500] || '#77B750', + 'datahub-warning': colors.yellow?.[500] || '#EEAE09', + 'datahub-error': colors.red?.[500] || '#CD0D24', + 'datahub-border': colors.gray?.[1400] || '#E9EAEE', + }; + + // Add CSS variables + Object.entries(colorMappings).forEach(([varName, value]) => { + cssVars += ` --${varName}: ${value};\n`; + }); + + // Add shadows and other design tokens + cssVars += ` --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07);\n`; + cssVars += ` --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15);\n`; + cssVars += ` --datahub-node-width: 320px;\n`; + cssVars += ` --datahub-node-height: 90px;\n`; + cssVars += ` --datahub-transformation-size: 40px;\n`; + cssVars += `}\n\n`; + + // Dark mode variables + cssVars += `/* Dark mode colors */\n[data-theme='dark'] {\n`; + const darkMappings = { + 'datahub-primary': colors.primary?.[400] || colors.violet?.[400] || '#7565DA', + 'datahub-primary-dark': colors.primary?.[500] || colors.violet?.[500] || '#533FD1', + 'datahub-primary-light': colors.primary?.[300] || colors.violet?.[300] || '#8C7EE0', + 'datahub-primary-lightest': colors.primary?.[800] || colors.violet?.[800] || '#2E2373', + 'datahub-gray-100': colors.gray?.[700] || '#2F3657', + 'datahub-gray-600': colors.gray?.[200] || '#CFD1DA', + 'datahub-gray-1700': colors.gray?.[300] || '#A9ADBD', + 'datahub-gray-1800': colors.gray?.[400] || '#81879F', + 'datahub-gray-1500': colors.gray?.[2000] || '#1E2338', + 'datahub-white': colors.gray?.[800] || '#272D48', + 'datahub-border': colors.gray?.[600] || '#374066', + }; + + Object.entries(darkMappings).forEach(([varName, value]) => { + cssVars += ` --${varName}: ${value};\n`; + }); + + cssVars += `}\n\n`; + + return cssVars; +} + +/** + * Update component CSS files with new design tokens + */ +function updateComponentStyles(cssVariables) { + const componentDirs = ['DataHubEntityCard', 'DataHubLineageNode']; + + componentDirs.forEach(componentDir => { + const styleFile = path.join(DOCS_COMPONENTS_DIR, componentDir, 'styles.module.css'); + + try { + let content = fs.readFileSync(styleFile, 'utf8'); + + // Replace the CSS variables section + const variableRegex = /\/\* Auto-generated DataHub Design Tokens \*\/[\s\S]*?}\s*\n\s*\n/; + + if (variableRegex.test(content)) { + content = content.replace(variableRegex, cssVariables); + } else { + // If no existing variables section, add at the top + content = cssVariables + content; + } + + fs.writeFileSync(styleFile, content); + console.log(`✅ Updated ${componentDir} styles`); + + } catch (error) { + console.error(`❌ Failed to update ${componentDir}:`, error.message); + } + }); +} + +/** + * Main execution + */ +function main() { + console.log('🔄 Syncing DataHub styles...\n'); + + const colors = extractDataHubColors(); + const semanticTokens = extractSemanticTokens(); + + if (!colors) { + console.warn('⚠️ Could not extract DataHub colors from source files.'); + console.log(' Using fallback design tokens to ensure 
build continues...\n'); + + // Use fallback colors to ensure build doesn't fail + const fallbackColors = { + primary: { 500: '#533FD1', 600: '#4C39BE', 400: '#7565DA', 0: '#F1F3FD' }, + gray: { 100: '#EBECF0', 600: '#374066', 1700: '#5F6685', 1800: '#8088A3', 1500: '#F9FAFC' }, + white: '#FFFFFF', + green: { 500: '#77B750' }, + yellow: { 500: '#EEAE09' }, + red: { 500: '#CD0D24' } + }; + + const cssVariables = generateCSSVariables(fallbackColors, null); + updateComponentStyles(cssVariables); + + console.log('✅ Applied fallback styling - components will use default DataHub colors'); + return; + } + + console.log('📊 Extracted DataHub design tokens'); + console.log(` - Colors: ${Object.keys(colors).length} palettes`); + console.log(` - Semantic tokens: ${semanticTokens ? Object.keys(semanticTokens).length : 0} mappings\n`); + + const cssVariables = generateCSSVariables(colors, semanticTokens); + updateComponentStyles(cssVariables); + + console.log('\n🎉 DataHub styles sync completed!'); + console.log(' Tutorial components now match the latest DataHub UI styling.'); +} + +// Run the script +if (require.main === module) { + main(); +} + +module.exports = { extractDataHubColors, generateCSSVariables, updateComponentStyles }; diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 27d538bea639f8..07f1e37aa9b40a 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -34,6 +34,195 @@ module.exports = { }, ], }, + { + type: "category", + label: "Learn DataHub", + collapsed: false, + link: { type: "doc", id: "docs/learn-datahub/overview" }, + items: [ + { + type: "category", + label: "DataHub Quickstart (30 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/quickstart/overview" }, + items: [ + { + type: "doc", + label: "Setup DataHub (5 min)", + id: "docs/learn-datahub/quickstart/setup", + }, + { + type: "doc", + label: "First Data Ingestion (10 min)", + id: "docs/learn-datahub/quickstart/first-ingestion", + }, + { + type: "doc", + label: "Discovery Basics (10 min)", + id: "docs/learn-datahub/quickstart/discovery-basics", + }, + { + type: "doc", + label: "Your First Lineage (5 min)", + id: "docs/learn-datahub/quickstart/first-lineage", + }, + ], + }, + { + type: "category", + label: "Data Discovery & Search (45 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/discovery/overview" }, + items: [ + { + type: "doc", + label: "Advanced Search Techniques (15 min)", + id: "docs/learn-datahub/discovery/advanced-search", + }, + { + type: "doc", + label: "Understanding Dataset Profiles (20 min)", + id: "docs/learn-datahub/discovery/dataset-profiles", + }, + { + type: "doc", + label: "Collaborative Discovery (10 min)", + id: "docs/learn-datahub/discovery/collaborative-discovery", + }, + ], + }, + { + type: "category", + label: "Data Lineage & Impact Analysis (40 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/lineage/overview" }, + items: [ + { + type: "doc", + label: "Reading Lineage Graphs (15 min)", + id: "docs/learn-datahub/lineage/reading-lineage", + }, + { + type: "doc", + label: "Performing Impact Analysis (15 min)", + id: "docs/learn-datahub/lineage/impact-analysis", + }, + { + type: "doc", + label: "Lineage Troubleshooting (10 min)", + id: "docs/learn-datahub/lineage/troubleshooting", + }, + ], + }, + { + type: "category", + label: "Data Governance Fundamentals (50 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/governance/overview" }, + items: [ + { + type: "doc", + label: 
"Ownership Management (12 min)", + id: "docs/learn-datahub/governance/ownership-management", + }, + { + type: "doc", + label: "Data Classification (15 min)", + id: "docs/learn-datahub/governance/data-classification", + }, + { + type: "doc", + label: "Business Glossary (12 min)", + id: "docs/learn-datahub/governance/business-glossary", + }, + { + type: "doc", + label: "Governance Policies (11 min)", + id: "docs/learn-datahub/governance/governance-policies", + }, + ], + }, + { + type: "category", + label: "Data Quality & Monitoring (45 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/quality/overview" }, + items: [ + { + type: "doc", + label: "Data Assertions (15 min)", + id: "docs/learn-datahub/quality/data-assertions", + }, + { + type: "doc", + label: "Quality Monitoring (12 min)", + id: "docs/learn-datahub/quality/quality-monitoring", + }, + { + type: "doc", + label: "Incident Management (10 min)", + id: "docs/learn-datahub/quality/incident-management", + }, + { + type: "doc", + label: "Quality Automation (8 min)", + id: "docs/learn-datahub/quality/quality-automation", + }, + ], + }, + // { + // type: "category", + // label: "Data Ingestion Mastery (60 min)", + // collapsed: true, + // link: { type: "doc", id: "docs/learn-datahub/ingestion/overview" }, + // items: [ + // { + // type: "doc", + // label: "Recipe Fundamentals (15 min)", + // id: "docs/learn-datahub/ingestion/recipe-fundamentals", + // }, + // { + // type: "doc", + // label: "Stateful Ingestion (15 min)", + // id: "docs/learn-datahub/ingestion/stateful-ingestion", + // }, + // { + // type: "doc", + // label: "Data Profiling (15 min)", + // id: "docs/learn-datahub/ingestion/data-profiling", + // }, + // { + // type: "doc", + // label: "Advanced Patterns (15 min)", + // id: "docs/learn-datahub/ingestion/advanced-patterns", + // }, + // ], + // }, + // { + // type: "category", + // label: "Privacy & Compliance (35 min)", + // collapsed: true, + // link: { type: "doc", id: "docs/learn-datahub/privacy/overview" }, + // items: [ + // { + // type: "doc", + // label: "PII Detection (12 min)", + // id: "docs/learn-datahub/privacy/pii-detection", + // }, + // { + // type: "doc", + // label: "Privacy Controls (12 min)", + // id: "docs/learn-datahub/privacy/privacy-controls", + // }, + // { + // type: "doc", + // label: "Compliance Workflows (11 min)", + // id: "docs/learn-datahub/privacy/compliance-workflows", + // }, + // ], + // }, + ], + }, { type: "category", label: "Features", diff --git a/docs-website/src/components/ArchitectureDiagram/index.jsx b/docs-website/src/components/ArchitectureDiagram/index.jsx new file mode 100644 index 00000000000000..8a42048167064e --- /dev/null +++ b/docs-website/src/components/ArchitectureDiagram/index.jsx @@ -0,0 +1,110 @@ +import React from 'react'; +import styles from './styles.module.css'; + +const ArchitectureDiagram = ({ type = 'integration' }) => { + if (type === 'integration') { + return ( +
+      <div className={styles.architectureDiagram}>
+        <div className={styles.diagramTitle}>DataHub Integration Architecture</div>
+
+        <div className={styles.diagramContainer}>
+          {/* Source Systems Layer */}
+          <div className={styles.layer}>
+            <div className={styles.layerTitle}>Source Systems</div>
+            <div className={styles.nodeGroup}>
+              <div className={`${styles.node} ${styles.sourceNode}`}>
+                <div className={styles.nodeIcon}>🗄️</div>
+                <div className={styles.nodeLabel}>Kafka Streams</div>
+                <div className={styles.nodeSubtext}>Real-time Events</div>
+              </div>
+              <div className={`${styles.node} ${styles.sourceNode}`}>
+                <div className={styles.nodeIcon}>🏢</div>
+                <div className={styles.nodeLabel}>Hive Tables</div>
+                <div className={styles.nodeSubtext}>Data Warehouse</div>
+              </div>
+              <div className={`${styles.node} ${styles.sourceNode}`}>
+                <div className={styles.nodeIcon}>📁</div>
+                <div className={styles.nodeLabel}>HDFS Files</div>
+                <div className={styles.nodeSubtext}>Data Lake</div>
+              </div>
+            </div>
+          </div>
+
+          {/* Arrows */}
+          <div className={styles.arrowLayer}>
+            <div className={styles.arrow}>→</div>
+            <div className={styles.arrow}>→</div>
+            <div className={styles.arrow}>→</div>
+          </div>
+
+          {/* DataHub Core Layer */}
+          <div className={styles.layer}>
+            <div className={styles.layerTitle}>DataHub Core</div>
+            <div className={styles.nodeGroup}>
+              <div className={`${styles.node} ${styles.coreNode}`}>
+                <div className={styles.nodeIcon}>🔗</div>
+                <div className={styles.nodeLabel}>Metadata API</div>
+                <div className={styles.nodeSubtext}>GraphQL & REST</div>
+              </div>
+              <div className={`${styles.node} ${styles.coreNode}`}>
+                <div className={styles.nodeIcon}>🕸️</div>
+                <div className={styles.nodeLabel}>Graph Database</div>
+                <div className={styles.nodeSubtext}>Relationships</div>
+              </div>
+              <div className={`${styles.node} ${styles.coreNode}`}>
+                <div className={styles.nodeIcon}>🔍</div>
+                <div className={styles.nodeLabel}>Search Index</div>
+                <div className={styles.nodeSubtext}>Elasticsearch</div>
+              </div>
+            </div>
+          </div>
+
+          {/* Arrows */}
+          <div className={styles.arrowLayer}>
+            <div className={styles.arrow}>→</div>
+            <div className={styles.arrow}>→</div>
+            <div className={styles.arrow}>→</div>
+          </div>
+
+          {/* User Interface Layer */}
+          <div className={styles.layer}>
+            <div className={styles.layerTitle}>User Interface</div>
+            <div className={styles.nodeGroup}>
+              <div className={`${styles.node} ${styles.uiNode}`}>
+                <div className={styles.nodeIcon}>🔎</div>
+                <div className={styles.nodeLabel}>Search & Browse</div>
+                <div className={styles.nodeSubtext}>Data Discovery</div>
+              </div>
+              <div className={`${styles.node} ${styles.uiNode}`}>
+                <div className={styles.nodeIcon}>🌐</div>
+                <div className={styles.nodeLabel}>Lineage View</div>
+                <div className={styles.nodeSubtext}>Data Flow</div>
+              </div>
+              <div className={`${styles.node} ${styles.uiNode}`}>
+                <div className={styles.nodeIcon}>📊</div>
+                <div className={styles.nodeLabel}>Data Profiles</div>
+                <div className={styles.nodeSubtext}>Quality Metrics</div>
+              </div>
+            </div>
+          </div>
+        </div>
+
+        <div className={styles.diagramFooter}>
+          <div className={styles.dataFlow}>
+            <span className={styles.flowLabel}>Data Flow:</span>
+            <span className={styles.flowStep}>Extract Metadata</span>
+            <span className={styles.flowArrow}>→</span>
+            <span className={styles.flowStep}>Process & Store</span>
+            <span className={styles.flowArrow}>→</span>
+            <span className={styles.flowStep}>Search & Discover</span>
+          </div>
+        </div>
+      </div>
+ ); + } + + // Add other diagram types as needed + return null; +}; + +export default ArchitectureDiagram; diff --git a/docs-website/src/components/ArchitectureDiagram/styles.module.css b/docs-website/src/components/ArchitectureDiagram/styles.module.css new file mode 100644 index 00000000000000..64ae703bffd60c --- /dev/null +++ b/docs-website/src/components/ArchitectureDiagram/styles.module.css @@ -0,0 +1,216 @@ +/* Architecture Diagram Styles */ +.architectureDiagram { + background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%); + border: 2px solid var(--ifm-color-primary-light); + border-radius: 12px; + padding: 24px; + margin: 24px 0; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1); +} + +.diagramTitle { + text-align: center; + font-size: 1.4rem; + font-weight: 600; + color: var(--ifm-color-primary-dark); + margin-bottom: 24px; + padding-bottom: 12px; + border-bottom: 2px solid var(--ifm-color-primary-lightest); +} + +.diagramContainer { + display: flex; + align-items: center; + justify-content: space-between; + gap: 16px; + flex-wrap: wrap; +} + +.layer { + flex: 1; + min-width: 200px; +} + +.layerTitle { + text-align: center; + font-size: 1.1rem; + font-weight: 600; + color: var(--ifm-color-emphasis-700); + margin-bottom: 16px; + padding: 8px 12px; + background: var(--ifm-color-primary-lightest); + border-radius: 6px; + border: 1px solid var(--ifm-color-primary-light); +} + +.nodeGroup { + display: flex; + flex-direction: column; + gap: 12px; +} + +.node { + background: white; + border: 2px solid; + border-radius: 8px; + padding: 12px; + text-align: center; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08); + transition: all 0.3s ease; + cursor: pointer; +} + +.node:hover { + transform: translateY(-2px); + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15); +} + +.sourceNode { + border-color: #10b981; + background: linear-gradient(135deg, #ecfdf5 0%, #d1fae5 100%); +} + +.sourceNode:hover { + border-color: #059669; + background: linear-gradient(135deg, #d1fae5 0%, #a7f3d0 100%); +} + +.coreNode { + border-color: #3b82f6; + background: linear-gradient(135deg, #eff6ff 0%, #dbeafe 100%); +} + +.coreNode:hover { + border-color: #2563eb; + background: linear-gradient(135deg, #dbeafe 0%, #bfdbfe 100%); +} + +.uiNode { + border-color: #8b5cf6; + background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%); +} + +.uiNode:hover { + border-color: #7c3aed; + background: linear-gradient(135deg, #e5e7eb 0%, #d1d5db 100%); +} + +.nodeIcon { + font-size: 1.5rem; + margin-bottom: 8px; +} + +.nodeLabel { + font-weight: 600; + font-size: 0.9rem; + color: var(--ifm-color-emphasis-800); + margin-bottom: 4px; +} + +.nodeSubtext { + font-size: 0.75rem; + color: var(--ifm-color-emphasis-600); + font-style: italic; +} + +.arrowLayer { + display: flex; + flex-direction: column; + justify-content: center; + align-items: center; + gap: 12px; + flex-shrink: 0; +} + +.arrow { + font-size: 1.5rem; + color: var(--ifm-color-primary); + font-weight: bold; + animation: pulse 2s infinite; +} + +@keyframes pulse { + 0%, 100% { opacity: 0.7; } + 50% { opacity: 1; } +} + +.diagramFooter { + margin-top: 24px; + padding-top: 16px; + border-top: 1px solid var(--ifm-color-primary-lightest); +} + +.dataFlow { + display: flex; + align-items: center; + justify-content: center; + gap: 12px; + flex-wrap: wrap; +} + +.flowLabel { + font-weight: 600; + color: var(--ifm-color-primary-dark); +} + +.flowStep { + background: var(--ifm-color-primary-lightest); + padding: 6px 12px; + border-radius: 20px; + font-size: 0.85rem; + font-weight: 500; + 
color: var(--ifm-color-primary-dark); + border: 1px solid var(--ifm-color-primary-light); +} + +.flowArrow { + color: var(--ifm-color-primary); + font-weight: bold; +} + +/* Responsive Design */ +@media (max-width: 768px) { + .diagramContainer { + flex-direction: column; + } + + .arrowLayer { + flex-direction: row; + transform: rotate(90deg); + } + + .arrow { + transform: rotate(90deg); + } + + .dataFlow { + flex-direction: column; + gap: 8px; + } +} + +/* Dark mode support */ +[data-theme='dark'] .architectureDiagram { + background: linear-gradient(135deg, #1e293b 0%, #334155 100%); + border-color: var(--ifm-color-primary-dark); +} + +[data-theme='dark'] .node { + background: var(--ifm-color-emphasis-100); + color: var(--ifm-color-emphasis-800); +} + +[data-theme='dark'] .sourceNode { + background: linear-gradient(135deg, #064e3b 0%, #065f46 100%); + color: #ecfdf5; +} + +[data-theme='dark'] .coreNode { + background: linear-gradient(135deg, #1e3a8a 0%, #1e40af 100%); + color: #eff6ff; +} + +[data-theme='dark'] .uiNode { + background: linear-gradient(135deg, #581c87 0%, #6b21a8 100%); + color: #f3f4f6; +} diff --git a/docs-website/src/components/DataHubEntityCard/index.jsx b/docs-website/src/components/DataHubEntityCard/index.jsx new file mode 100644 index 00000000000000..813827912ddd46 --- /dev/null +++ b/docs-website/src/components/DataHubEntityCard/index.jsx @@ -0,0 +1,303 @@ +import React from 'react'; +import styles from './styles.module.css'; + +// Health icon components matching DataHub's HealthIcon (same as lineage nodes) +const HealthIcon = ({ health, size = 14 }) => { + const iconStyle = { + width: `${size}px`, + height: `${size}px`, + display: 'inline-block', + marginLeft: '6px', + verticalAlign: 'middle', + }; + + if (health === 'Good') { + return ( + + + + ); + } + + if (health === 'Warning' || health === 'Critical') { + const color = health === 'Critical' ? 
'#ff4d4f' : '#faad14'; + return ( + + + + ); + } + + return null; +}; + +// Simplified version of DataHub's DefaultPreviewCard for tutorials +const DataHubEntityCard = ({ + name, + type = 'Dataset', + platform = 'Hive', + description, + owners = [], + tags = [], + glossaryTerms = [], + assertions = { passing: 0, failing: 0, total: 0 }, + health = 'Good', + url = '#', + className = '', +}) => { + // Use actual DataHub platform logos from the docs website + const getPlatformLogo = (platformName) => { + const logoMap = { + 'Hive': '/img/logos/platforms/hive.svg', + 'Kafka': '/img/logos/platforms/kafka.svg', + 'HDFS': '/img/logos/platforms/hadoop.svg', + 'Snowflake': '/img/logos/platforms/snowflake.svg', + 'BigQuery': '/img/logos/platforms/bigquery.svg', + 'Spark': '/img/logos/platforms/spark.svg', + 'PostgreSQL': '/img/logos/platforms/postgres.svg', + 'Postgres': '/img/logos/platforms/postgres.svg', + 'postgres': '/img/logos/platforms/postgres.svg', + 'MySQL': '/img/logos/platforms/mysql.svg', + 'MongoDB': '/img/logos/platforms/mongodb.svg', + 'Elasticsearch': '/img/logos/platforms/elasticsearch.svg', + 'Redshift': '/img/logos/platforms/redshift.svg', + 'Databricks': '/img/logos/platforms/databricks.png', + 'dbt': '/img/logos/platforms/dbt.svg', + 'Airflow': '/img/logos/platforms/airflow.svg', + 'Looker': '/img/logos/platforms/looker.svg', + 'Tableau': '/img/logos/platforms/tableau.png', + 'PowerBI': '/img/logos/platforms/powerbi.png', + 'Superset': '/img/logos/platforms/superset.svg', + }; + return logoMap[platformName] || '/img/logos/platforms/acryl.svg'; + }; + + const healthColors = { + 'Good': '#52c41a', + 'Warning': '#faad14', + 'Critical': '#ff4d4f', + }; + + // Get ownership type icon based on type + const getOwnershipTypeIcon = (ownershipType) => { + switch (ownershipType) { + case 'Technical Owner': + return '👨‍💻'; + case 'Business Owner': + return '👔'; + case 'Data Steward': + return '🛡️'; + case 'Data Owner': + return '📊'; + default: + return '👤'; + } + }; + + // Get assertion status icon + const getAssertionStatusIcon = (assertions) => { + if (assertions.total === 0) return null; + if (assertions.failing > 0) return '❌'; + if (assertions.passing === assertions.total) return '✅'; + return '⚠️'; + }; + + // Generate color hash for tags (matching DataHub's ColorHash) + const generateTagColor = (tagName) => { + let hash = 0; + for (let i = 0; i < tagName.length; i++) { + const char = tagName.charCodeAt(i); + hash = ((hash << 5) - hash) + char; + hash = hash & hash; + } + const hue = Math.abs(hash) % 360; + return `hsl(${hue}, 70%, 45%)`; + }; + + // Generate color for glossary terms + const generateTermColor = (termName) => { + const colors = [ + '#1890ff', '#52c41a', '#faad14', '#f5222d', '#722ed1', + '#fa541c', '#13c2c2', '#eb2f96', '#a0d911', '#fadb14' + ]; + let hash = 0; + for (let i = 0; i < termName.length; i++) { + hash = ((hash << 5) - hash) + termName.charCodeAt(i); + } + return colors[Math.abs(hash) % colors.length]; + }; + + // Tag component matching DataHub's StyledTag + const DataHubTag = ({ tag }) => ( +
+    <div className={styles.tag}>
+      <div className={styles.tagColorDot} style={{ backgroundColor: generateTagColor(tag) }} />
+      <span className={styles.tagText}>{tag}</span>
+    </div>
+  );
+
+  // Glossary term component matching DataHub's Term
+  const DataHubTerm = ({ term }) => (
+    <div className={styles.term}>
+      <div className={styles.termRibbon} style={{ backgroundColor: generateTermColor(term) }} />
+      <span className={styles.termText}>{term}</span>
+    </div>
+  );
+
+  return (
+    <div className={`${styles.entityCard} ${className}`}>
+      <div className={styles.header}>
+        <div className={styles.platformInfo}>
+          <img
+            src={getPlatformLogo(platform)}
+            alt={`${platform} logo`}
+            className={styles.platformLogo}
+          />
+          <span className={styles.type}>{type}</span>
+          <span className={styles.divider}>•</span>
+          <span className={styles.platform}>{platform}</span>
+        </div>
+      </div>
+
+      <div className={styles.content}>
+        <h3 className={styles.entityName}>
+          <a href={url} className={styles.entityLink}>
+            {name}
+            {health && <HealthIcon health={health} />}
+          </a>
+        </h3>
+
+        {description && (
+          <p className={styles.description}>{description}</p>
+        )}
+
+        {(tags.length > 0 || glossaryTerms.length > 0) && (
+          <div className={styles.tagTermGroup}>
+            {tags.map((tag, index) => (
+              <DataHubTag key={`tag-${index}`} tag={tag} />
+            ))}
+            {glossaryTerms.map((term, index) => (
+              <DataHubTerm key={`term-${index}`} term={term} />
+            ))}
+          </div>
+        )}
+
+        {owners.length > 0 && (
+          <div className={styles.ownership}>
+            <div className={styles.ownershipHeader}>Ownership</div>
+            {(() => {
+              // Group owners by type
+              const ownersByType = {};
+              owners.forEach(owner => {
+                const type = owner.type || 'Technical Owner';
+                if (!ownersByType[type]) ownersByType[type] = [];
+                ownersByType[type].push(owner);
+              });
+
+              return Object.entries(ownersByType).map(([type, typeOwners]) => (
+                <div key={type} className={styles.ownershipTypeGroup}>
+                  <div className={styles.ownershipTypeHeader}>
+                    <span className={styles.ownershipTypeIcon}>
+                      {getOwnershipTypeIcon(type)}
+                    </span>
+                    <span className={styles.ownershipTypeName}>{type}</span>
+                  </div>
+                  <div className={styles.ownersList}>
+                    {typeOwners.map((owner, index) => (
+                      <span key={index} className={styles.owner}>
+                        {owner.name || owner}
+                      </span>
+                    ))}
+                  </div>
+                </div>
+              ));
+            })()}
+          </div>
+        )}
+
+        {assertions.total > 0 && (
+          <div className={styles.assertions}>
+            <span className={styles.assertionsLabel}>Assertions:</span>
+            <span className={styles.assertionStatus}>
+              {getAssertionStatusIcon(assertions)}
+              <span className={styles.assertionText}>
+                {assertions.passing}/{assertions.total} passing
+              </span>
+            </span>
+          </div>
+        )}
+      </div>
+    </div>
+ ); +}; + +// Pre-configured sample entities for tutorials +export const SampleEntities = { + userCreatedTable: { + name: 'fct_users_created', + type: 'Table', + platform: 'Hive', + description: 'Fact table tracking user creation events with timestamps and attribution', + owners: [ + { name: 'john.doe@company.com', type: 'Technical Owner' }, + { name: 'sarah.smith@company.com', type: 'Business Owner' } + ], + tags: ['PII', 'User Analytics', 'Daily'], + glossaryTerms: ['User Metrics', 'Fact Table'], + assertions: { passing: 8, failing: 0, total: 8 }, + health: 'Good', + }, + + userDeletedTable: { + name: 'fct_users_deleted', + type: 'Table', + platform: 'Hive', + description: 'Fact table tracking user deletion events and reasons', + owners: [ + { name: 'john.doe@company.com', type: 'Technical Owner' } + ], + tags: ['User Analytics', 'Daily'], + glossaryTerms: ['User Metrics'], + assertions: { passing: 5, failing: 1, total: 6 }, + health: 'Good', + }, + + kafkaUserEvents: { + name: 'user_events', + type: 'Topic', + platform: 'Kafka', + description: 'Real-time stream of user activity events', + owners: [ + { name: 'data.engineering@company.com', type: 'Technical Owner' }, + { name: 'mike.wilson@company.com', type: 'Data Steward' } + ], + tags: ['Streaming', 'Real-time', 'PII'], + glossaryTerms: ['User Activity', 'Event Data'], + assertions: { passing: 12, failing: 0, total: 12 }, + health: 'Good', + }, + + rawUserData: { + name: 'raw_user_data', + type: 'Dataset', + platform: 'HDFS', + description: 'Raw user registration and profile data from application database', + owners: [ + { name: 'data.platform@company.com', type: 'Data Owner' } + ], + tags: ['Raw', 'PII', 'Hourly'], + glossaryTerms: ['Source Data', 'User Information'], + assertions: { passing: 3, failing: 2, total: 5 }, + health: 'Warning', + }, +}; + + +export default DataHubEntityCard; diff --git a/docs-website/src/components/DataHubEntityCard/styles.module.css b/docs-website/src/components/DataHubEntityCard/styles.module.css new file mode 100644 index 00000000000000..e6863d669b456d --- /dev/null +++ b/docs-website/src/components/DataHubEntityCard/styles.module.css @@ -0,0 +1,503 @@ +/* Auto-generated DataHub Design Tokens */ +:root { + --datahub-primary: #533FD1; + --datahub-primary-dark: #4C39BE; + --datahub-primary-light: #7565DA; + --datahub-primary-lightest: #F1F3FD; + --datahub-gray-100: #EBECF0; + --datahub-gray-600: #374066; + --datahub-gray-1700: #5F6685; + --datahub-gray-1800: #8088A3; + --datahub-gray-1500: #F9FAFC; + --datahub-white: #FFFFFF; + --datahub-success: #77B750; + --datahub-warning: #EEAE09; + --datahub-error: #CD0D24; + --datahub-border: #E9EAEE; + --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07); + --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15); + --datahub-node-width: 320px; + --datahub-node-height: 90px; + --datahub-transformation-size: 40px; +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: 
#CFD1DA;
+  --datahub-gray-1700: #A9ADBD;
+  --datahub-gray-1800: #81879F;
+  --datahub-gray-1500: #1E2338;
+  --datahub-white: #272D48;
+  --datahub-border: #374066;
+}
+
+/* Dark mode colors */
+[data-theme='dark'] {
+  --datahub-primary: #7565DA;
+  --datahub-primary-dark: #533FD1;
+
--datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +/* DataHub Entity Card - Uses actual DataHub design tokens */ + +/* Import DataHub color variables */ +:root { + /* DataHub Alchemy Design System Colors */ + --datahub-primary: #533FD1; + --datahub-primary-dark: #4C39BE; + --datahub-primary-light: #7565DA; + --datahub-primary-lightest: #F1F3FD; + --datahub-gray-100: #EBECF0; + --datahub-gray-600: #374066; + --datahub-gray-1700: #5F6685; + --datahub-gray-1800: #8088A3; + --datahub-gray-1500: #F9FAFC; + --datahub-white: #FFFFFF; + --datahub-success: #77B750; + --datahub-warning: #EEAE09; + --datahub-error: #CD0D24; + --datahub-border: #E9EAEE; + --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07); + --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15); +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +.entityCard { + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 12px; /* Match DataHub's border-radius */ + padding: 16px; + margin: 12px 0; + transition: all 0.2s ease; + box-shadow: var(--datahub-shadow); + cursor: pointer; +} + +.entityCard:hover { + border-color: var(--datahub-primary); + box-shadow: var(--datahub-shadow-hover); + transform: translateY(-1px); +} + +.header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 12px; +} + +.platformInfo { + display: flex; + align-items: center; + gap: 8px; + font-size: 13px; + color: var(--datahub-gray-1700); +} + +.platformLogo { + width: 16px; + height: 16px; + object-fit: contain; +} + +.type { + font-weight: 500; + color: var(--datahub-gray-600); +} + +.divider { + color: var(--datahub-gray-1800); +} + +.platform { + color: var(--datahub-gray-1700); +} + +.content { + display: flex; + flex-direction: column; + gap: 8px; +} + +.entityName { + margin: 0; + font-size: 16px; + font-weight: 600; + line-height: 1.3; + color: var(--datahub-gray-600); +} + +.entityLink { + color: var(--datahub-primary); + text-decoration: none; + transition: color 0.2s ease; + display: inline-flex; + align-items: center; +} + +.entityLink:hover { + color: var(--datahub-primary-dark); + text-decoration: underline; +} + +.description { + margin: 0; + font-size: 14px; + color: var(--datahub-gray-1700); + line-height: 1.4; +} + +.tags { + display: flex; + flex-wrap: wrap; + gap: 6px; +} + +.tag { + background: var(--datahub-primary-lightest); + color: var(--datahub-primary-dark); + padding: 4px 8px; + border-radius: 6px; /* Match DataHub's tag styling */ + font-size: 12px; + font-weight: 500; + border: 1px solid var(--datahub-primary-light); +} + +/* Tags and Glossary Terms */ +.tagTermGroup { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-top: 8px; + align-items: center; +} + +.tag { + display: flex; + align-items: center; + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 4px; + padding: 2px 6px; + font-size: 11px; + color: 
var(--datahub-gray-600); + max-width: 120px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.tagColorDot { + width: 6px; + height: 6px; + border-radius: 50%; + margin-right: 4px; + flex-shrink: 0; +} + +.tagText { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.term { + position: relative; + display: inline-flex; + align-items: center; + background: #f8f8f8; + border: 1px solid #ccd1dd; + border-radius: 5px; + padding: 3px 8px; + font-size: 12px; + font-weight: 400; + color: #565657; + max-width: 200px; + overflow: hidden; + cursor: pointer; + margin-left: 8px; /* Make room for ribbon */ +} + +.term:hover { + background-color: #f2f2f2; +} + +.termRibbon { + position: absolute; + left: -20px; + top: 4px; + width: 50px; + transform: rotate(-45deg); + padding: 4px; + opacity: 1; +} + +.termText { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + margin-left: 8px; +} + +/* Ownership Section */ +.ownership { + margin-top: 12px; +} + +.ownershipHeader { + font-size: 12px; + font-weight: 600; + color: var(--datahub-gray-600); + margin-bottom: 8px; + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.ownershipTypeGroup { + margin-bottom: 8px; +} + +.ownershipTypeGroup:last-child { + margin-bottom: 0; +} + +.ownershipTypeHeader { + display: flex; + align-items: center; + gap: 6px; + margin-bottom: 4px; +} + +.ownershipTypeIcon { + font-size: 12px; +} + +.ownershipTypeName { + font-size: 11px; + font-weight: 600; + color: var(--datahub-gray-1700); +} + +.ownersList { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-left: 18px; /* Indent under type header */ +} + +.owner { + background: var(--datahub-gray-100); + color: var(--datahub-gray-600); + padding: 2px 6px; + border-radius: 4px; + font-size: 11px; + font-weight: 500; +} + +.assertions { + display: flex; + align-items: center; + gap: 8px; + margin-top: 8px; +} + +.assertionsLabel { + font-size: 12px; + color: var(--datahub-gray-1700); + font-weight: 500; +} + +.assertionStatus { + display: flex; + align-items: center; + gap: 4px; + background: var(--datahub-gray-100); + padding: 2px 6px; + border-radius: 4px; + font-size: 11px; +} + +.assertionText { + font-weight: 500; + color: var(--datahub-gray-600); +} + +/* Responsive design */ +@media (max-width: 768px) { + .entityCard { + margin: 8px 0; + padding: 12px; + } + + .header { + flex-direction: column; + align-items: flex-start; + gap: 8px; + } + + .tags { + gap: 4px; + } +} \ No newline at end of file diff --git a/docs-website/src/components/DataHubLineageNode/index.jsx b/docs-website/src/components/DataHubLineageNode/index.jsx new file mode 100644 index 00000000000000..6fc22076f23203 --- /dev/null +++ b/docs-website/src/components/DataHubLineageNode/index.jsx @@ -0,0 +1,708 @@ +import React from 'react'; +import styles from './styles.module.css'; + +// Simplified version of DataHub's LineageEntityNode for tutorials +const DataHubLineageNode = ({ + name, + type = 'Dataset', + entityType = 'Dataset', // DataHub entity type (Dataset, DataJob, etc.) 
+ platform = 'Hive', + isSelected = false, + isCenter = false, + health = 'Good', + isExpanded = false, + columns = [], + tags = [], + glossaryTerms = [], + onClick, + onToggleExpand, + className = '', +}) => { + // Use actual DataHub platform logos from the docs website + const getPlatformLogo = (platformName) => { + const logoMap = { + // Analytics & BI + 'Looker': '/img/logos/platforms/looker.svg', + 'Tableau': '/img/logos/platforms/tableau.png', + 'PowerBI': '/img/logos/platforms/powerbi.png', + 'Metabase': '/img/logos/platforms/metabase.svg', + 'Superset': '/img/logos/platforms/superset.svg', + 'Mode': '/img/logos/platforms/mode.png', + 'Preset': '/img/logos/platforms/presetlogo.svg', + 'Sigma': '/img/logos/platforms/sigma.png', + 'Qlik': '/img/logos/platforms/qlik.png', + 'Redash': '/img/logos/platforms/redash.svg', + + // Cloud Data Warehouses + 'Snowflake': '/img/logos/platforms/snowflake.svg', + 'BigQuery': '/img/logos/platforms/bigquery.svg', + 'Redshift': '/img/logos/platforms/redshift.svg', + 'Databricks': '/img/logos/platforms/databricks.png', + 'Synapse': '/img/logos/platforms/mssql.svg', + + // Databases + 'PostgreSQL': '/img/logos/platforms/postgres.svg', + 'Postgres': '/img/logos/platforms/postgres.svg', + 'postgres': '/img/logos/platforms/postgres.svg', + 'MySQL': '/img/logos/platforms/mysql.svg', + 'Oracle': '/img/logos/platforms/oracle.svg', + 'SQL Server': '/img/logos/platforms/mssql.svg', + 'MongoDB': '/img/logos/platforms/mongodb.svg', + 'Cassandra': '/img/logos/platforms/cassandra.png', + 'Neo4j': '/img/logos/platforms/neo4j.png', + 'DynamoDB': '/img/logos/platforms/dynamodb.png', + 'ClickHouse': '/img/logos/platforms/clickhouse.svg', + 'CockroachDB': '/img/logos/platforms/cockroachdb.png', + 'MariaDB': '/img/logos/platforms/mariadb.png', + 'Teradata': '/img/logos/platforms/teradata.svg', + 'Vertica': '/img/logos/platforms/vertica.svg', + 'SAP HANA': '/img/logos/platforms/hana.svg', + 'Couchbase': '/img/logos/platforms/couchbase.svg', + + // Big Data & Processing + 'Hive': '/img/logos/platforms/hive.svg', + 'Spark': '/img/logos/platforms/spark.svg', + 'Hadoop': '/img/logos/platforms/hadoop.svg', + 'Kafka': '/img/logos/platforms/kafka.svg', + 'Pulsar': '/img/logos/platforms/pulsar.png', + 'Presto': '/img/logos/platforms/presto.svg', + 'Trino': '/img/logos/platforms/trino.png', + 'Druid': '/img/logos/platforms/druid.svg', + 'Pinot': '/img/logos/platforms/pinot.svg', + 'Kusto': '/img/logos/platforms/kusto.svg', + 'Iceberg': '/img/logos/platforms/iceberg.png', + 'Delta Lake': '/img/logos/platforms/deltalake.svg', + 'Hudi': '/img/logos/platforms/hudi.png', + + // Cloud Storage + 'S3': '/img/logos/platforms/s3.svg', + 'GCS': '/img/logos/platforms/gcs.svg', + 'ADLS': '/img/logos/platforms/adls.svg', + + // ETL & Orchestration + 'Airflow': '/img/logos/platforms/airflow.svg', + 'dbt': '/img/logos/platforms/dbt.svg', + 'Fivetran': '/img/logos/platforms/fivetran.png', + 'Dagster': '/img/logos/platforms/dagster.svg', + 'Prefect': '/img/logos/platforms/prefect.svg', + 'Snaplogic': '/img/logos/platforms/snaplogic.svg', + 'Nifi': '/img/logos/platforms/nifi.svg', + + // ML & AI + 'MLflow': '/img/logos/platforms/mlflow.svg', + 'SageMaker': '/img/logos/platforms/sagemaker.svg', + 'Vertex AI': '/img/logos/platforms/vertexai.png', + + // Cloud Platforms + 'AWS Athena': '/img/logos/platforms/athena.svg', + 'AWS Glue': '/img/logos/platforms/glue.svg', + 'Azure': '/img/logos/platforms/azure-ad.svg', + 'Elasticsearch': '/img/logos/platforms/elasticsearch.svg', + + // Data Quality & 
Governance + 'Great Expectations': '/img/logos/platforms/great-expectations.png', + 'Feast': '/img/logos/platforms/feast.svg', + 'Dremio': '/img/logos/platforms/dremio.png', + + // File Formats & Others + 'OpenAPI': '/img/logos/platforms/openapi.png', + 'Salesforce': '/img/logos/platforms/salesforce.png', + 'Okta': '/img/logos/platforms/okta.png', + 'SAC': '/img/logos/platforms/sac.svg', + 'Hex': '/img/logos/platforms/hex.png', + 'SQLAlchemy': '/img/logos/platforms/sqlalchemy.png', + 'Protobuf': '/img/logos/platforms/protobuf.png', + + // DataHub & Default + 'DataHub': '/img/logos/platforms/acryl.svg', + 'API': '/img/logos/platforms/acryl.svg', // Generic for API + 'Unknown': '/img/logos/platforms/acryl.svg', + }; + return logoMap[platformName] || '/img/logos/platforms/acryl.svg'; + }; + + const healthColors = { + 'Good': '#52c41a', + 'Warning': '#faad14', + 'Critical': '#ff4d4f', + }; + + // Health icon components matching DataHub's HealthIcon + const HealthIcon = ({ health, size = 16 }) => { + const iconStyle = { + width: `${size}px`, + height: `${size}px`, + display: 'inline-block', + }; + + if (health === 'Good') { + return ( + + + + ); + } + + if (health === 'Warning' || health === 'Critical') { + return ( + + + + ); + } + + return null; + }; + + // Column type icons matching DataHub's exact TypeIcon component + const getColumnTypeIcon = (columnType) => { + const iconStyle = { + width: '16px', + height: '16px', + display: 'flex', + alignItems: 'center', + justifyContent: 'center', + fontSize: '14px', + fontWeight: 'bold' + }; + + switch (columnType?.toLowerCase()) { + case 'string': + case 'varchar': + case 'text': + // String icon - A with underline (exactly like DataHub) + return ( +
+ A +
+ ); + case 'int': + case 'integer': + case 'bigint': + case 'number': + // Number icon - # symbol (exactly like DataHub) + return ( +
+ # +
+ ); + case 'date': + case 'datetime': + case 'timestamp': + // Calendar icon (simple calendar symbol) + return ( +
+ + + +
+ ); + case 'boolean': + case 'bool': + // Boolean icon - simple T/F + return ( +
+ T/F +
+ ); + case 'struct': + case 'object': + // Struct icon - curly brackets (exactly like DataHub) + return ( +
+ { } +
+ ); + case 'array': + case 'list': + // Array icon - square brackets + return ( +
+ [ ] +
+ ); + default: + // Question mark for unknown types + return ( +
+ ? +
+ ); + } + }; + + // Generate color hash for tags (matching DataHub's ColorHash) + const generateTagColor = (tagName) => { + // Simple hash function to generate consistent colors + let hash = 0; + for (let i = 0; i < tagName.length; i++) { + const char = tagName.charCodeAt(i); + hash = ((hash << 5) - hash) + char; + hash = hash & hash; // Convert to 32bit integer + } + + // Convert to HSL with high saturation for vibrant colors + const hue = Math.abs(hash) % 360; + return `hsl(${hue}, 70%, 45%)`; + }; + + // Generate color for glossary terms (matching DataHub's glossary colors) + const generateTermColor = (termName) => { + const colors = [ + '#1890ff', '#52c41a', '#faad14', '#f5222d', '#722ed1', + '#fa541c', '#13c2c2', '#eb2f96', '#a0d911', '#fadb14' + ]; + let hash = 0; + for (let i = 0; i < termName.length; i++) { + hash = ((hash << 5) - hash) + termName.charCodeAt(i); + } + return colors[Math.abs(hash) % colors.length]; + }; + + // Tag component matching DataHub's StyledTag + const DataHubTag = ({ tag }) => ( +
+
+ {tag} +
+ ); + + // Glossary term component matching DataHub's Term + const DataHubTerm = ({ term }) => ( +
+
+ {term} +
+ ); + + // Tags and terms group component + const TagTermGroup = ({ tags, glossaryTerms, maxShow = 3 }) => { + const allItems = [ + ...tags.map(tag => ({ type: 'tag', value: tag })), + ...glossaryTerms.map(term => ({ type: 'term', value: term })) + ]; + + const visibleItems = allItems.slice(0, maxShow); + const remainingCount = allItems.length - maxShow; + + return ( +
+ {visibleItems.map((item, index) => ( + item.type === 'tag' ? + : + + ))} + {remainingCount > 0 && ( +
+{remainingCount}
+ )} +
+ ); + }; + + // Determine if this is a transformation node (DataJob, Query, etc.) + const isTransformationNode = entityType === 'DataJob' || entityType === 'Query' || entityType === 'DataProcessInstance'; + + const nodeClasses = [ + isTransformationNode ? styles.transformationNode : styles.lineageNode, + isSelected && styles.selected, + isCenter && styles.center, + className + ].filter(Boolean).join(' '); + + // Render transformation node (circular, smaller) + if (isTransformationNode) { + return ( +
+
+ {`${platform} +
+
+ +
+
+ ); + } + + // Render entity node (rectangular, larger) + return ( +
+ {/* Main card content - matches DataHub's CardWrapper structure */} +
+
+
+ {`${platform} + {type} +
+
+ {columns.length > 0 && ( + + )} +
+
+ +
+
+
{name}
+
+ +
+
+
{platform}
+ {(tags.length > 0 || glossaryTerms.length > 0) && ( + + )} +
+
+ + {/* Expandable columns section */} + {isExpanded && columns.length > 0 && ( +
+
+ Columns ({columns.length}) +
+
+ {columns.map((column, index) => ( +
+ {/* Left handle for incoming connections */} +
+
+ {getColumnTypeIcon(column.type)} +
+
+ {column.name} + {column.type} +
+ {column.hasLineage && ( +
+ → +
+ )} + {/* Right handle for outgoing connections */} +
+
+ ))} +
+
+ )} +
+ ); +}; + +// Component for showing lineage connections with interactive expansion and column-level lineage +export const DataHubLineageFlow = ({ nodes, title, className = '', showColumnLineage = false }) => { + const [expandedNodes, setExpandedNodes] = React.useState(new Set()); + + const toggleNodeExpansion = (nodeId) => { + setExpandedNodes(prev => { + const newSet = new Set(prev); + if (newSet.has(nodeId)) { + newSet.delete(nodeId); + } else { + newSet.add(nodeId); + } + return newSet; + }); + }; + + // Column lineage mappings - shows which columns connect between nodes + const getColumnLineage = (sourceNodeIndex, targetNodeIndex) => { + // Only show column lineage when going from DataJob to Dataset (after transformation) + if (sourceNodeIndex === 1 && targetNodeIndex === 2) { + // DataJob -> fct_users_created (this represents the transformation from user_events through the ETL job) + return [ + { source: 'user_id', target: 'user_id' }, + { source: 'timestamp', target: 'created_date' }, + { source: 'event_type', target: 'signup_source' }, + ]; + } + return []; + }; + + const allNodesExpanded = nodes.every(node => expandedNodes.has(node.id)); + const shouldShowColumnConnections = false; // Disabled for now + + return ( +
+ {title &&

{title}

} +
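+      {/* Descriptive note (added for clarity): each entry in `nodes` renders through
+          DataHubLineageNode below; clicking a node calls toggleNodeExpansion via
+          onToggleExpand to show/hide its column list, and a pulsing arrow is rendered
+          between consecutive nodes to suggest left-to-right data flow. Column-level SVG
+          connections exist in ColumnLineageConnections but appear to be disabled here
+          (shouldShowColumnConnections = false). */}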
+ {nodes.map((node, index) => ( + + toggleNodeExpansion(node.id)} + /> + {index < nodes.length - 1 && ( +
+
+
+ )} +
+ ))} +
+
+ ); +}; + +// Component for showing column-level lineage connections +const ColumnLineageConnections = ({ sourceNode, targetNode, connections, hasDataJob = false }) => { + if (!connections.length) return null; + + return ( +
+ + {connections.map((connection, index) => { + // When hasDataJob is true, the sourceNode is the DataJob and we need to show + // connections from the previous dataset through the DataJob to the target + let sourceY, targetY; + + if (hasDataJob) { + // Source is DataJob, target is Dataset - show transformation output + targetY = 50 + (targetNode.columns?.findIndex(col => col.name === connection.target) || 0) * 36; + // For DataJob source, we'll position the connection at the center + sourceY = 125; // Center of the DataJob + } else { + // Normal dataset to dataset connection + sourceY = 50 + (sourceNode.columns?.findIndex(col => col.name === connection.source) || 0) * 36; + targetY = 50 + (targetNode.columns?.findIndex(col => col.name === connection.target) || 0) * 36; + } + + return ( + + {/* Connection line */} + + {/* Connection points */} + + + + {/* Label showing the transformation */} + + {connection.source} → {connection.target} + + + ); + })} + {/* Arrow marker definition */} + + + + + + +
+ ); +}; + +// Sample column data for datasets +export const SampleColumns = { + userEvents: [ + { name: 'user_id', type: 'bigint', hasLineage: true }, + { name: 'event_type', type: 'string', hasLineage: false }, + { name: 'timestamp', type: 'timestamp', hasLineage: true }, + { name: 'properties', type: 'struct', hasLineage: false }, + ], + userCreated: [ + { name: 'user_id', type: 'bigint', hasLineage: true }, + { name: 'created_date', type: 'date', hasLineage: true }, + { name: 'signup_source', type: 'string', hasLineage: true }, + { name: 'user_email', type: 'string', hasLineage: false }, + { name: 'user_name', type: 'string', hasLineage: false }, + ], + rawUserData: [ + { name: 'id', type: 'bigint', hasLineage: true }, + { name: 'email', type: 'string', hasLineage: true }, + { name: 'name', type: 'string', hasLineage: true }, + { name: 'created_at', type: 'timestamp', hasLineage: true }, + { name: 'metadata', type: 'struct', hasLineage: false }, + { name: 'is_active', type: 'boolean', hasLineage: false }, + ], +}; + +// Pre-configured sample lineage flows for tutorials +export const SampleLineageFlows = { + userMetricsFlow: { + title: 'User Metrics Data Pipeline', + nodes: [ + { + id: 'source', + name: 'user_events_stream', + type: 'Topic', + entityType: 'Dataset', + platform: 'Kafka', + health: 'Good', + columns: SampleColumns.userEvents, + tags: ['Streaming', 'Real-time'], + glossaryTerms: ['User Activity', 'Event Data'], + }, + { + id: 'etl', + name: 'user_transformation_job', + type: 'ETL Job', + entityType: 'DataJob', + platform: 'Databricks', + health: 'Good', + }, + { + id: 'target', + name: 'user_metrics_fact', + type: 'Table', + entityType: 'Dataset', + platform: 'Snowflake', + health: 'Good', + isCenter: true, + columns: SampleColumns.userCreated, + tags: ['PII', 'User Analytics', 'Daily'], + glossaryTerms: ['User Metrics', 'Fact Table'], + }, + ], + }, + + troubleshootingFlow: { + title: 'Data Quality Investigation Pipeline', + nodes: [ + { + id: 'source', + name: 'customer_transactions', + type: 'Dataset', + entityType: 'Dataset', + platform: 'PostgreSQL', + health: 'Warning', + columns: SampleColumns.rawUserData, + tags: ['Raw', 'PII', 'Hourly'], + glossaryTerms: ['Source Data', 'Customer Information'], + }, + { + id: 'ingestion', + name: 'fivetran_sync_job', + type: 'Ingestion Job', + entityType: 'DataJob', + platform: 'Fivetran', + health: 'Good', + }, + { + id: 'validation', + name: 'dbt_quality_checks', + type: 'Validation Job', + entityType: 'DataJob', + platform: 'dbt', + health: 'Critical', + }, + { + id: 'target', + name: 'validated_transactions', + type: 'Table', + entityType: 'Dataset', + platform: 'BigQuery', + health: 'Good', + isSelected: true, + columns: SampleColumns.userCreated, // Same schema after cleaning + tags: ['Validated', 'Clean', 'Production'], + glossaryTerms: ['Processed Data', 'Transaction Data'], + }, + ], + }, + + qualityMonitoringFlow: { + title: 'Quality Monitoring Data Pipeline', + nodes: [ + { + id: 'source', + name: 'raw_transactions', + type: 'Table', + entityType: 'Dataset', + platform: 'PostgreSQL', + health: 'Warning', + columns: SampleColumns.rawUserData, + tags: ['Raw', 'Unvalidated'], + glossaryTerms: ['Raw Data', 'Transaction Source'], + }, + { + id: 'quality', + name: 'quality_validation_job', + type: 'Quality Job', + entityType: 'DataJob', + platform: 'DataHub', + health: 'Good', + }, + { + id: 'target', + name: 'validated_transactions', + type: 'Table', + entityType: 'Dataset', + platform: 'Snowflake', + health: 'Good', + 
columns: SampleColumns.userCreated,
+        tags: ['Validated', 'Quality-Assured', 'Production'],
+        glossaryTerms: ['Validated Data', 'Quality Metrics'],
+      },
+    ],
+  },
+};
+
+
+export default DataHubLineageNode;
diff --git a/docs-website/src/components/DataHubLineageNode/styles.module.css b/docs-website/src/components/DataHubLineageNode/styles.module.css
new file mode 100644
index 00000000000000..645482e7218046
--- /dev/null
+++ b/docs-website/src/components/DataHubLineageNode/styles.module.css
@@ -0,0 +1,864 @@
+/* Auto-generated DataHub Design Tokens */
+:root {
+  --datahub-primary: #533FD1;
+  --datahub-primary-dark: #4C39BE;
+  --datahub-primary-light: #7565DA;
+  --datahub-primary-lightest: #F1F3FD;
+  --datahub-gray-100: #EBECF0;
+  --datahub-gray-600: #374066;
+  --datahub-gray-1700: #5F6685;
+  --datahub-gray-1800: #8088A3;
+  --datahub-gray-1500: #F9FAFC;
+  --datahub-white: #FFFFFF;
+  --datahub-success: #77B750;
+  --datahub-warning: #EEAE09;
+  --datahub-error: #CD0D24;
+  --datahub-border: #E9EAEE;
+  --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07);
+  --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15);
+  --datahub-node-width: 320px;
+  --datahub-node-height: 90px;
+  --datahub-transformation-size: 40px;
+}
+
+/* Dark mode colors */
+[data-theme='dark'] {
+  --datahub-primary: #7565DA;
+  --datahub-primary-dark: #533FD1;
+  --datahub-primary-light: #8C7EE0;
+  --datahub-primary-lightest: #2E2373;
+  --datahub-gray-100: #2F3657;
+  --datahub-gray-600: #CFD1DA;
+  --datahub-gray-1700: #A9ADBD;
+  --datahub-gray-1800: #81879F;
+  --datahub-gray-1500: #1E2338;
+  --datahub-white: #272D48;
+  --datahub-border: #374066;
+}
+
+/* DataHub Lineage Node - Uses actual DataHub design tokens */
+
+/* Import DataHub color variables */
+:root {
+  /* DataHub Alchemy Design System Colors */
+  --datahub-primary: #533FD1;
+  --datahub-primary-dark: #4C39BE;
+  --datahub-primary-light: #7565DA;
+  --datahub-primary-lightest: #F1F3FD;
+  --datahub-gray-100: #EBECF0;
+  --datahub-gray-600: #374066;
+  --datahub-gray-1700: #5F6685;
+  --datahub-gray-1800: #8088A3;
+  --datahub-gray-1500: #F9FAFC;
+  --datahub-white: #FFFFFF;
+  --datahub-success: #77B750;
+  --datahub-warning: #EEAE09;
+  --datahub-error: #CD0D24;
+  --datahub-border: #E9EAEE;
+  --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07);
+  --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15);
+  --datahub-node-width: 280px; /* Reduced width to prevent horizontal scroll */
+  --datahub-node-height: 90px; /* Match LINEAGE_NODE_HEIGHT */
+  --datahub-transformation-size: 40px; /* Match TRANSFORMATION_NODE_SIZE */
+}
+
+.lineageNode {
+  background: var(--datahub-white);
+  border: 1px solid var(--datahub-border);
+  border-radius: 12px; /* Match DataHub V3 styling */
+  width: var(--datahub-node-width);
+  min-height: var(--datahub-node-height); /* Use min-height to allow expansion */
+  max-width: var(--datahub-node-width); /* Prevent overflow */
+  flex-shrink: 0; /* Prevent nodes from shrinking in flex container */
+  cursor: pointer;
+  transition: all 0.2s ease;
+  box-shadow: var(--datahub-shadow);
+  position: relative;
+  display: flex;
+  align-items: stretch; /* Allow content to stretch */
+  flex-direction: column;
+  padding: 0; /* Remove padding to match DataHub structure */
+  overflow: hidden; /* Prevent content overflow */
+}
+
+.lineageNode:hover {
+  border-color: var(--datahub-primary);
+  box-shadow: var(--datahub-shadow-hover);
+  transform:
translateY(-1px); +} + +.lineageNode.selected { + border-color: var(--datahub-primary); + border-width: 2px; + box-shadow: 0 4px 12px rgba(83, 63, 209, 0.25); +} + +.lineageNode.center { + border-width: 2px; + border-color: var(--datahub-primary); + background: var(--datahub-primary-lightest); +} + +/* Transformation Node (Data Jobs, Queries) - Small Square */ +.transformationNode { + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 4px; /* Small rounded corners for square transformation nodes */ + width: var(--datahub-transformation-size); + height: var(--datahub-transformation-size); + cursor: pointer; + transition: all 0.2s ease; + box-shadow: var(--datahub-shadow); + position: relative; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; + align-self: center; /* Center within flow */ +} + +.transformationNode:hover { + border-color: var(--datahub-primary); + box-shadow: var(--datahub-shadow-hover); + transform: translateY(-1px); +} + +.transformationNode.selected { + border-color: var(--datahub-primary); + border-width: 2px; + box-shadow: 0 4px 12px rgba(83, 63, 209, 0.25); +} + +.transformationNode.center { + border-width: 2px; + border-color: var(--datahub-primary); + background: var(--datahub-primary-lightest); +} + +.transformationIcon { + display: flex; + align-items: center; + justify-content: center; + font-size: 18px; +} + +.transformationLogo { + width: 18px; + height: 18px; + object-fit: contain; +} + +.transformationHealthIcon { + position: absolute; + top: -6px; + right: -6px; + display: flex; + align-items: center; + justify-content: center; + background: var(--datahub-white); + border-radius: 50%; + padding: 2px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); +} + +/* Card wrapper - matches DataHub's CardWrapper */ +.cardWrapper { + display: flex; + flex-direction: column; + align-items: flex-start; + min-height: var(--datahub-node-height); /* Use min-height instead of height */ + width: 100%; + padding: 12px 16px; + box-sizing: border-box; + flex: 1; /* Allow it to grow */ +} + +.nodeHeader { + display: flex; + align-items: center; + justify-content: space-between; + width: 100%; + margin-bottom: 8px; + font-size: 12px; +} + +.headerActions { + display: flex; + align-items: center; + gap: 8px; +} + +.expandButton { + background: var(--datahub-gray-100); + border: 1px solid var(--datahub-border); + border-radius: 4px; + width: 20px; + height: 20px; + display: flex; + align-items: center; + justify-content: center; + cursor: pointer; + font-size: 12px; + font-weight: bold; + color: var(--datahub-gray-600); + transition: all 0.2s ease; +} + +.expandButton:hover { + background: var(--datahub-primary-lightest); + border-color: var(--datahub-primary); + color: var(--datahub-primary); +} + +.platformInfo { + display: flex; + align-items: center; + gap: 6px; +} + +.platformLogo { + width: 16px; + height: 16px; + object-fit: contain; +} + +.type { + font-weight: 500; + color: var(--datahub-gray-1700); + font-size: 12px; +} + +.healthIcon { + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; + flex-grow: 0; + width: 16px; + height: 16px; +} + +.nodeContent { + display: flex; + flex-direction: column; + width: 100%; + flex: 1; +} + +.nameWithHealth { + display: flex; + align-items: center; + gap: 6px; + margin-bottom: 4px; + width: 100%; +} + +.nodeName { + font-size: 16px; + font-weight: 600; + color: var(--datahub-gray-600); + line-height: 1.2; + word-break: break-word; + 
overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + flex: 0 1 auto; /* Don't grow, but allow shrinking */ + min-width: 0; /* Allow flex item to shrink below content size */ + max-width: calc(100% - 22px); /* Reserve space for health icon (16px + 6px gap) */ +} + +.platform { + font-size: 12px; + color: var(--datahub-gray-1700); + text-transform: uppercase; + letter-spacing: 0.5px; + font-weight: 500; +} + +/* Expandable Columns Section */ +.columnsWrapper { + border-top: 1px solid var(--datahub-border); + background: var(--datahub-gray-1500); + border-radius: 0 0 12px 12px; + overflow: hidden; + width: 100%; + margin-top: 4px; /* Reduced spacing */ +} + +.columnsHeader { + padding: 12px 16px; + border-bottom: 2px solid var(--datahub-border); + background: var(--datahub-white); + min-height: 20px; /* Ensure visibility */ +} + +.columnsTitle { + font-size: 12px; + font-weight: 700; + color: var(--datahub-gray-600); /* Darker for better visibility */ + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.columnsList { + max-height: 200px; + overflow-y: auto; + background: var(--datahub-gray-1500); +} + +.columnItem { + display: flex; + align-items: center; + padding: 4px 8px; + border-bottom: 1px solid var(--datahub-border); + transition: background-color 0.2s ease; + background: var(--datahub-gray-1500); + min-height: 32px; /* Reduced height for more compact layout */ + position: relative; /* For handle positioning */ +} + +.columnHandle { + position: absolute; + width: 8px; + height: 8px; + background: var(--datahub-primary); + border: 2px solid var(--datahub-white); + border-radius: 50%; + opacity: 0; + transition: opacity 0.2s ease; + z-index: 3; +} + +.columnHandle[data-position="left"] { + left: -4px; + top: 50%; + transform: translateY(-50%); +} + +.columnHandle[data-position="right"] { + right: -4px; + top: 50%; + transform: translateY(-50%); +} + +.columnItem:hover .columnHandle { + opacity: 1; +} + +.columnItem:hover { + background: var(--datahub-white); +} + +.columnItem:last-child { + border-bottom: none; +} + +.columnIcon { + width: 32px; + height: 24px; + display: flex; + align-items: center; + justify-content: center; + margin-right: 6px; + background: var(--datahub-white); + border-radius: 4px; + border: 1px solid var(--datahub-border); + flex-shrink: 0; + min-width: 32px; /* Prevent shrinking and provide adequate space */ +} + +.columnInfo { + flex: 1; + display: flex; + flex-direction: column; + min-width: 0; /* Allow shrinking */ + overflow: hidden; +} + +.columnName { + font-size: 11px; + font-weight: 600; /* Slightly bolder for better visibility */ + color: var(--datahub-gray-600); + line-height: 1.2; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + margin-bottom: 1px; + max-width: 100%; +} + +.columnType { + font-size: 9px; + color: var(--datahub-gray-1700); + text-transform: uppercase; + letter-spacing: 0.2px; + line-height: 1; + font-weight: 500; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.lineageIndicator { + color: var(--datahub-primary); + font-weight: bold; + font-size: 12px; + margin-left: 8px; + opacity: 0.7; +} + +/* Tags and Glossary Terms */ +.tagTermGroup { + display: flex; + flex-wrap: wrap; + gap: 4px; + margin-top: 4px; + align-items: center; +} + +.tag { + display: flex; + align-items: center; + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 4px; + padding: 2px 6px; + font-size: 11px; + color: var(--datahub-gray-600); + max-width: 
120px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.tagColorDot { + width: 6px; + height: 6px; + border-radius: 50%; + margin-right: 4px; + flex-shrink: 0; +} + +.tagText { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.term { + position: relative; + display: flex; + align-items: center; + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 4px; + padding: 2px 6px; + font-size: 11px; + color: var(--datahub-gray-600); + max-width: 120px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + padding-left: 12px; /* Make room for ribbon */ +} + +.termRibbon { + position: absolute; + left: -8px; + top: 2px; + width: 20px; + height: 100%; + transform: rotate(-45deg); + transform-origin: center; + opacity: 0.8; + border-radius: 2px; +} + +.termText { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + position: relative; + z-index: 1; +} + +.moreCount { + font-size: 10px; + color: var(--datahub-gray-1700); + background: var(--datahub-gray-100); + border-radius: 3px; + padding: 1px 4px; + font-weight: 500; +} + +/* Lineage Flow Container */ +.lineageFlow { + margin: 20px 0; + padding: 16px; + background: var(--datahub-gray-1500); + border-radius: 12px; + border: 1px solid var(--datahub-border); +} + +.flowTitle { + margin: 0 0 16px 0; + font-size: 16px; + font-weight: 600; + color: var(--datahub-gray-600); + text-align: center; +} + +.flowContainer { + display: flex; + align-items: flex-start; + justify-content: center; + gap: 16px; + flex-wrap: wrap; + max-width: 100%; + overflow-x: auto; + padding: 10px 0; + box-sizing: border-box; +} + +/* Specific handling for flows with many nodes (5+) */ +.flowContainer:has(.lineageNode:nth-child(5)) { + justify-content: flex-start; + flex-wrap: nowrap; +} + +/* Fallback for browsers that don't support :has() */ +.flowContainer[data-node-count="5"], +.flowContainer[data-node-count="6"], +.flowContainer[data-node-count="7"] { + justify-content: flex-start; + flex-wrap: nowrap; + gap: 16px; /* Slightly smaller gap for many nodes */ +} + +/* Add scroll hint for flows with many nodes */ +.flowContainer[data-node-count="5"]::after, +.flowContainer[data-node-count="6"]::after, +.flowContainer[data-node-count="7"]::after { + content: "← Scroll horizontally to see all nodes →"; + position: absolute; + bottom: -25px; + left: 50%; + transform: translateX(-50%); + font-size: 12px; + color: var(--datahub-gray-500); + white-space: nowrap; + pointer-events: none; +} + +.lineageFlow { + position: relative; + padding-bottom: 30px; /* Make room for scroll hint */ +} + +.flowConnection { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; /* Always center vertically */ + position: relative; + min-height: 60px; + flex: 0 0 auto; /* Don't grow or shrink, maintain fixed size */ + align-self: center; /* Center within the parent flex container */ +} + +.flowArrow { + font-size: 24px; + color: var(--datahub-primary); + font-weight: bold; + z-index: 2; + position: relative; + animation: pulse 2s infinite; + display: block; + margin: 0 auto; +} + +.flowConnection:hover .flowArrow { + color: var(--datahub-primary-dark); + transform: scale(1.1); +} + +@keyframes pulse { + 0%, 100% { + opacity: 1; + transform: scale(1); + } + 50% { + opacity: 0.7; + transform: scale(1.1); + } +} + +/* Column-level lineage connections */ +.columnConnections { + position: relative; + width: 100%; + height: auto; + min-height: 
250px; + pointer-events: none; + z-index: 10; + display: flex; + align-items: center; + justify-content: center; +} + +.connectionSvg { + width: 100%; + height: 250px; + max-width: 400px; +} + +.connectionLabel { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + font-size: 10px; + font-weight: 500; +} + +.lineageHint { + text-align: center; + margin-top: 16px; + padding: 8px 16px; + background: var(--datahub-primary-lightest); + border: 1px solid var(--datahub-primary); + border-radius: 6px; + color: var(--datahub-primary-dark); + font-size: 14px; + font-weight: 500; +} + +/* Interactive states */ +.lineageNode:focus { + outline: 2px solid var(--datahub-primary); + outline-offset: 2px; +} + +.lineageNode:active { + transform: translateY(0); + box-shadow: var(--datahub-shadow); +} + +/* Responsive design */ +@media (min-width: 769px) { + .flowContainer { + /* For wide screens with many nodes, allow horizontal scrolling */ + flex-wrap: nowrap; + justify-content: flex-start; + overflow-x: auto; + padding: 10px; + } + + .flowContainer::-webkit-scrollbar { + height: 6px; + } + + .flowContainer::-webkit-scrollbar-track { + background: var(--datahub-gray-100); + border-radius: 3px; + } + + .flowContainer::-webkit-scrollbar-thumb { + background: var(--datahub-gray-300); + border-radius: 3px; + } + + .flowContainer::-webkit-scrollbar-thumb:hover { + background: var(--datahub-gray-400); + } +} + +@media (max-width: 768px) { + .flowContainer { + flex-direction: column; + gap: 8px; + overflow-x: visible; + } + + .flowArrow { + transform: rotate(90deg); + font-size: 16px; + } + + .lineageNode { + width: 100%; + max-width: 280px; + min-width: 260px; + } + + .cardWrapper { + padding: 10px 12px; + } + + .nodeName { + font-size: 14px; + } + + .nameWithHealth { + gap: 5px; + } + + .nodeName { + max-width: calc(100% - 21px); /* Adjust for smaller gap */ + } +} + +@media (max-width: 480px) { + .lineageNode { + min-width: 250px; + height: auto; + min-height: var(--datahub-node-height); + } + + .nodeName { + white-space: normal; + overflow: visible; + text-overflow: unset; + } + + .nameWithHealth { + gap: 4px; + flex-wrap: wrap; + } + + .nodeName { + max-width: calc(100% - 20px); /* Adjust for smallest gap */ + } +} \ No newline at end of file diff --git a/docs-website/src/components/InteractiveDiagram/index.jsx b/docs-website/src/components/InteractiveDiagram/index.jsx new file mode 100644 index 00000000000000..f1d5136de5a748 --- /dev/null +++ b/docs-website/src/components/InteractiveDiagram/index.jsx @@ -0,0 +1,195 @@ +import React, { useCallback } from 'react'; +import ReactFlow, { + MiniMap, + Controls, + Background, + useNodesState, + useEdgesState, + addEdge, +} from 'reactflow'; +import 'reactflow/dist/style.css'; +import styles from './styles.module.css'; + +const InteractiveDiagram = ({ + nodes: initialNodes = [], + edges: initialEdges = [], + title, + height = '400px', + showMiniMap = true, + showControls = true, + showBackground = true, + backgroundType = 'dots' +}) => { + const [nodes, setNodes, onNodesChange] = useNodesState(initialNodes); + const [edges, setEdges, onEdgesChange] = useEdgesState(initialEdges); + + const onConnect = useCallback( + (params) => setEdges((eds) => addEdge(params, eds)), + [setEdges], + ); + + return ( +
+ {title &&

{title}

} +
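+      {/* Descriptive note (added for clarity): ReactFlow handles pan/zoom and node dragging
+          for the nodes/edges passed in above. Connections drawn by the user are appended via
+          onConnect -> addEdge (from the reactflow package), and Controls, MiniMap, and
+          Background are optional overlays toggled by the showControls / showMiniMap /
+          showBackground props. */}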
+ + {showControls && } + {showMiniMap && ( + + )} + {showBackground && ( + + )} + +
+
+ ); +}; + +// Pre-defined diagram configurations for common DataHub workflows +export const DataHubWorkflows = { + ingestionFlow: { + nodes: [ + { + id: '1', + type: 'input', + data: { label: '🗄️ Data Sources\n(Kafka, Hive, HDFS)' }, + position: { x: 0, y: 0 }, + className: 'source-node', + }, + { + id: '2', + data: { label: '⚙️ DataHub Ingestion\nExtract Metadata' }, + position: { x: 200, y: 0 }, + className: 'process-node', + }, + { + id: '3', + data: { label: '📊 Metadata Storage\nElasticsearch + MySQL' }, + position: { x: 400, y: 0 }, + className: 'storage-node', + }, + { + id: '4', + type: 'output', + data: { label: '🔍 DataHub UI\nDiscovery & Lineage' }, + position: { x: 600, y: 0 }, + className: 'output-node', + }, + ], + edges: [ + { id: 'e1-2', source: '1', target: '2', animated: true, label: 'metadata' }, + { id: 'e2-3', source: '2', target: '3', animated: true, label: 'store' }, + { id: 'e3-4', source: '3', target: '4', animated: true, label: 'serve' }, + ], + }, + + discoveryFlow: { + nodes: [ + { + id: '1', + type: 'input', + data: { label: '👤 Data Analyst\nNeeds user metrics' }, + position: { x: 0, y: 100 }, + className: 'user-node', + }, + { + id: '2', + data: { label: '🔍 Search DataHub\n"user created deleted"' }, + position: { x: 200, y: 0 }, + className: 'search-node', + }, + { + id: '3', + data: { label: '📋 Browse Results\nfct_users_created' }, + position: { x: 200, y: 100 }, + className: 'browse-node', + }, + { + id: '4', + data: { label: '📊 Examine Schema\nColumns & Types' }, + position: { x: 200, y: 200 }, + className: 'schema-node', + }, + { + id: '5', + type: 'output', + data: { label: '✅ Found Data\nReady for Analysis' }, + position: { x: 400, y: 100 }, + className: 'success-node', + }, + ], + edges: [ + { id: 'e1-2', source: '1', target: '2', label: 'search' }, + { id: 'e1-3', source: '1', target: '3', label: 'browse' }, + { id: 'e1-4', source: '1', target: '4', label: 'explore' }, + { id: 'e2-5', source: '2', target: '5' }, + { id: 'e3-5', source: '3', target: '5' }, + { id: 'e4-5', source: '4', target: '5' }, + ], + }, + + lineageFlow: { + nodes: [ + { + id: '1', + data: { label: '📥 Raw Events\nKafka Stream' }, + position: { x: 0, y: 0 }, + className: 'source-node', + }, + { + id: '2', + data: { label: '⚙️ ETL Process\nSpark Job' }, + position: { x: 200, y: 0 }, + className: 'process-node', + }, + { + id: '3', + data: { label: '🗄️ Analytics Table\nfct_users_created' }, + position: { x: 400, y: 0 }, + className: 'table-node', + }, + { + id: '4', + data: { label: '📊 Dashboard\nUser Metrics' }, + position: { x: 600, y: 0 }, + className: 'output-node', + }, + { + id: '5', + data: { label: '🔧 Data Quality\nValidation Rules' }, + position: { x: 200, y: 100 }, + className: 'quality-node', + }, + ], + edges: [ + { id: 'e1-2', source: '1', target: '2', animated: true, label: 'raw data' }, + { id: 'e2-3', source: '2', target: '3', animated: true, label: 'processed' }, + { id: 'e3-4', source: '3', target: '4', animated: true, label: 'visualize' }, + { id: 'e2-5', source: '2', target: '5', label: 'validate' }, + { id: 'e5-3', source: '5', target: '3', label: 'quality check' }, + ], + }, +}; + +export default InteractiveDiagram; diff --git a/docs-website/src/components/InteractiveDiagram/styles.module.css b/docs-website/src/components/InteractiveDiagram/styles.module.css new file mode 100644 index 00000000000000..0b3a140338ea11 --- /dev/null +++ b/docs-website/src/components/InteractiveDiagram/styles.module.css @@ -0,0 +1,222 @@ +/* Interactive Diagram Styling for DataHub */ + 
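+/*
+  Note (added for clarity): the :global(...) node selectors further down target the
+  className values assigned in DataHubWorkflows (e.g. 'source-node', 'process-node',
+  'storage-node', 'output-node'). React Flow applies a node's className to its rendered
+  wrapper element, so plain global selectors are sufficient here; remaining colors come
+  from Docusaurus Infima CSS variables (--ifm-*), so they adapt to light/dark themes.
+*/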
+.diagramContainer { + margin: 24px 0; + border-radius: 12px; + overflow: hidden; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1); + border: 1px solid var(--ifm-color-emphasis-200); + background: var(--ifm-background-color); +} + +.diagramTitle { + margin: 0; + padding: 16px 20px; + background: var(--ifm-color-primary-lightest); + color: var(--ifm-color-primary-darkest); + font-weight: 600; + font-size: 16px; + border-bottom: 1px solid var(--ifm-color-emphasis-200); +} + +.reactFlowWrapper { + position: relative; + background: var(--ifm-background-color); +} + +.reactFlow { + background: var(--ifm-background-color); +} + +/* Node Styling */ +.reactFlow :global(.react-flow__node) { + font-family: var(--ifm-font-family-base); + font-size: 12px; + font-weight: 500; + border-radius: 8px; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); + border: 2px solid var(--ifm-color-emphasis-300); + background: var(--ifm-background-color); + color: var(--ifm-color-content); + padding: 8px 12px; + min-width: 120px; + text-align: center; + transition: all 0.2s ease; +} + +.reactFlow :global(.react-flow__node:hover) { + transform: translateY(-2px); + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15); +} + +.reactFlow :global(.react-flow__node.selected) { + border-color: var(--ifm-color-primary); + box-shadow: 0 0 0 2px var(--ifm-color-primary-lightest); +} + +/* Specific Node Types */ +.reactFlow :global(.source-node) { + background: var(--ifm-color-success-lightest); + border-color: var(--ifm-color-success); + color: var(--ifm-color-success-darkest); +} + +.reactFlow :global(.process-node) { + background: var(--ifm-color-info-lightest); + border-color: var(--ifm-color-info); + color: var(--ifm-color-info-darkest); +} + +.reactFlow :global(.storage-node) { + background: var(--ifm-color-warning-lightest); + border-color: var(--ifm-color-warning); + color: var(--ifm-color-warning-darkest); +} + +.reactFlow :global(.output-node) { + background: var(--ifm-color-primary-lightest); + border-color: var(--ifm-color-primary); + color: var(--ifm-color-primary-darkest); +} + +.reactFlow :global(.user-node) { + background: #f0f9ff; + border-color: #0ea5e9; + color: #0c4a6e; +} + +.reactFlow :global(.search-node) { + background: #fef3c7; + border-color: #f59e0b; + color: #92400e; +} + +.reactFlow :global(.browse-node) { + background: #ecfdf5; + border-color: #10b981; + color: #065f46; +} + +.reactFlow :global(.schema-node) { + background: #f3e8ff; + border-color: #8b5cf6; + color: #581c87; +} + +.reactFlow :global(.success-node) { + background: var(--ifm-color-success-lightest); + border-color: var(--ifm-color-success); + color: var(--ifm-color-success-darkest); +} + +.reactFlow :global(.table-node) { + background: #fdf2f8; + border-color: #ec4899; + color: #9d174d; +} + +.reactFlow :global(.quality-node) { + background: #fff7ed; + border-color: #f97316; + color: #9a3412; +} + +/* Edge Styling */ +.reactFlow :global(.react-flow__edge-path) { + stroke: var(--ifm-color-primary); + stroke-width: 2px; +} + +.reactFlow :global(.react-flow__edge.animated path) { + stroke-dasharray: 5; + animation: dashdraw 0.5s linear infinite; +} + +.reactFlow :global(.react-flow__edge-text) { + font-family: var(--ifm-font-family-base); + font-size: 11px; + font-weight: 500; + fill: var(--ifm-color-content); + background: var(--ifm-background-color); + padding: 2px 4px; + border-radius: 4px; +} + +/* Controls Styling */ +.controls { + background: var(--ifm-background-color); + border: 1px solid var(--ifm-color-emphasis-200); + border-radius: 8px; + 
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); +} + +.controls :global(.react-flow__controls-button) { + background: var(--ifm-background-color); + border-color: var(--ifm-color-emphasis-200); + color: var(--ifm-color-content); + transition: all 0.2s ease; +} + +.controls :global(.react-flow__controls-button:hover) { + background: var(--ifm-color-emphasis-100); + border-color: var(--ifm-color-primary); +} + +/* MiniMap Styling */ +.miniMap { + background: var(--ifm-color-emphasis-100); + border: 1px solid var(--ifm-color-emphasis-200); + border-radius: 8px; + overflow: hidden; +} + +/* Background Styling */ +.background :global(.react-flow__background) { + background-color: var(--ifm-background-color); +} + +/* Dark Mode Adjustments */ +[data-theme='dark'] .diagramContainer { + border-color: var(--ifm-color-emphasis-300); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3); +} + +[data-theme='dark'] .diagramTitle { + background: var(--ifm-color-emphasis-200); + color: var(--ifm-color-content); +} + +[data-theme='dark'] .reactFlow :global(.react-flow__node) { + background: var(--ifm-color-emphasis-100); + border-color: var(--ifm-color-emphasis-400); + color: var(--ifm-color-content); +} + +[data-theme='dark'] .reactFlow :global(.react-flow__edge-text) { + fill: var(--ifm-color-content); +} + +/* Animation */ +@keyframes dashdraw { + to { + stroke-dashoffset: -10; + } +} + +/* Responsive Design */ +@media (max-width: 768px) { + .diagramContainer { + margin: 16px 0; + } + + .diagramTitle { + padding: 12px 16px; + font-size: 14px; + } + + .reactFlow :global(.react-flow__node) { + font-size: 11px; + padding: 6px 8px; + min-width: 100px; + } +} diff --git a/docs-website/src/components/NextStepButton/index.jsx b/docs-website/src/components/NextStepButton/index.jsx new file mode 100644 index 00000000000000..bb8c4415c6f691 --- /dev/null +++ b/docs-website/src/components/NextStepButton/index.jsx @@ -0,0 +1,47 @@ +import React from 'react'; +import Link from '@docusaurus/Link'; +import styles from './styles.module.css'; + +const NextStepButton = ({ + to, + children, + tutorialId, + currentStep, + variant = 'primary', + icon = '→' +}) => { + const handleClick = () => { + if (tutorialId && currentStep !== undefined) { + const storageKey = `datahub-tutorial-${tutorialId}`; + const savedProgress = localStorage.getItem(storageKey); + let completedSteps = new Set(); + + if (savedProgress) { + try { + completedSteps = new Set(JSON.parse(savedProgress)); + } catch (e) { + console.warn('Failed to parse tutorial progress:', e); + } + } + + // Mark current step as completed + completedSteps.add(`step-${currentStep}`); + localStorage.setItem(storageKey, JSON.stringify([...completedSteps])); + } + }; + + return ( + + + {children} + {icon} + + + ); +}; + +export default NextStepButton; diff --git a/docs-website/src/components/NextStepButton/styles.module.css b/docs-website/src/components/NextStepButton/styles.module.css new file mode 100644 index 00000000000000..26028d37f54339 --- /dev/null +++ b/docs-website/src/components/NextStepButton/styles.module.css @@ -0,0 +1,65 @@ +.nextStepButton { + display: inline-flex; + align-items: center; + padding: 12px 24px; + border-radius: 8px; + text-decoration: none; + font-weight: 600; + font-size: 16px; + transition: all 0.2s ease; + border: 2px solid transparent; + margin: 16px 0; +} + +.nextStepButton:hover { + text-decoration: none; + transform: translateY(-1px); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15); +} + +.primary { + background: var(--ifm-color-primary); + color: white; +} 
+ +.primary:hover { + background: var(--ifm-color-primary-dark); + color: white; +} + +.secondary { + background: transparent; + color: var(--ifm-color-primary); + border-color: var(--ifm-color-primary); +} + +.secondary:hover { + background: var(--ifm-color-primary); + color: white; +} + +.content { + display: flex; + align-items: center; + gap: 8px; +} + +.icon { + font-size: 18px; + transition: transform 0.2s ease; +} + +.nextStepButton:hover .icon { + transform: translateX(2px); +} + +/* Dark mode support */ +[data-theme='dark'] .secondary { + border-color: var(--ifm-color-primary-light); + color: var(--ifm-color-primary-light); +} + +[data-theme='dark'] .secondary:hover { + background: var(--ifm-color-primary-light); + color: var(--ifm-color-primary-darkest); +} diff --git a/docs-website/src/components/OSDetectionTabs/index.jsx b/docs-website/src/components/OSDetectionTabs/index.jsx new file mode 100644 index 00000000000000..d345694c7a09e7 --- /dev/null +++ b/docs-website/src/components/OSDetectionTabs/index.jsx @@ -0,0 +1,88 @@ +import React, { useState, useEffect } from 'react'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import styles from './styles.module.css'; + +const OSDetectionTabs = ({ children, defaultOS = null }) => { + // Detect OS immediately during initialization + const detectOS = () => { + if (typeof window === 'undefined') return 'linux'; // SSR fallback + + const userAgent = window.navigator.userAgent; + const platform = window.navigator.platform; + + console.log('Detecting OS - UserAgent:', userAgent, 'Platform:', platform); + + // More specific macOS detection + if (platform.indexOf('Mac') !== -1 || + userAgent.indexOf('Mac') !== -1 || + userAgent.indexOf('macOS') !== -1 || + platform === 'MacIntel' || + platform === 'MacPPC') { + return 'macos'; + } else if (userAgent.indexOf('Win') !== -1 || platform.indexOf('Win') !== -1) { + return 'windows'; + } else if (userAgent.indexOf('Linux') !== -1 || platform.indexOf('Linux') !== -1) { + return 'linux'; + } else { + return 'linux'; // Default fallback + } + }; + + const [detectedOS, setDetectedOS] = useState(() => detectOS()); + const [defaultValue, setDefaultValue] = useState(() => defaultOS || detectOS()); + + useEffect(() => { + // Re-detect OS on client side to handle SSR + const os = detectOS(); + console.log('Detected OS:', os); + setDetectedOS(os); + + // Set default tab to detected OS if no explicit default provided + if (!defaultOS) { + setDefaultValue(os); + } + }, [defaultOS]); + + // Get OS icon + const getOSIcon = (osValue) => { + switch (osValue) { + case 'windows': return '🪟'; + case 'macos': return '🍎'; + case 'linux': return '🐧'; + default: return ''; + } + }; + + // Add OS detection info to child components + const enhancedChildren = React.Children.map(children, child => { + if (React.isValidElement(child) && child.type === TabItem) { + const isDetected = child.props.value === detectedOS; + const icon = getOSIcon(child.props.value); + const label = isDetected + ? `${icon} ${child.props.label} (Your OS)` + : `${icon} ${child.props.label}`; + + return React.cloneElement(child, { + ...child.props, + label, + className: isDetected ? 
styles.detectedTab : '' + }); + } + return child; + }); + + console.log('Rendering OSDetectionTabs with defaultValue:', defaultValue, 'detectedOS:', detectedOS); + + return ( + + {enhancedChildren} + + ); +}; + +export default OSDetectionTabs; diff --git a/docs-website/src/components/OSDetectionTabs/styles.module.css b/docs-website/src/components/OSDetectionTabs/styles.module.css new file mode 100644 index 00000000000000..89f09f2b0d9a2a --- /dev/null +++ b/docs-website/src/components/OSDetectionTabs/styles.module.css @@ -0,0 +1,77 @@ +/* OS Detection Tabs Styling */ +.osDetectionTabs { + margin: 1rem 0; +} + +.detectedLabel { + color: var(--ifm-color-primary); + font-weight: 600; +} + +.osIcon { + margin-right: 0.5rem; + font-size: 1.1em; +} + +/* OS-specific icons */ +.windowsIcon::before { + content: "🪟"; +} + +.macosIcon::before { + content: "🍎"; +} + +.linuxIcon::before { + content: "🐧"; +} + +/* Enhanced tab styling for detected OS */ +.detectedTab { + background-color: var(--ifm-color-primary-lightest); + border-color: var(--ifm-color-primary); +} + +/* Code block enhancements for different shells */ +.windowsCode { + background-color: #1e1e1e; + color: #d4d4d4; +} + +.macosCode { + background-color: #2d2d2d; + color: #f8f8f2; +} + +.linuxCode { + background-color: #300a24; + color: #ffffff; +} + +/* Troubleshooting sections */ +.troubleshooting { + background-color: var(--ifm-color-warning-lightest); + border-left: 4px solid var(--ifm-color-warning); + padding: 1rem; + margin: 1rem 0; + border-radius: 0 4px 4px 0; +} + +.troubleshooting h4 { + color: var(--ifm-color-warning-dark); + margin-bottom: 0.5rem; +} + +/* System requirements styling */ +.systemRequirements { + background-color: var(--ifm-color-info-lightest); + border: 1px solid var(--ifm-color-info-light); + border-radius: 4px; + padding: 1rem; + margin: 1rem 0; +} + +.systemRequirements h4 { + color: var(--ifm-color-info-dark); + margin-bottom: 0.5rem; +} diff --git a/docs-website/src/components/ProcessFlow/index.jsx b/docs-website/src/components/ProcessFlow/index.jsx new file mode 100644 index 00000000000000..239249568d749e --- /dev/null +++ b/docs-website/src/components/ProcessFlow/index.jsx @@ -0,0 +1,148 @@ +import React from 'react'; +import styles from './styles.module.css'; + +const ProcessFlow = ({ + title, + steps, + type = 'horizontal', // 'horizontal', 'vertical', 'circular' + showNumbers = true, + animated = true +}) => { + const renderStep = (step, index) => ( +
+ {showNumbers && ( +
{index + 1}
+ )} +
+
{step.title}
+ {step.description && ( +
{step.description}
+ )} + {step.details && ( +
+ {step.details.map((detail, i) => ( +
• {detail}
+ ))} +
+ )} +
+
+ ); + + const renderConnector = (index) => ( +
+ {type === 'horizontal' ? '→' : '↓'} +
+ ); + + // Detect if we might have overflow (4+ steps in horizontal layout) + const hasOverflow = type === 'horizontal' && steps.length >= 4; + + return ( +
+ {title &&
{title}
} + +
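+      {/* Descriptive note (added for clarity): for horizontal layouts with 4+ steps,
+          hasOverflow (computed above) is presumably added to the container className so the
+          .horizontal.hasOverflow::after rule in styles.module.css can render a
+          "scroll horizontally" hint beneath the steps. */}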
+ {steps.map((step, index) => ( + + {renderStep(step, index)} + {index < steps.length - 1 && renderConnector(index)} + + ))} +
+
+ ); +}; + +// Predefined workflow configurations +export const DataHubWorkflows = { + discoveryProcess: { + title: "Enterprise Data Discovery Process", + steps: [ + { + title: "Requirements Analysis", + description: "Define business objectives", + details: ["Identify data needs", "Set success criteria", "Define scope"] + }, + { + title: "Strategic Search", + description: "Apply targeted queries", + details: ["Use business terms", "Apply filters", "Refine results"] + }, + { + title: "Asset Evaluation", + description: "Assess data quality", + details: ["Check freshness", "Review schema", "Validate completeness"] + }, + { + title: "Access Planning", + description: "Understand requirements", + details: ["Check permissions", "Review documentation", "Plan integration"] + } + ] + }, + + lineageAnalysis: { + title: "5-Hop Lineage Analysis Method", + steps: [ + { + title: "Start at Target", + description: "Begin with dataset of interest", + details: ["Open lineage view", "Identify current dataset", "Note business context"] + }, + { + title: "Trace Upstream", + description: "Follow data backwards", + details: ["Identify transformations", "Check data sources", "Document dependencies"] + }, + { + title: "Analyze Hops", + description: "Examine each connection", + details: ["Understand business logic", "Check quality gates", "Note critical points"] + }, + { + title: "Impact Assessment", + description: "Evaluate change effects", + details: ["Identify affected systems", "Assess risk levels", "Plan mitigation"] + }, + { + title: "Validate Understanding", + description: "Confirm analysis", + details: ["Review with data owners", "Test assumptions", "Document findings"] + } + ] + }, + + ingestionProcess: { + title: "Metadata Ingestion Workflow", + steps: [ + { + title: "Connection", + description: "Establish secure connections", + details: ["Configure credentials", "Test connectivity", "Set up authentication"] + }, + { + title: "Discovery", + description: "Scan data structures", + details: ["Identify schemas", "Map relationships", "Detect patterns"] + }, + { + title: "Extraction", + description: "Pull comprehensive metadata", + details: ["Schema information", "Statistics", "Lineage data"] + }, + { + title: "Transformation", + description: "Standardize metadata format", + details: ["Apply business rules", "Enrich with context", "Validate quality"] + }, + { + title: "Loading", + description: "Store in DataHub", + details: ["Update knowledge graph", "Index for search", "Enable discovery"] + } + ] + } +}; + +export default ProcessFlow; diff --git a/docs-website/src/components/ProcessFlow/styles.module.css b/docs-website/src/components/ProcessFlow/styles.module.css new file mode 100644 index 00000000000000..024765eb19f71c --- /dev/null +++ b/docs-website/src/components/ProcessFlow/styles.module.css @@ -0,0 +1,270 @@ +/* Process Flow Styles */ +.processFlow { + background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%); + border: 1px solid var(--ifm-color-primary-lightest); + border-radius: 12px; + padding: 24px; + margin: 24px 0; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08); + position: relative; + overflow: hidden; +} + +.flowTitle { + text-align: center; + font-size: 1.3rem; + font-weight: 600; + color: var(--ifm-color-primary-dark); + margin-bottom: 24px; + padding-bottom: 12px; + border-bottom: 2px solid var(--ifm-color-primary-lightest); +} + +.flowContainer { + display: flex; + align-items: stretch; + gap: 16px; + max-width: 100%; + overflow-x: auto; + padding: 8px 0; + scrollbar-width: thin; + 
scrollbar-color: var(--ifm-color-primary-light) transparent; +} + +/* Custom scrollbar for webkit browsers */ +.flowContainer::-webkit-scrollbar { + height: 6px; +} + +.flowContainer::-webkit-scrollbar-track { + background: var(--ifm-color-emphasis-200); + border-radius: 3px; +} + +.flowContainer::-webkit-scrollbar-thumb { + background: var(--ifm-color-primary-light); + border-radius: 3px; +} + +.flowContainer::-webkit-scrollbar-thumb:hover { + background: var(--ifm-color-primary); +} + +/* Horizontal Layout */ +.horizontal .flowContainer { + flex-direction: row; + justify-content: flex-start; + flex-wrap: nowrap; +} + +.horizontal .step { + flex: 0 0 auto; + min-width: 180px; + max-width: 220px; +} + +/* Scroll hint for horizontal flows with many steps */ +.horizontal.hasOverflow::after { + content: "← Scroll horizontally to see all steps →"; + position: absolute; + bottom: 8px; + left: 50%; + transform: translateX(-50%); + font-size: 12px; + color: var(--ifm-color-emphasis-600); + white-space: nowrap; + pointer-events: none; + background: rgba(255, 255, 255, 0.9); + padding: 4px 8px; + border-radius: 4px; + border: 1px solid var(--ifm-color-emphasis-300); +} + +.horizontal .connector { + display: flex; + align-items: center; + justify-content: center; + font-size: 1.5rem; + color: var(--ifm-color-primary); + font-weight: bold; + flex-shrink: 0; + width: 40px; +} + +/* Vertical Layout */ +.vertical .flowContainer { + flex-direction: column; + align-items: center; +} + +.vertical .step { + width: 100%; + max-width: 500px; +} + +.vertical .connector { + display: flex; + align-items: center; + justify-content: center; + font-size: 1.5rem; + color: var(--ifm-color-primary); + font-weight: bold; + height: 30px; + width: 100%; +} + +/* Step Styles */ +.step { + background: white; + border: 2px solid var(--ifm-color-primary-light); + border-radius: 8px; + padding: 16px; + position: relative; + transition: all 0.3s ease; + cursor: pointer; +} + +.step:hover { + transform: translateY(-2px); + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.12); + border-color: var(--ifm-color-primary); +} + +.stepNumber { + position: absolute; + top: -12px; + left: 16px; + background: var(--ifm-color-primary); + color: white; + width: 24px; + height: 24px; + border-radius: 50%; + display: flex; + align-items: center; + justify-content: center; + font-size: 0.8rem; + font-weight: bold; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); +} + +.stepContent { + margin-top: 8px; +} + +.stepTitle { + font-weight: 600; + font-size: 1rem; + color: var(--ifm-color-emphasis-800); + margin-bottom: 8px; +} + +.stepDescription { + font-size: 0.9rem; + color: var(--ifm-color-emphasis-600); + margin-bottom: 12px; + font-style: italic; +} + +.stepDetails { + font-size: 0.8rem; + color: var(--ifm-color-emphasis-700); +} + +.stepDetail { + margin-bottom: 4px; + padding-left: 8px; +} + +/* Animation */ +.animated { + animation: slideIn 0.6s ease-out forwards; + opacity: 0; +} + +.animated:nth-child(1) { animation-delay: 0.1s; } +.animated:nth-child(3) { animation-delay: 0.2s; } +.animated:nth-child(5) { animation-delay: 0.3s; } +.animated:nth-child(7) { animation-delay: 0.4s; } +.animated:nth-child(9) { animation-delay: 0.5s; } + +@keyframes slideIn { + from { + opacity: 0; + transform: translateY(20px); + } + to { + opacity: 1; + transform: translateY(0); + } +} + +/* Connector Animation */ +.connector { + animation: pulse 2s infinite; +} + +@keyframes pulse { + 0%, 100% { opacity: 0.6; } + 50% { opacity: 1; } +} + +/* Responsive Design */ +@media 
(max-width: 768px) { + .horizontal .flowContainer { + flex-direction: column; + align-items: center; + } + + .horizontal .step { + width: 100%; + max-width: 400px; + } + + .horizontal .connector { + transform: rotate(90deg); + width: 100%; + height: 30px; + } + + .stepNumber { + left: 12px; + } +} + +/* Dark Mode Support */ +[data-theme='dark'] .processFlow { + background: linear-gradient(135deg, #1e293b 0%, #334155 100%); + border-color: var(--ifm-color-primary-dark); +} + +[data-theme='dark'] .step { + background: var(--ifm-color-emphasis-100); + border-color: var(--ifm-color-primary-dark); + color: var(--ifm-color-emphasis-800); +} + +[data-theme='dark'] .step:hover { + border-color: var(--ifm-color-primary); + background: var(--ifm-color-emphasis-200); +} + +/* Step Type Variations */ +.step.start { + border-color: #10b981; + background: linear-gradient(135deg, #ecfdf5 0%, #d1fae5 100%); +} + +.step.process { + border-color: #3b82f6; + background: linear-gradient(135deg, #eff6ff 0%, #dbeafe 100%); +} + +.step.decision { + border-color: #f59e0b; + background: linear-gradient(135deg, #fffbeb 0%, #fef3c7 100%); +} + +.step.end { + border-color: #8b5cf6; + background: linear-gradient(135deg, #f5f3ff 0%, #ede9fe 100%); +} diff --git a/docs-website/src/components/StepCompletion/index.jsx b/docs-website/src/components/StepCompletion/index.jsx new file mode 100644 index 00000000000000..dd8a9bd51fcc75 --- /dev/null +++ b/docs-website/src/components/StepCompletion/index.jsx @@ -0,0 +1,50 @@ +import React, { useState, useEffect } from 'react'; +import styles from './styles.module.css'; + +const StepCompletion = ({ stepId, children, completionText = "✅ Completed!" }) => { + const [isCompleted, setIsCompleted] = useState(false); + const storageKey = `datahub-step-${stepId}`; + + // Load completion status from localStorage + useEffect(() => { + const saved = localStorage.getItem(storageKey); + if (saved === 'true') { + setIsCompleted(true); + } + }, [storageKey]); + + // Save completion status to localStorage + useEffect(() => { + localStorage.setItem(storageKey, isCompleted.toString()); + }, [isCompleted, storageKey]); + + const toggleCompletion = () => { + setIsCompleted(!isCompleted); + }; + + return ( +
+
+ {children} +
+
+ +
+
+ ); +}; + +export default StepCompletion; diff --git a/docs-website/src/components/StepCompletion/styles.module.css b/docs-website/src/components/StepCompletion/styles.module.css new file mode 100644 index 00000000000000..ca212b85e36a2b --- /dev/null +++ b/docs-website/src/components/StepCompletion/styles.module.css @@ -0,0 +1,76 @@ +.stepCompletion { + background: #f8f9fa; + border: 2px solid #e9ecef; + border-radius: 8px; + padding: 16px; + margin: 16px 0; + transition: all 0.3s ease; +} + +.stepCompletion.completed { + background: #d4edda; + border-color: #28a745; +} + +.content { + margin-bottom: 12px; +} + +.completionControl { + border-top: 1px solid #e9ecef; + padding-top: 12px; +} + +.completionLabel { + display: flex; + align-items: center; + gap: 8px; + cursor: pointer; + margin: 0; + font-weight: 500; +} + +.checkbox { + display: none; +} + +.checkmark { + font-size: 16px; + transition: transform 0.1s ease; +} + +.completionLabel:hover .checkmark { + transform: scale(1.1); +} + +.completionText { + color: #495057; + font-size: 14px; +} + +.completed .completionText { + color: #155724; +} + +/* Dark mode support */ +[data-theme='dark'] .stepCompletion { + background: #2d2d2d; + border-color: #444; +} + +[data-theme='dark'] .stepCompletion.completed { + background: #1e3a1e; + border-color: #28a745; +} + +[data-theme='dark'] .completionControl { + border-top-color: #444; +} + +[data-theme='dark'] .completionText { + color: #e9ecef; +} + +[data-theme='dark'] .completed .completionText { + color: #90ee90; +} diff --git a/docs-website/src/components/TutorialExercise/index.jsx b/docs-website/src/components/TutorialExercise/index.jsx new file mode 100644 index 00000000000000..30455597a26052 --- /dev/null +++ b/docs-website/src/components/TutorialExercise/index.jsx @@ -0,0 +1,137 @@ +import React from 'react'; +import styles from './styles.module.css'; + +const TutorialExercise = ({ + title, + type = 'search', + icon, + children, + difficulty = 'beginner', + timeEstimate, + platform = 'DataHub' +}) => { + const getTypeIcon = () => { + switch (type) { + case 'search': + return '🔍'; + case 'hands-on': + return '💻'; + case 'analysis': + return '📊'; + case 'exercise': + return '🎯'; + default: + return '📝'; + } + }; + + const getDifficultyColor = () => { + switch (difficulty) { + case 'beginner': + return 'var(--datahub-success)'; + case 'intermediate': + return 'var(--datahub-warning)'; + case 'advanced': + return 'var(--datahub-error)'; + default: + return 'var(--datahub-primary)'; + } + }; + + return ( +
+
+
+
+ {icon || getTypeIcon()} +
+
+

{title}

+
+ + {difficulty} + + {timeEstimate && ( + + ⏱️ {timeEstimate} + + )} + + {platform} + +
+
+
+
+
+ {children} +
+
+ ); +}; + +export const SearchExercise = ({ title, searches, children, ...props }) => ( + + {searches && ( +
+ {searches.map((search, index) => ( +
+
+ {search.query} +
+ {search.description && ( +
+ {search.description} +
+ )} + {search.expected && ( +
+ Expected: {search.expected} +
+ )} +
+ ))} +
+ )} + {children} +
+); + +export const HandsOnExercise = ({ title, steps, children, ...props }) => ( + + {steps && ( +
+ {steps.map((step, index) => ( +
+
{index + 1}
+
+
{step.title}
+ {step.description && ( +
{step.description}
+ )} + {step.code && ( +
+ {step.code} +
+ )} +
+
+ ))} +
+ )} + {children} +
+); + +export const InteractiveDemo = ({ title, children, ...props }) => ( + +
+ {children} +
+
+); + +export default TutorialExercise; diff --git a/docs-website/src/components/TutorialExercise/styles.module.css b/docs-website/src/components/TutorialExercise/styles.module.css new file mode 100644 index 00000000000000..2190c4226ee8b7 --- /dev/null +++ b/docs-website/src/components/TutorialExercise/styles.module.css @@ -0,0 +1,284 @@ +/* Import DataHub design tokens */ +@import url('../../css/datahub-tokens.css'); + +.exerciseContainer { + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 12px; + box-shadow: var(--datahub-shadow); + margin: 24px 0; + overflow: hidden; + transition: all 0.2s ease; +} + +.exerciseContainer:hover { + box-shadow: var(--datahub-shadow-hover); + border-color: var(--datahub-primary-light); +} + +.exerciseHeader { + background: linear-gradient(135deg, var(--datahub-gray-50) 0%, var(--datahub-white) 100%); + border-bottom: 1px solid var(--datahub-border); + padding: 16px 20px; + display: flex; + align-items: center; + justify-content: space-between; +} + +.headerLeft { + display: flex; + align-items: center; + gap: 12px; +} + +.typeIcon { + width: 40px; + height: 40px; + background: var(--datahub-primary); + color: var(--datahub-white); + border-radius: 8px; + display: flex; + align-items: center; + justify-content: center; + font-size: 18px; + flex-shrink: 0; +} + +.titleSection { + display: flex; + flex-direction: column; + gap: 4px; +} + +.exerciseTitle { + margin: 0; + font-size: 16px; + font-weight: 600; + color: var(--datahub-text-primary); + line-height: 1.3; +} + +.metadata { + display: flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; +} + +.difficulty { + padding: 2px 8px; + border-radius: 12px; + font-size: 11px; + font-weight: 500; + color: var(--datahub-white); + text-transform: uppercase; + letter-spacing: 0.5px; +} + +.timeEstimate { + font-size: 12px; + color: var(--datahub-text-secondary); + background: var(--datahub-gray-100); + padding: 2px 6px; + border-radius: 4px; +} + +.platform { + font-size: 12px; + color: var(--datahub-primary); + background: var(--datahub-primary-light); + padding: 2px 6px; + border-radius: 4px; + font-weight: 500; +} + +.exerciseContent { + padding: 20px; +} + +/* Search Exercise Styles */ +.searchList { + display: flex; + flex-direction: column; + gap: 16px; +} + +.searchItem { + background: var(--datahub-gray-50); + border: 1px solid var(--datahub-border-light); + border-radius: 8px; + padding: 16px; + transition: all 0.2s ease; +} + +.searchItem:hover { + background: var(--datahub-white); + border-color: var(--datahub-primary-light); +} + +.searchQuery { + margin-bottom: 8px; +} + +.searchQuery code { + background: var(--datahub-primary-dark); + color: var(--datahub-white); + padding: 8px 12px; + border-radius: 6px; + font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; + font-size: 14px; + font-weight: 500; + display: inline-block; + min-width: 200px; +} + +.searchDescription { + color: var(--datahub-text-secondary); + font-size: 14px; + margin-bottom: 6px; + line-height: 1.4; +} + +.searchExpected { + color: var(--datahub-text-primary); + font-size: 13px; + background: var(--datahub-success-light); + padding: 6px 10px; + border-radius: 4px; + border-left: 3px solid var(--datahub-success); +} + +/* Hands-On Exercise Styles */ +.stepsList { + display: flex; + flex-direction: column; + gap: 16px; +} + +.stepItem { + display: flex; + gap: 12px; + align-items: flex-start; +} + +.stepNumber { + width: 28px; + height: 28px; + background: var(--datahub-primary); 
+ color: var(--datahub-white); + border-radius: 50%; + display: flex; + align-items: center; + justify-content: center; + font-size: 14px; + font-weight: 600; + flex-shrink: 0; + margin-top: 2px; +} + +.stepContent { + flex: 1; + display: flex; + flex-direction: column; + gap: 6px; +} + +.stepTitle { + font-weight: 600; + color: var(--datahub-text-primary); + font-size: 15px; + line-height: 1.4; +} + +.stepDescription { + color: var(--datahub-text-secondary); + font-size: 14px; + line-height: 1.5; +} + +.stepCode { + background: var(--datahub-gray-900); + color: var(--datahub-white); + padding: 10px 12px; + border-radius: 6px; + font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; + font-size: 13px; + margin-top: 4px; +} + +/* Interactive Demo Styles */ +.interactiveContent { + background: linear-gradient(135deg, var(--datahub-primary-light) 0%, var(--datahub-white) 100%); + border: 1px solid var(--datahub-primary-light); + border-radius: 8px; + padding: 20px; +} + +/* Responsive Design */ +@media (max-width: 768px) { + .exerciseHeader { + padding: 12px 16px; + } + + .headerLeft { + gap: 8px; + } + + .typeIcon { + width: 32px; + height: 32px; + font-size: 16px; + } + + .exerciseTitle { + font-size: 14px; + } + + .exerciseContent { + padding: 16px; + } + + .metadata { + gap: 6px; + } + + .searchQuery code { + min-width: auto; + font-size: 13px; + padding: 6px 10px; + } + + .stepItem { + gap: 8px; + } + + .stepNumber { + width: 24px; + height: 24px; + font-size: 12px; + } +} + +/* Dark mode support */ +[data-theme='dark'] .exerciseContainer { + background: var(--datahub-gray-800); + border-color: var(--datahub-gray-700); +} + +[data-theme='dark'] .exerciseHeader { + background: linear-gradient(135deg, var(--datahub-gray-700) 0%, var(--datahub-gray-800) 100%); + border-bottom-color: var(--datahub-gray-700); +} + +[data-theme='dark'] .searchItem { + background: var(--datahub-gray-700); + border-color: var(--datahub-gray-600); +} + +[data-theme='dark'] .searchItem:hover { + background: var(--datahub-gray-600); +} + +[data-theme='dark'] .interactiveContent { + background: linear-gradient(135deg, var(--datahub-primary-dark) 0%, var(--datahub-gray-800) 100%); +} diff --git a/docs-website/src/components/TutorialProgress/index.jsx b/docs-website/src/components/TutorialProgress/index.jsx new file mode 100644 index 00000000000000..ef84bae6933712 --- /dev/null +++ b/docs-website/src/components/TutorialProgress/index.jsx @@ -0,0 +1,203 @@ +import React, { useState, useEffect } from 'react'; +import { useHistory, useLocation } from '@docusaurus/router'; +import styles from './styles.module.css'; + +const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) => { + const [completedSteps, setCompletedSteps] = useState(new Set()); + const [isMinimized, setIsMinimized] = useState(false); + const [isScrolled, setIsScrolled] = useState(false); + + // Handle both old and new formats + const actualTutorialId = tutorialId || 'tutorial'; + const actualCurrentStep = typeof currentStep === 'string' ? 
currentStep : `step-${currentStep}`; + const storageKey = `datahub-tutorial-${actualTutorialId}`; + + // Load progress from localStorage on component mount + useEffect(() => { + const savedProgress = localStorage.getItem(storageKey); + if (savedProgress) { + try { + const parsed = JSON.parse(savedProgress); + setCompletedSteps(new Set(parsed)); + } catch (e) { + console.warn('Failed to parse tutorial progress:', e); + } + } + }, [storageKey]); + + // Save progress to localStorage whenever completedSteps changes + useEffect(() => { + localStorage.setItem(storageKey, JSON.stringify([...completedSteps])); + }, [completedSteps, storageKey]); + + const toggleStep = (stepId) => { + setCompletedSteps(prev => { + const newSet = new Set(prev); + if (newSet.has(stepId)) { + newSet.delete(stepId); + } else { + newSet.add(stepId); + // Auto-mark previous steps as completed + const stepIndex = parseInt(stepId.split('-')[1]); + for (let i = 0; i < stepIndex; i++) { + newSet.add(`step-${i}`); + } + } + return newSet; + }); + }; + + const resetProgress = () => { + setCompletedSteps(new Set()); + localStorage.removeItem(storageKey); + }; + + // Auto-mark current step as completed when user navigates + useEffect(() => { + if (currentStep !== undefined) { + setCompletedSteps(prev => { + const newSet = new Set(prev); + newSet.add(actualCurrentStep); + return newSet; + }); + } + }, [actualCurrentStep]); + + // Handle scroll behavior for auto-minimizing + useEffect(() => { + const handleScroll = () => { + const scrollTop = window.pageYOffset || document.documentElement.scrollTop; + setIsScrolled(scrollTop > 100); // Auto-minimize after scrolling 100px + }; + + window.addEventListener('scroll', handleScroll); + return () => window.removeEventListener('scroll', handleScroll); + }, []); + + const toggleMinimized = () => { + setIsMinimized(!isMinimized); + }; + + const completionPercentage = Math.round((completedSteps.size / steps.length) * 100); + + if (compact) { + return ( +
+
+ 📋 Progress: {completedSteps.size}/{steps.length} +
+
+
+
+
+ ); + } + + // Determine if we should show minimized version + const shouldShowMinimized = isMinimized || isScrolled; + + if (shouldShowMinimized) { + return ( +
+
+
+ + 📋 {completedSteps.size}/{steps.length} completed ({completionPercentage}%) + +
+
+
+
+ +
+
+ ); + } + + return ( +
+
+
+

📋 Tutorial Progress

+ +
+
+
+ + {completedSteps.size} of {steps.length} completed ({completionPercentage}%) + +
+
+ +
+ {steps.map((step, index) => { + // Handle both old format (step-${index}) and new format (step.id) + const stepId = step.id || `step-${index}`; + const isCompleted = completedSteps.has(stepId); + const isCurrent = actualCurrentStep === stepId; + + return ( +
+ + {step.description && ( +
{step.description}
+ )} +
+ ); + })} +
+ +
+ + {completedSteps.size === steps.length && ( +
+ 🎉 Tutorial Complete! Great job finishing all steps! +
+ )} +
+
+ ); +}; + +export default TutorialProgress; diff --git a/docs-website/src/components/TutorialProgress/styles.module.css b/docs-website/src/components/TutorialProgress/styles.module.css new file mode 100644 index 00000000000000..d70b8de5150c77 --- /dev/null +++ b/docs-website/src/components/TutorialProgress/styles.module.css @@ -0,0 +1,379 @@ +.tutorialProgress { + background: var(--ifm-background-color); + border: 1px solid var(--ifm-color-emphasis-300); + border-radius: 12px; + padding: 24px; + margin: 32px 0; + font-family: var(--ifm-font-family-base); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); + position: sticky; + top: 20px; + z-index: 10; +} + +.header { + margin-bottom: 16px; +} + +.header h4 { + margin: 0 0 16px 0; + color: var(--ifm-color-primary); + font-size: 18px; + font-weight: 700; + display: flex; + align-items: center; + gap: 8px; +} + +.progressBar { + position: relative; + background: #e9ecef; + border-radius: 10px; + height: 20px; + overflow: hidden; +} + +.progressFill { + background: linear-gradient(90deg, var(--ifm-color-primary) 0%, var(--ifm-color-primary-light) 100%); + height: 100%; + border-radius: 10px; + transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1); +} + +.progressText { + position: absolute; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); + font-size: 13px; + font-weight: 600; + color: var(--ifm-color-content); + text-shadow: 0 0 3px var(--ifm-background-color); +} + +.stepsList { + display: flex; + flex-direction: column; + gap: 12px; +} + +.step { + padding: 12px; + border-radius: 6px; + transition: all 0.2s ease; +} + +.step:hover { + background: rgba(0, 123, 255, 0.05); +} + +.step.current { + background: var(--ifm-color-primary-lightest); + border-left: 4px solid var(--ifm-color-primary); + padding-left: 16px; + border-radius: 8px; +} + +.stepLabel { + display: flex; + align-items: flex-start; + gap: 12px; + cursor: pointer; + margin: 0; +} + +.checkbox { + display: none; +} + +.checkmark { + font-size: 18px; + line-height: 1; + user-select: none; + transition: transform 0.1s ease; +} + +.stepLabel:hover .checkmark { + transform: scale(1.1); +} + +.stepText { + flex: 1; + line-height: 1.4; +} + +.stepText strong { + color: #495057; + font-weight: 600; +} + +.time { + color: #6c757d; + font-size: 14px; + margin-left: 8px; +} + +.currentBadge { + color: var(--ifm-color-primary); + font-weight: 600; + font-size: 14px; + margin-left: 12px; + background: var(--ifm-color-primary-lightest); + padding: 2px 8px; + border-radius: 12px; + border: 1px solid var(--ifm-color-primary-light); +} + +.stepDescription { + margin-top: 6px; + margin-left: 30px; + font-size: 14px; + color: #6c757d; + line-height: 1.4; +} + +.actions { + margin-top: 20px; + padding-top: 16px; + border-top: 1px solid #e9ecef; + display: flex; + justify-content: space-between; + align-items: center; +} + +.resetButton { + background: #f8f9fa; + border: 1px solid #dee2e6; + border-radius: 4px; + padding: 6px 12px; + font-size: 12px; + color: #6c757d; + cursor: pointer; + transition: all 0.2s ease; +} + +.resetButton:hover { + background: #e9ecef; + border-color: #adb5bd; +} + +.completionMessage { + color: var(--ifm-color-success); + font-weight: 600; + font-size: 14px; + background: var(--ifm-color-success-lightest); + padding: 12px 16px; + border-radius: 8px; + border: 1px solid var(--ifm-color-success-light); + text-align: center; +} + +/* Dark mode support */ +[data-theme='dark'] .tutorialProgress { + background: #1e1e1e; + border-color: #444; + color: #e9ecef; +} + 
+[data-theme='dark'] .header h4 { + color: #e9ecef; +} + +[data-theme='dark'] .progressBar { + background: #444; +} + +[data-theme='dark'] .progressText { + color: #e9ecef; + text-shadow: 0 0 3px rgba(0, 0, 0, 0.8); +} + +[data-theme='dark'] .step:hover { + background: rgba(0, 123, 255, 0.15); +} + +[data-theme='dark'] .step.current { + background: rgba(0, 123, 255, 0.2); +} + +[data-theme='dark'] .stepText strong { + color: #e9ecef; +} + +[data-theme='dark'] .actions { + border-top-color: #444; +} + +[data-theme='dark'] .resetButton { + background: #2d2d2d; + border-color: #444; + color: #adb5bd; +} + +[data-theme='dark'] .resetButton:hover { + background: #3d3d3d; + border-color: #555; +} + +/* Compact mode styles */ +.compact { + position: relative; + top: auto; + margin: 16px 0; + padding: 16px; + background: var(--ifm-color-emphasis-100); + border: 1px solid var(--ifm-color-emphasis-200); +} + +.compactHeader { + display: flex; + align-items: center; + gap: 16px; +} + +.compactTitle { + font-weight: 600; + color: var(--ifm-color-content); + font-size: 14px; + white-space: nowrap; +} + +.compactBar { + flex: 1; + height: 8px; + background: var(--ifm-color-emphasis-200); + border-radius: 4px; + overflow: hidden; +} + +.compactBar .progressFill { + height: 100%; + border-radius: 4px; +} + +[data-theme='dark'] .compact { + background: var(--ifm-color-emphasis-200); + border-color: var(--ifm-color-emphasis-300); +} + +/* Header content layout */ +.headerContent { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 12px; +} + +/* Minimize/Expand buttons */ +.minimizeButton, +.expandButton { + background: none; + border: none; + font-size: 16px; + cursor: pointer; + padding: 4px 8px; + border-radius: 4px; + transition: all 0.2s ease; + opacity: 0.7; +} + +.minimizeButton:hover, +.expandButton:hover { + opacity: 1; + background: var(--ifm-color-emphasis-100); +} + +/* Minimized state styles */ +.minimized { + position: fixed; + top: 80px; /* Below the DataHub header banner */ + right: 20px; + width: 300px; + z-index: 100; /* Lower z-index to stay below header */ + margin: 0; + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.15); + border-radius: 8px; + transition: all 0.3s ease; +} + +.minimized.scrolled { + top: 70px; /* Slightly higher when scrolled but still below header */ + width: 280px; + box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); +} + +.minimizedHeader { + display: flex; + align-items: center; + justify-content: space-between; + padding: 12px 16px; + cursor: pointer; + transition: all 0.2s ease; +} + +.minimizedHeader:hover { + background: var(--ifm-color-emphasis-50); +} + +.minimizedContent { + flex: 1; + display: flex; + flex-direction: column; + gap: 8px; +} + +.minimizedTitle { + font-weight: 600; + font-size: 14px; + color: var(--ifm-color-content); +} + +.minimizedBar { + height: 6px; + background: var(--ifm-color-emphasis-200); + border-radius: 3px; + overflow: hidden; +} + +.minimizedBar .progressFill { + height: 100%; + border-radius: 3px; +} + +.expandButton { + margin-left: 12px; + font-size: 14px; +} + +/* Dark mode adjustments for minimized state */ +[data-theme='dark'] .minimized { + background: var(--ifm-color-emphasis-200); + border-color: var(--ifm-color-emphasis-400); +} + +[data-theme='dark'] .minimizedHeader:hover { + background: var(--ifm-color-emphasis-300); +} + +[data-theme='dark'] .minimizedBar { + background: var(--ifm-color-emphasis-400); +} + +/* Responsive design for minimized state */ +@media (max-width: 768px) { + .minimized 
{ + position: relative; + top: auto; + right: auto; + width: 100%; + margin: 16px 0; + } + + .minimized.scrolled { + position: fixed; + top: 60px; /* Account for mobile header */ + left: 10px; + right: 10px; + width: auto; + } +} diff --git a/docs-website/src/css/mermaid-custom.css b/docs-website/src/css/mermaid-custom.css new file mode 100644 index 00000000000000..c3c987c42ec84f --- /dev/null +++ b/docs-website/src/css/mermaid-custom.css @@ -0,0 +1,166 @@ +/* Enhanced Mermaid Diagram Styling for DataHub */ + +/* Container styling */ +.mermaid { + background: var(--ifm-background-color); + border-radius: 12px; + padding: 20px; + margin: 24px 0; + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); + border: 1px solid var(--ifm-color-emphasis-200); + overflow: visible; +} + +/* Dark mode adjustments */ +[data-theme='dark'] .mermaid { + background: var(--ifm-color-emphasis-100); + border-color: var(--ifm-color-emphasis-300); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3); +} + +/* Enhanced node styling */ +.mermaid .node rect, +.mermaid .node circle, +.mermaid .node ellipse, +.mermaid .node polygon { + stroke-width: 2px; + filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.1)); + transition: all 0.2s ease; +} + +.mermaid .node:hover rect, +.mermaid .node:hover circle, +.mermaid .node:hover ellipse, +.mermaid .node:hover polygon { + filter: drop-shadow(0 4px 8px rgba(0, 0, 0, 0.15)); + transform: translateY(-1px); +} + +/* Enhanced edge/arrow styling */ +.mermaid .edgePath path { + stroke-width: 2px; + filter: drop-shadow(0 1px 2px rgba(0, 0, 0, 0.1)); +} + +.mermaid .arrowheadPath { + fill: var(--ifm-color-primary); + stroke: var(--ifm-color-primary); +} + +/* Text styling improvements */ +.mermaid .nodeLabel, +.mermaid .edgeLabel { + font-family: var(--ifm-font-family-base); + font-weight: 500; + text-shadow: 0 1px 2px rgba(255, 255, 255, 0.8); +} + +[data-theme='dark'] .mermaid .nodeLabel, +[data-theme='dark'] .mermaid .edgeLabel { + text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8); +} + +/* Cluster/subgraph styling */ +.mermaid .cluster rect { + fill: var(--ifm-color-primary-lightest); + stroke: var(--ifm-color-primary-light); + stroke-width: 2px; + stroke-dasharray: 5,5; + rx: 8px; + ry: 8px; +} + +/* Flowchart specific enhancements */ +.mermaid .flowchart-link { + stroke: var(--ifm-color-primary); + stroke-width: 2px; +} + +/* Sequence diagram enhancements */ +.mermaid .actor { + fill: var(--ifm-color-primary-lightest); + stroke: var(--ifm-color-primary); + stroke-width: 2px; +} + +.mermaid .messageLine0, +.mermaid .messageLine1 { + stroke: var(--ifm-color-primary); + stroke-width: 2px; +} + +/* Gantt chart enhancements */ +.mermaid .section0, +.mermaid .section1, +.mermaid .section2, +.mermaid .section3 { + fill: var(--ifm-color-primary); + opacity: 0.8; +} + +/* Git graph enhancements */ +.mermaid .commit-id, +.mermaid .commit-msg, +.mermaid .branch-label { + font-family: var(--ifm-font-family-monospace); + font-size: 12px; +} + +/* State diagram enhancements */ +.mermaid .state-start circle, +.mermaid .state-end circle { + fill: var(--ifm-color-primary); + stroke: var(--ifm-color-primary-dark); + stroke-width: 2px; +} + +/* Journey diagram enhancements */ +.mermaid .journey-section { + fill: var(--ifm-color-primary-lightest); +} + +/* Responsive design */ +@media (max-width: 768px) { + .mermaid { + padding: 16px; + margin: 16px 0; + font-size: 14px; + } +} + +/* Animation for diagram loading */ +.mermaid { + animation: fadeInUp 0.5s ease-out; +} + +@keyframes fadeInUp { + from { + opacity: 0; + transform: 
translateY(20px); + } + to { + opacity: 1; + transform: translateY(0); + } +} + +/* Custom styling for tutorial-specific diagrams */ +.mermaid .tutorial-start { + fill: var(--ifm-color-success-lightest) !important; + stroke: var(--ifm-color-success) !important; +} + +.mermaid .tutorial-end { + fill: var(--ifm-color-primary-lightest) !important; + stroke: var(--ifm-color-primary) !important; +} + +.mermaid .tutorial-process { + fill: var(--ifm-color-info-lightest) !important; + stroke: var(--ifm-color-info) !important; +} + +.mermaid .tutorial-decision { + fill: var(--ifm-color-warning-lightest) !important; + stroke: var(--ifm-color-warning) !important; +} diff --git a/docs-website/src/pages/datahub-components-demo.md b/docs-website/src/pages/datahub-components-demo.md new file mode 100644 index 00000000000000..4b4a79f6737a43 --- /dev/null +++ b/docs-website/src/pages/datahub-components-demo.md @@ -0,0 +1,165 @@ +# DataHub UI Components Demo + +This page demonstrates the DataHub-style UI components that can be embedded in tutorials to provide an authentic DataHub experience. + +import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard'; +import DataHubLineageNode, { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode'; + +## Entity Cards + +These cards mimic the actual DataHub search results and entity previews: + +### Sample User Analytics Tables + + + + +### Streaming Data Sources + + + +### Raw Data Storage + + + +## Lineage Flows + +These components show data pipeline relationships using actual DataHub styling: + +### User Metrics Pipeline (Basic) + + + +### User Metrics Pipeline (with Column-Level Lineage) + + + +### Troubleshooting Flow + + + +## Individual Lineage Nodes + +### Dataset Nodes (Rectangular) - With Tags & Glossary Terms +
+ + + +
+ +### Data Job Nodes (Circular) +
+ + + +
+ +## Updated Specifications + +The components now match DataHub V3 specifications: + +### Dataset Nodes (Rectangular) +- **Width**: 320px (matches `LINEAGE_NODE_WIDTH`) +- **Height**: 90px base + expandable columns section +- **Border Radius**: 12px (DataHub V3 styling) +- **Health Icons**: Actual SVG icons (✓ for Good, ⚠ for Warning/Critical) +- **Expandable Columns**: Click + button to show/hide column details +- **Column Types**: Color-coded icons (Aa for strings, 123 for numbers, etc.) +- **Column Lineage**: → indicator shows columns with lineage connections +- **Column-Level Lineage**: Visual connections between related columns across nodes (when all nodes expanded) +- **Tags**: Color-coded dots with tag names (e.g., PII, Daily, Streaming) +- **Glossary Terms**: Colored ribbon indicators with term names (e.g., User Metrics, Fact Table) + +### Data Job Nodes (Circular) +- **Size**: 40px × 40px (matches `TRANSFORMATION_NODE_SIZE`) +- **Border Radius**: 8px (slightly rounded for transformation nodes) +- **Health Icons**: Positioned as badges in top-right corner +- **Platform Logos**: 18px icons centered in the node +- **No Expansion**: Data jobs don't have column-level details + +### Entity Cards +- **Colors**: Synced with DataHub Alchemy design system +- **Primary**: `#533FD1` (DataHub violet[500]) +- **Border**: `#E9EAEE` (DataHub gray[1400]) +- **Text**: `#374066` (DataHub gray[600]) + +## Benefits of Using Actual DataHub Components + +1. **Pixel-Perfect Accuracy**: Matches exact DataHub V3 dimensions and styling +2. **Auto-Sync**: Colors and design tokens automatically sync with DataHub updates +3. **Real Platform Logos**: Uses the actual SVG logos from DataHub's platform library +4. **Consistent Experience**: Users see the exact same UI they'll encounter in DataHub +5. **Future-Proof**: Automatically stays in sync as DataHub UI evolves + +## Technical Implementation + +These components are now precisely calibrated to DataHub's actual specifications: + +- **DataHubEntityCard**: Based on `DefaultPreviewCard` with exact color tokens +- **DataHubLineageNode**: Based on `LineageEntityNode` with V3 dimensions (320x90px) +- **Platform Logos**: Uses the same SVG assets as production DataHub UI +- **Design Tokens**: Automatically extracted from `datahub-web-react/src/alchemy-components/theme/` + +The styling is automatically synchronized at build time, ensuring tutorial components always match the production DataHub interface. 
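For tutorial authors, embedding these components in another MDX page follows the same pattern as this demo. The snippet below is only a sketch: the import paths and named exports (`SampleEntities`, `DataHubLineageFlow`, `SampleLineageFlows`) match the imports used on this page, but the inline prop names (`name`, `platform`, `description`, `tags`) and the sample-flow key are illustrative assumptions — check the component sources under `docs-website/src/components/` for the actual prop contracts.

```jsx
import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard';
import { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode';

{/* Reuse a bundled sample entity (assumes SampleEntities is an array of prop objects). */}
<DataHubEntityCard {...SampleEntities[0]} />

{/* Or describe an entity inline — these prop names are hypothetical. */}
<DataHubEntityCard
  name="fact_customer_orders"
  platform="snowflake"
  description="Daily grain of completed customer orders"
  tags={["PII", "Daily"]}
/>

{/* Render a bundled lineage flow — the key name here is hypothetical. */}
<DataHubLineageFlow {...SampleLineageFlows.userMetrics} />
```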
diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock index c0ac2729adebab..25597f2e3a9fd2 100644 --- a/docs-website/yarn.lock +++ b/docs-website/yarn.lock @@ -2885,6 +2885,72 @@ rc-resize-observer "^1.3.1" rc-util "^5.38.0" +"@reactflow/background@11.3.14": + version "11.3.14" + resolved "https://registry.yarnpkg.com/@reactflow/background/-/background-11.3.14.tgz#778ca30174f3de77fc321459ab3789e66e71a699" + integrity sha512-Gewd7blEVT5Lh6jqrvOgd4G6Qk17eGKQfsDXgyRSqM+CTwDqRldG2LsWN4sNeno6sbqVIC2fZ+rAUBFA9ZEUDA== + dependencies: + "@reactflow/core" "11.11.4" + classcat "^5.0.3" + zustand "^4.4.1" + +"@reactflow/controls@11.2.14": + version "11.2.14" + resolved "https://registry.yarnpkg.com/@reactflow/controls/-/controls-11.2.14.tgz#508ed2c40d23341b3b0919dd11e76fd49cf850c7" + integrity sha512-MiJp5VldFD7FrqaBNIrQ85dxChrG6ivuZ+dcFhPQUwOK3HfYgX2RHdBua+gx+40p5Vw5It3dVNp/my4Z3jF0dw== + dependencies: + "@reactflow/core" "11.11.4" + classcat "^5.0.3" + zustand "^4.4.1" + +"@reactflow/core@11.11.4": + version "11.11.4" + resolved "https://registry.yarnpkg.com/@reactflow/core/-/core-11.11.4.tgz#89bd86d1862aa1416f3f49926cede7e8c2aab6a7" + integrity sha512-H4vODklsjAq3AMq6Np4LE12i1I4Ta9PrDHuBR9GmL8uzTt2l2jh4CiQbEMpvMDcp7xi4be0hgXj+Ysodde/i7Q== + dependencies: + "@types/d3" "^7.4.0" + "@types/d3-drag" "^3.0.1" + "@types/d3-selection" "^3.0.3" + "@types/d3-zoom" "^3.0.1" + classcat "^5.0.3" + d3-drag "^3.0.0" + d3-selection "^3.0.0" + d3-zoom "^3.0.0" + zustand "^4.4.1" + +"@reactflow/minimap@11.7.14": + version "11.7.14" + resolved "https://registry.yarnpkg.com/@reactflow/minimap/-/minimap-11.7.14.tgz#298d7a63cb1da06b2518c99744f716560c88ca73" + integrity sha512-mpwLKKrEAofgFJdkhwR5UQ1JYWlcAAL/ZU/bctBkuNTT1yqV+y0buoNVImsRehVYhJwffSWeSHaBR5/GJjlCSQ== + dependencies: + "@reactflow/core" "11.11.4" + "@types/d3-selection" "^3.0.3" + "@types/d3-zoom" "^3.0.1" + classcat "^5.0.3" + d3-selection "^3.0.0" + d3-zoom "^3.0.0" + zustand "^4.4.1" + +"@reactflow/node-resizer@2.2.14": + version "2.2.14" + resolved "https://registry.yarnpkg.com/@reactflow/node-resizer/-/node-resizer-2.2.14.tgz#1810c0ce51aeb936f179466a6660d1e02c7a77a8" + integrity sha512-fwqnks83jUlYr6OHcdFEedumWKChTHRGw/kbCxj0oqBd+ekfs+SIp4ddyNU0pdx96JIm5iNFS0oNrmEiJbbSaA== + dependencies: + "@reactflow/core" "11.11.4" + classcat "^5.0.4" + d3-drag "^3.0.0" + d3-selection "^3.0.0" + zustand "^4.4.1" + +"@reactflow/node-toolbar@1.3.14": + version "1.3.14" + resolved "https://registry.yarnpkg.com/@reactflow/node-toolbar/-/node-toolbar-1.3.14.tgz#c6ffc76f82acacdce654f2160dc9852162d6e7c9" + integrity sha512-rbynXQnH/xFNu4P9H+hVqlEUafDCkEoCy0Dg9mG22Sg+rY/0ck6KkrAQrYrTgXusd+cEJOMK0uOOFCK2/5rSGQ== + dependencies: + "@reactflow/core" "11.11.4" + classcat "^5.0.3" + zustand "^4.4.1" + "@servicebell/widget@^0.1.6": version "0.1.6" resolved "https://registry.yarnpkg.com/@servicebell/widget/-/widget-0.1.6.tgz#04672a7e7b14ff7025ec83fd740373345c359d74" @@ -3237,6 +3303,216 @@ dependencies: "@types/node" "*" +"@types/d3-array@*": + version "3.2.2" + resolved "https://registry.yarnpkg.com/@types/d3-array/-/d3-array-3.2.2.tgz#e02151464d02d4a1b44646d0fcdb93faf88fde8c" + integrity sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw== + +"@types/d3-axis@*": + version "3.0.6" + resolved "https://registry.yarnpkg.com/@types/d3-axis/-/d3-axis-3.0.6.tgz#e760e5765b8188b1defa32bc8bb6062f81e4c795" + integrity sha512-pYeijfZuBd87T0hGn0FO1vQ/cgLk6E1ALJjfkC0oJ8cbwkZl3TpgS8bVBLZN+2jjGgg38epgxb2zmoGtSfvgMw== + dependencies: + 
"@types/d3-selection" "*" + +"@types/d3-brush@*": + version "3.0.6" + resolved "https://registry.yarnpkg.com/@types/d3-brush/-/d3-brush-3.0.6.tgz#c2f4362b045d472e1b186cdbec329ba52bdaee6c" + integrity sha512-nH60IZNNxEcrh6L1ZSMNA28rj27ut/2ZmI3r96Zd+1jrZD++zD3LsMIjWlvg4AYrHn/Pqz4CF3veCxGjtbqt7A== + dependencies: + "@types/d3-selection" "*" + +"@types/d3-chord@*": + version "3.0.6" + resolved "https://registry.yarnpkg.com/@types/d3-chord/-/d3-chord-3.0.6.tgz#1706ca40cf7ea59a0add8f4456efff8f8775793d" + integrity sha512-LFYWWd8nwfwEmTZG9PfQxd17HbNPksHBiJHaKuY1XeqscXacsS2tyoo6OdRsjf+NQYeB6XrNL3a25E3gH69lcg== + +"@types/d3-color@*": + version "3.1.3" + resolved "https://registry.yarnpkg.com/@types/d3-color/-/d3-color-3.1.3.tgz#368c961a18de721da8200e80bf3943fb53136af2" + integrity sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A== + +"@types/d3-contour@*": + version "3.0.6" + resolved "https://registry.yarnpkg.com/@types/d3-contour/-/d3-contour-3.0.6.tgz#9ada3fa9c4d00e3a5093fed0356c7ab929604231" + integrity sha512-BjzLgXGnCWjUSYGfH1cpdo41/hgdWETu4YxpezoztawmqsvCeep+8QGfiY6YbDvfgHz/DkjeIkkZVJavB4a3rg== + dependencies: + "@types/d3-array" "*" + "@types/geojson" "*" + +"@types/d3-delaunay@*": + version "6.0.4" + resolved "https://registry.yarnpkg.com/@types/d3-delaunay/-/d3-delaunay-6.0.4.tgz#185c1a80cc807fdda2a3fe960f7c11c4a27952e1" + integrity sha512-ZMaSKu4THYCU6sV64Lhg6qjf1orxBthaC161plr5KuPHo3CNm8DTHiLw/5Eq2b6TsNP0W0iJrUOFscY6Q450Hw== + +"@types/d3-dispatch@*": + version "3.0.7" + resolved "https://registry.yarnpkg.com/@types/d3-dispatch/-/d3-dispatch-3.0.7.tgz#ef004d8a128046cfce434d17182f834e44ef95b2" + integrity sha512-5o9OIAdKkhN1QItV2oqaE5KMIiXAvDWBDPrD85e58Qlz1c1kI/J0NcqbEG88CoTwJrYe7ntUCVfeUl2UJKbWgA== + +"@types/d3-drag@*", "@types/d3-drag@^3.0.1": + version "3.0.7" + resolved "https://registry.yarnpkg.com/@types/d3-drag/-/d3-drag-3.0.7.tgz#b13aba8b2442b4068c9a9e6d1d82f8bcea77fc02" + integrity sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ== + dependencies: + "@types/d3-selection" "*" + +"@types/d3-dsv@*": + version "3.0.7" + resolved "https://registry.yarnpkg.com/@types/d3-dsv/-/d3-dsv-3.0.7.tgz#0a351f996dc99b37f4fa58b492c2d1c04e3dac17" + integrity sha512-n6QBF9/+XASqcKK6waudgL0pf/S5XHPPI8APyMLLUHd8NqouBGLsU8MgtO7NINGtPBtk9Kko/W4ea0oAspwh9g== + +"@types/d3-ease@*": + version "3.0.2" + resolved "https://registry.yarnpkg.com/@types/d3-ease/-/d3-ease-3.0.2.tgz#e28db1bfbfa617076f7770dd1d9a48eaa3b6c51b" + integrity sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA== + +"@types/d3-fetch@*": + version "3.0.7" + resolved "https://registry.yarnpkg.com/@types/d3-fetch/-/d3-fetch-3.0.7.tgz#c04a2b4f23181aa376f30af0283dbc7b3b569980" + integrity sha512-fTAfNmxSb9SOWNB9IoG5c8Hg6R+AzUHDRlsXsDZsNp6sxAEOP0tkP3gKkNSO/qmHPoBFTxNrjDprVHDQDvo5aA== + dependencies: + "@types/d3-dsv" "*" + +"@types/d3-force@*": + version "3.0.10" + resolved "https://registry.yarnpkg.com/@types/d3-force/-/d3-force-3.0.10.tgz#6dc8fc6e1f35704f3b057090beeeb7ac674bff1a" + integrity sha512-ZYeSaCF3p73RdOKcjj+swRlZfnYpK1EbaDiYICEEp5Q6sUiqFaFQ9qgoshp5CzIyyb/yD09kD9o2zEltCexlgw== + +"@types/d3-format@*": + version "3.0.4" + resolved "https://registry.yarnpkg.com/@types/d3-format/-/d3-format-3.0.4.tgz#b1e4465644ddb3fdf3a263febb240a6cd616de90" + integrity sha512-fALi2aI6shfg7vM5KiR1wNJnZ7r6UuggVqtDA+xiEdPZQwy/trcQaHnwShLuLdta2rTymCNpxYTiMZX/e09F4g== + +"@types/d3-geo@*": + version "3.1.0" + resolved 
"https://registry.yarnpkg.com/@types/d3-geo/-/d3-geo-3.1.0.tgz#b9e56a079449174f0a2c8684a9a4df3f60522440" + integrity sha512-856sckF0oP/diXtS4jNsiQw/UuK5fQG8l/a9VVLeSouf1/PPbBE1i1W852zVwKwYCBkFJJB7nCFTbk6UMEXBOQ== + dependencies: + "@types/geojson" "*" + +"@types/d3-hierarchy@*": + version "3.1.7" + resolved "https://registry.yarnpkg.com/@types/d3-hierarchy/-/d3-hierarchy-3.1.7.tgz#6023fb3b2d463229f2d680f9ac4b47466f71f17b" + integrity sha512-tJFtNoYBtRtkNysX1Xq4sxtjK8YgoWUNpIiUee0/jHGRwqvzYxkq0hGVbbOGSz+JgFxxRu4K8nb3YpG3CMARtg== + +"@types/d3-interpolate@*": + version "3.0.4" + resolved "https://registry.yarnpkg.com/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz#412b90e84870285f2ff8a846c6eb60344f12a41c" + integrity sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA== + dependencies: + "@types/d3-color" "*" + +"@types/d3-path@*": + version "3.1.1" + resolved "https://registry.yarnpkg.com/@types/d3-path/-/d3-path-3.1.1.tgz#f632b380c3aca1dba8e34aa049bcd6a4af23df8a" + integrity sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg== + +"@types/d3-polygon@*": + version "3.0.2" + resolved "https://registry.yarnpkg.com/@types/d3-polygon/-/d3-polygon-3.0.2.tgz#dfae54a6d35d19e76ac9565bcb32a8e54693189c" + integrity sha512-ZuWOtMaHCkN9xoeEMr1ubW2nGWsp4nIql+OPQRstu4ypeZ+zk3YKqQT0CXVe/PYqrKpZAi+J9mTs05TKwjXSRA== + +"@types/d3-quadtree@*": + version "3.0.6" + resolved "https://registry.yarnpkg.com/@types/d3-quadtree/-/d3-quadtree-3.0.6.tgz#d4740b0fe35b1c58b66e1488f4e7ed02952f570f" + integrity sha512-oUzyO1/Zm6rsxKRHA1vH0NEDG58HrT5icx/azi9MF1TWdtttWl0UIUsjEQBBh+SIkrpd21ZjEv7ptxWys1ncsg== + +"@types/d3-random@*": + version "3.0.3" + resolved "https://registry.yarnpkg.com/@types/d3-random/-/d3-random-3.0.3.tgz#ed995c71ecb15e0cd31e22d9d5d23942e3300cfb" + integrity sha512-Imagg1vJ3y76Y2ea0871wpabqp613+8/r0mCLEBfdtqC7xMSfj9idOnmBYyMoULfHePJyxMAw3nWhJxzc+LFwQ== + +"@types/d3-scale-chromatic@*": + version "3.1.0" + resolved "https://registry.yarnpkg.com/@types/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz#dc6d4f9a98376f18ea50bad6c39537f1b5463c39" + integrity sha512-iWMJgwkK7yTRmWqRB5plb1kadXyQ5Sj8V/zYlFGMUBbIPKQScw+Dku9cAAMgJG+z5GYDoMjWGLVOvjghDEFnKQ== + +"@types/d3-scale@*": + version "4.0.9" + resolved "https://registry.yarnpkg.com/@types/d3-scale/-/d3-scale-4.0.9.tgz#57a2f707242e6fe1de81ad7bfcccaaf606179afb" + integrity sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw== + dependencies: + "@types/d3-time" "*" + +"@types/d3-selection@*", "@types/d3-selection@^3.0.3": + version "3.0.11" + resolved "https://registry.yarnpkg.com/@types/d3-selection/-/d3-selection-3.0.11.tgz#bd7a45fc0a8c3167a631675e61bc2ca2b058d4a3" + integrity sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w== + +"@types/d3-shape@*": + version "3.1.7" + resolved "https://registry.yarnpkg.com/@types/d3-shape/-/d3-shape-3.1.7.tgz#2b7b423dc2dfe69c8c93596e673e37443348c555" + integrity sha512-VLvUQ33C+3J+8p+Daf+nYSOsjB4GXp19/S/aGo60m9h1v6XaxjiT82lKVWJCfzhtuZ3yD7i/TPeC/fuKLLOSmg== + dependencies: + "@types/d3-path" "*" + +"@types/d3-time-format@*": + version "4.0.3" + resolved "https://registry.yarnpkg.com/@types/d3-time-format/-/d3-time-format-4.0.3.tgz#d6bc1e6b6a7db69cccfbbdd4c34b70632d9e9db2" + integrity sha512-5xg9rC+wWL8kdDj153qZcsJ0FWiFt0J5RB6LYUNZjwSnesfblqrI/bJ1wBdJ8OQfncgbJG5+2F+qfqnqyzYxyg== + +"@types/d3-time@*": + version "3.0.4" + resolved 
"https://registry.yarnpkg.com/@types/d3-time/-/d3-time-3.0.4.tgz#8472feecd639691450dd8000eb33edd444e1323f" + integrity sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g== + +"@types/d3-timer@*": + version "3.0.2" + resolved "https://registry.yarnpkg.com/@types/d3-timer/-/d3-timer-3.0.2.tgz#70bbda77dc23aa727413e22e214afa3f0e852f70" + integrity sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw== + +"@types/d3-transition@*": + version "3.0.9" + resolved "https://registry.yarnpkg.com/@types/d3-transition/-/d3-transition-3.0.9.tgz#1136bc57e9ddb3c390dccc9b5ff3b7d2b8d94706" + integrity sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg== + dependencies: + "@types/d3-selection" "*" + +"@types/d3-zoom@*", "@types/d3-zoom@^3.0.1": + version "3.0.8" + resolved "https://registry.yarnpkg.com/@types/d3-zoom/-/d3-zoom-3.0.8.tgz#dccb32d1c56b1e1c6e0f1180d994896f038bc40b" + integrity sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw== + dependencies: + "@types/d3-interpolate" "*" + "@types/d3-selection" "*" + +"@types/d3@^7.4.0": + version "7.4.3" + resolved "https://registry.yarnpkg.com/@types/d3/-/d3-7.4.3.tgz#d4550a85d08f4978faf0a4c36b848c61eaac07e2" + integrity sha512-lZXZ9ckh5R8uiFVt8ogUNf+pIrK4EsWrx2Np75WvF/eTpJ0FMHNhjXk8CKEx/+gpHbNQyJWehbFaTvqmHWB3ww== + dependencies: + "@types/d3-array" "*" + "@types/d3-axis" "*" + "@types/d3-brush" "*" + "@types/d3-chord" "*" + "@types/d3-color" "*" + "@types/d3-contour" "*" + "@types/d3-delaunay" "*" + "@types/d3-dispatch" "*" + "@types/d3-drag" "*" + "@types/d3-dsv" "*" + "@types/d3-ease" "*" + "@types/d3-fetch" "*" + "@types/d3-force" "*" + "@types/d3-format" "*" + "@types/d3-geo" "*" + "@types/d3-hierarchy" "*" + "@types/d3-interpolate" "*" + "@types/d3-path" "*" + "@types/d3-polygon" "*" + "@types/d3-quadtree" "*" + "@types/d3-random" "*" + "@types/d3-scale" "*" + "@types/d3-scale-chromatic" "*" + "@types/d3-selection" "*" + "@types/d3-shape" "*" + "@types/d3-time" "*" + "@types/d3-time-format" "*" + "@types/d3-timer" "*" + "@types/d3-transition" "*" + "@types/d3-zoom" "*" + "@types/debug@^4.0.0": version "4.1.12" resolved "https://registry.yarnpkg.com/@types/debug/-/debug-4.1.12.tgz#a155f21690871953410df4b6b6f53187f0500917" @@ -3276,6 +3552,11 @@ "@types/qs" "*" "@types/serve-static" "*" +"@types/geojson@*": + version "7946.0.16" + resolved "https://registry.yarnpkg.com/@types/geojson/-/geojson-7946.0.16.tgz#8ebe53d69efada7044454e3305c19017d97ced2a" + integrity sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg== + "@types/hast@^2.0.0": version "2.3.10" resolved "https://registry.yarnpkg.com/@types/hast/-/hast-2.3.10.tgz#5c9d9e0b304bbb8879b857225c5ebab2d81d7643" @@ -4433,6 +4714,11 @@ ci-info@^3.2.0: resolved "https://registry.yarnpkg.com/ci-info/-/ci-info-3.9.0.tgz#4279a62028a7b1f262f3473fc9605f5e218c59b4" integrity sha512-NIxF55hv4nSqQswkAeiOi1r83xy8JldOFDTWiug55KBu9Jnblncd2U6ViHmYgHf01TPZS77NJBhBMKdWj9HQMQ== +classcat@^5.0.3, classcat@^5.0.4: + version "5.0.5" + resolved "https://registry.yarnpkg.com/classcat/-/classcat-5.0.5.tgz#8c209f359a93ac302404a10161b501eba9c09c77" + integrity sha512-JhZUT7JFcQy/EzW605k/ktHtncoo9vnyW/2GspNYwFlN1C/WmjuV/xtS04e9SOkL2sTdw0VAZ2UGCcQ9lR6p6w== + classnames@2.x, classnames@^2.2.1, classnames@^2.2.3, classnames@^2.2.5, classnames@^2.2.6, classnames@^2.3.1, classnames@^2.3.2, classnames@^2.5.1: version "2.5.1" 
resolved "https://registry.yarnpkg.com/classnames/-/classnames-2.5.1.tgz#ba774c614be0f016da105c858e7159eae8e7687b" @@ -4951,6 +5237,68 @@ csstype@^3.0.2, csstype@^3.1.3: resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81" integrity sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw== +"d3-color@1 - 3": + version "3.1.0" + resolved "https://registry.yarnpkg.com/d3-color/-/d3-color-3.1.0.tgz#395b2833dfac71507f12ac2f7af23bf819de24e2" + integrity sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA== + +"d3-dispatch@1 - 3": + version "3.0.1" + resolved "https://registry.yarnpkg.com/d3-dispatch/-/d3-dispatch-3.0.1.tgz#5fc75284e9c2375c36c839411a0cf550cbfc4d5e" + integrity sha512-rzUyPU/S7rwUflMyLc1ETDeBj0NRuHKKAcvukozwhshr6g6c5d8zh4c2gQjY2bZ0dXeGLWc1PF174P2tVvKhfg== + +"d3-drag@2 - 3", d3-drag@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/d3-drag/-/d3-drag-3.0.0.tgz#994aae9cd23c719f53b5e10e3a0a6108c69607ba" + integrity sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg== + dependencies: + d3-dispatch "1 - 3" + d3-selection "3" + +"d3-ease@1 - 3": + version "3.0.1" + resolved "https://registry.yarnpkg.com/d3-ease/-/d3-ease-3.0.1.tgz#9658ac38a2140d59d346160f1f6c30fda0bd12f4" + integrity sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w== + +"d3-interpolate@1 - 3": + version "3.0.1" + resolved "https://registry.yarnpkg.com/d3-interpolate/-/d3-interpolate-3.0.1.tgz#3c47aa5b32c5b3dfb56ef3fd4342078a632b400d" + integrity sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g== + dependencies: + d3-color "1 - 3" + +"d3-selection@2 - 3", d3-selection@3, d3-selection@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/d3-selection/-/d3-selection-3.0.0.tgz#c25338207efa72cc5b9bd1458a1a41901f1e1b31" + integrity sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ== + +"d3-timer@1 - 3": + version "3.0.1" + resolved "https://registry.yarnpkg.com/d3-timer/-/d3-timer-3.0.1.tgz#6284d2a2708285b1abb7e201eda4380af35e63b0" + integrity sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA== + +"d3-transition@2 - 3": + version "3.0.1" + resolved "https://registry.yarnpkg.com/d3-transition/-/d3-transition-3.0.1.tgz#6869fdde1448868077fdd5989200cb61b2a1645f" + integrity sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w== + dependencies: + d3-color "1 - 3" + d3-dispatch "1 - 3" + d3-ease "1 - 3" + d3-interpolate "1 - 3" + d3-timer "1 - 3" + +d3-zoom@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/d3-zoom/-/d3-zoom-3.0.0.tgz#d13f4165c73217ffeaa54295cd6969b3e7aee8f3" + integrity sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw== + dependencies: + d3-dispatch "1 - 3" + d3-drag "2 - 3" + d3-interpolate "1 - 3" + d3-selection "2 - 3" + d3-transition "2 - 3" + data-uri-to-buffer@^6.0.2: version "6.0.2" resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz#8a58bb67384b261a38ef18bea1810cb01badd28b" @@ -9432,6 +9780,18 @@ react@^18.2.0: dependencies: loose-envify "^1.1.0" +reactflow@^11.11.4: + version "11.11.4" + resolved 
"https://registry.yarnpkg.com/reactflow/-/reactflow-11.11.4.tgz#e3593e313420542caed81aecbd73fb9bc6576653" + integrity sha512-70FOtJkUWH3BAOsN+LU9lCrKoKbtOPnz2uq0CV2PLdNSwxTXOhCbsZr50GmZ+Rtw3jx8Uv7/vBFtCGixLfd4Og== + dependencies: + "@reactflow/background" "11.3.14" + "@reactflow/controls" "11.2.14" + "@reactflow/core" "11.11.4" + "@reactflow/minimap" "11.7.14" + "@reactflow/node-resizer" "2.2.14" + "@reactflow/node-toolbar" "1.3.14" + readable-stream@^2.0.1: version "2.3.8" resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.8.tgz#91125e8042bba1b9887f49345f6277027ce8be9b" diff --git a/docs/learn-datahub/discovery/advanced-search.md b/docs/learn-datahub/discovery/advanced-search.md new file mode 100644 index 00000000000000..1fa96d01c32dff --- /dev/null +++ b/docs/learn-datahub/discovery/advanced-search.md @@ -0,0 +1,592 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; +import { SearchExercise, HandsOnExercise, InteractiveDemo } from '@site/src/components/TutorialExercise'; + +# Advanced Search Techniques (15 minutes) + +:::info Tutorial Progress +**Step 1 of 3** | **15 minutes** | [Overview](overview.md) → **Advanced Search** → [Dataset Profiles](dataset-profiles.md) → [Collaborative Discovery](collaborative-discovery.md) +::: + +Master DataHub's powerful search capabilities to find exactly what you need, when you need it. Transform from basic keyword searching to surgical data discovery. + +## Discovery Challenge #1: The New Analyst + +**Your Mission**: You're new at RetailCorp and need to find customer segmentation data for a marketing campaign. You don't know exact table names or locations. + +**What You'll Learn**: Strategic search approaches, advanced operators, and smart filtering techniques. + +## Search Strategy Framework + +Effective data discovery follows a systematic approach: + +**Professional Search Strategy:** + +1. **Start with Business Terms**: Use domain-specific language and common business concepts +2. **Apply Smart Filters**: Narrow scope using platform, domain, and entity type filters +3. **Refine with Operators**: Use advanced search operators for precise matching +4. **Validate Results**: Review relevance, quality, and completeness of results +5. **Save for Reuse**: Create saved searches for recurring discovery needs + +**Search Progression Example:** + +``` +Initial Query: "customer data" +↓ Add Filters: Platform=Hive, Type=Dataset +↓ Use Operators: "customer" AND ("profile" OR "behavior") +↓ Validate: Check schema, lineage, and documentation +↓ Save: "Customer Analytics Datasets" search +``` + +Let's apply this framework to solve our challenge! + +## Level 1: Strategic Keyword Search + +Start with business concepts, not technical terms: + + + + +**Try these searches in your DataHub instance:** + + + +**What You'll Find**: Here are examples of the datasets your search would discover: + +
+ + + +
+ +
+ +:::tip Pro Tip +Business users often name things differently than technical teams. Try both perspectives! +::: + +
+ + +**When business terms don't work, try technical approaches:** + +
+ +**Search 1: Database Patterns** + +``` +customer_segment user_cohort cust_analytics +``` + +**Search 2: Common Prefixes** + +``` +dim_customer fact_customer customer_dim +``` + +**Search 3: Analytics Patterns** + +``` +customer_ltv customer_score customer_tier +``` + +
+ +
+
+ +### 🎮 Interactive Exercise: Your First Search + +
+ +**Try this now in DataHub:** + +1. **Open DataHub** at http://localhost:9002 +2. **Search for**: `customer segmentation` +3. **Count the results**: How many datasets appear? +4. **Note the variety**: Different platforms, naming conventions, descriptions + +**Reflection Questions:** + +- Which results look most relevant for marketing analysis? +- What patterns do you notice in the naming conventions? +- Are there results you didn't expect? + +
+ +## Level 2: Smart Filtering + +Raw search results can be overwhelming. Use filters to focus on what matters: + +### Platform Filtering + + + + +**Follow along in DataHub:** + +1. **Search**: `customer` +2. **Apply Platform Filter**: + - Click "Filters" in the left sidebar + - Select "Platform" + - Choose "PostgreSQL" (for operational data) + - OR choose "Snowflake" (for analytics data) + +**Notice how results change!** + + + + +**Choose filters based on your use case:** + +**For Marketing Analysis:** + +- Snowflake, BigQuery (analytics platforms) +- dbt (transformed data) +- ❌ MySQL, PostgreSQL (raw operational data) + +**For Operational Insights:** + +- PostgreSQL, MySQL (live operational data) +- Kafka (real-time streams) +- ❌ S3 (archived data) + +**For Data Engineering:** + +- All platforms (need complete picture) +- Include pipelines and jobs +- Show lineage connections + + + + +### Entity Type Filtering + +
+ +**Filter by what you're looking for:** + +| Need | Filter Selection | Why | +| --------------------- | --------------------- | -------------------------- | +| **Raw Data** | Datasets only | Focus on tables and views | +| **Business Insights** | Dashboards + Charts | See existing analysis | +| **Data Processing** | Data Jobs + Pipelines | Understand transformations | +| **Complete Picture** | All entity types | Full ecosystem view | + +
+ +### 🎮 Interactive Exercise: Smart Filtering + +
+ +**Challenge**: Find customer segmentation data suitable for marketing analysis + +**Your Turn:** + +1. **Search**: `customer segment` +2. **Apply Filters**: + - Entity Type: "Datasets" + - Platform: "Snowflake" OR "BigQuery" + - (If available) Domain: "Marketing" or "Analytics" +3. **Compare**: How many results now vs. before filtering? + +**Success Criteria**: + +- Results reduced to manageable number (< 20) +- Results are more relevant to marketing use case +- You can see clear candidates for your analysis + +
+ +## Level 3: Advanced Search Operators + +Unlock DataHub's power with search operators: + +### Boolean Operators + + + + +
+ +**AND - All terms must match:** + +``` +customer AND segmentation AND marketing +``` + +_Finds datasets containing all three terms_ + +**OR - Any term can match:** + +``` +customer OR user OR client +``` + +_Finds datasets with any customer-related term_ + +**Combined Logic:** + +``` +(customer OR user) AND (segment OR cohort OR tier) +``` + +_Flexible matching for customer segmentation concepts_ + +
+ +
+ + +
+ +**NOT - Exclude unwanted results:** + +``` +customer NOT test +``` + +_Customer data excluding test tables_ + +**Exclude Multiple Terms:** + +``` +customer NOT (test OR temp OR backup) +``` + +_Clean production customer data only_ + +**Exclude Platforms:** + +``` +customer NOT platform:mysql +``` + +_Customer data from all platforms except MySQL_ + +
+ +
+
+ +### Field-Specific Search + +Target specific metadata fields for precision: + +
+ +**Syntax**: `field:value` or `field:"exact phrase"` + +| Field | Example | Use Case | +| -------------- | --------------------------------------- | ------------------------- | +| `name:` | `name:customer*` | Search table/column names | +| `description:` | `description:"customer lifetime value"` | Search documentation | +| `platform:` | `platform:snowflake` | Specific data platform | +| `tags:` | `tags:pii` | Find tagged datasets | +| `owners:` | `owners:john.doe` | Find owned datasets | + +
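Field-targeted queries are not limited to the search bar. If you need to script a recurring discovery check, the same query string can be sent to DataHub's GraphQL endpoint. The sketch below assumes the `searchAcrossEntities` query exposed at `/api/graphql` in recent DataHub releases and a personal access token in `DATAHUB_TOKEN`; verify the exact schema in your instance's GraphiQL explorer before relying on it.

```javascript
// Sketch: run a field-targeted DataHub search from a script (Node 18+, ES module).
// Assumes the searchAcrossEntities GraphQL query and a personal access token.
const searchQuery = `
  query search($input: SearchAcrossEntitiesInput!) {
    searchAcrossEntities(input: $input) {
      total
      searchResults {
        entity { urn type }
      }
    }
  }
`;

const response = await fetch("http://localhost:9002/api/graphql", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.DATAHUB_TOKEN}`,
  },
  body: JSON.stringify({
    query: searchQuery,
    variables: {
      input: {
        types: ["DATASET"],
        query: 'name:customer* AND description:"customer lifetime value"',
        start: 0,
        count: 10,
      },
    },
  }),
});

const { data } = await response.json();
console.log(`Found ${data.searchAcrossEntities.total} matching datasets`);
```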
+ +### Wildcard and Pattern Matching + + + + +
+ +**Prefix Matching:** + +``` +customer* +``` + +_Matches: customer, customers, customer_data, customer_analytics_ + +**Suffix Matching:** + +``` +*_customer +``` + +_Matches: dim_customer, fact_customer, raw_customer_ + +**Complex Patterns:** + +``` +cust*_seg* +``` + +_Matches: customer_segments, cust_data_segmentation_ + +
+ +
+ + +
+ +**Exact Phrase Matching:** + +``` +"customer lifetime value" +``` + +_Must contain this exact phrase_ + +**Combine with Operators:** + +``` +"customer segmentation" OR "user cohorts" +``` + +_Either exact phrase_ + +**Field + Phrase:** + +``` +description:"high value customers" +``` + +_Exact phrase in description field_ + +
+ +
+
+ +### 🎮 Interactive Exercise: Operator Mastery + +
+ +**Progressive Challenge**: Build increasingly sophisticated searches + +**Level 1 - Basic Operators:** + +``` +customer AND segment +``` + +**Level 2 - Add Exclusions:** + +``` +customer AND segment NOT test +``` + +**Level 3 - Field Targeting:** + +``` +name:customer* AND description:segment* +``` + +**Level 4 - Complex Logic:** + +``` +(name:customer* OR name:user*) AND (description:segment* OR description:cohort*) AND platform:snowflake +``` + +**Your Mission**: Try each level and observe how results change. Which gives you the most relevant results for marketing analysis? + +
+ +## Level 4: Saved Searches & Efficiency + +Don't repeat work - save your successful searches: + +### Creating Saved Searches + +
+ +**When you find a great search:** + +1. **Perfect your search** using the techniques above +2. **Click the bookmark icon** next to the search bar +3. **Name it descriptively**: "Customer Segmentation - Marketing Ready" +4. **Add description**: "Analytics-ready customer segment data for marketing campaigns" +5. **Set sharing**: Team-wide or personal + +**Pro Naming Convention:** + +- `[Use Case] - [Data Type] - [Quality Level]` +- Examples: + - "Marketing - Customer Segments - Production" + - "Analysis - User Behavior - High Quality" + - "Reporting - Sales Metrics - Daily Updated" + +
+ +### Search Templates for Common Scenarios + + + + +``` +# High-quality customer data for campaigns +(customer OR user) AND (segment OR cohort OR tier) +AND platform:(snowflake OR bigquery) +NOT (test OR temp OR backup) +``` + + + + +``` +# Live operational customer data +name:customer* AND platform:(postgres OR mysql) +AND hasOwners:true AND updatedInLastWeek:true +``` + + + + +``` +# Processed analytical datasets +description:(analytics OR analysis OR processed) +AND (customer OR user) AND NOT raw +AND platform:(snowflake OR bigquery OR dbt) +``` + + + + +## Success Checkpoint + +
+ +**You've mastered advanced search when you can:** + +- **Speed Test**: Find relevant customer segmentation data in under 2 minutes +- **Precision Test**: Get < 10 highly relevant results using operators +- **Efficiency Test**: Create and use a saved search for future use +- **Strategy Test**: Choose the right approach for different discovery scenarios + +**Validation Exercise:** +Try to solve this in 90 seconds: _"Find production-ready customer analytics data suitable for a marketing campaign, excluding any test or temporary tables."_ + +**Expected Result**: 1-5 highly relevant datasets from analytics platforms + +
+ +## Pro Tips & Shortcuts + +
+ +**🚀 Speed Techniques:** + +- Use browser bookmarks for common DataHub searches +- Set up browser shortcuts: `dh customer` → DataHub customer search +- Learn keyboard shortcuts: `Ctrl+K` for quick search + +**🎯 Accuracy Boosters:** + +- Always check the "Updated" date - stale data wastes time +- Look for owner information - contactable owners = reliable data +- Check description quality - well-documented data is usually better maintained + +**🤝 Team Efficiency:** + +- Share successful search patterns with teammates +- Create team-wide saved searches for common use cases +- Document search strategies in team wikis + +
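For the browser-shortcut tip above, most browsers let you register a custom search keyword (for example `dh`) that expands into a DataHub search URL. The template below assumes the default quickstart address and the `query` parameter used by the DataHub UI — confirm the parameter name in your own address bar after running a search:

```
http://localhost:9002/search?query=%s
```

Typing `dh customer segmentation` in the address bar then jumps straight to the results page.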
+ +## Troubleshooting Common Issues + + + + +**Problem**: Search returns hundreds of results + +**Solutions:** + +1. **Add more specific terms**: `customer segmentation marketing` +2. **Use field targeting**: `name:customer* AND description:segment*` +3. **Apply platform filters**: Focus on relevant data platforms +4. **Exclude noise**: `NOT (test OR temp OR backup OR old)` + + + + +**Problem**: Search returns nothing + +**Solutions:** + +1. **Check spelling**: Try variations and wildcards +2. **Broaden terms**: Use OR operators for synonyms +3. **Remove filters**: Start broad, then narrow down +4. **Try different fields**: Maybe it's in descriptions, not names + + + + +**Problem**: Results aren't relevant to your use case + +**Solutions:** + +1. **Add context terms**: Include your domain/use case +2. **Use exclusions**: Remove irrelevant categories +3. **Filter by platform**: Match your analysis environment +4. **Check entity types**: Maybe you need dashboards, not datasets + + + + +## What You've Learned + +🎉 **Congratulations!** You've transformed from basic search to advanced discovery: + +- **Strategic Approach**: Business-first thinking with technical backup +- **Smart Filtering**: Platform and entity type filtering for relevance +- **Advanced Operators**: Boolean logic, field targeting, and wildcards +- **Efficiency Tools**: Saved searches and reusable patterns +- **Troubleshooting**: Common issues and systematic solutions + +--- + +**Next**: Now that you can find data efficiently, let's learn how to [understand and evaluate what you've found](dataset-profiles.md) → diff --git a/docs/learn-datahub/discovery/collaborative-discovery.md b/docs/learn-datahub/discovery/collaborative-discovery.md new file mode 100644 index 00000000000000..6ccf494a825e61 --- /dev/null +++ b/docs/learn-datahub/discovery/collaborative-discovery.md @@ -0,0 +1,543 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Collaborative Discovery (10 minutes) + +:::info Tutorial Progress +**Step 3 of 3** | ⏱️ **10 minutes** | [Overview](overview.md) → [Advanced Search](advanced-search.md) → [Dataset Profiles](dataset-profiles.md) → **Collaborative Discovery** +::: + +Transform DataHub from a solo tool into a team knowledge platform. Learn to document insights, ask questions, and build collective data intelligence that benefits everyone. + +## 🤝 Discovery Challenge #3: The Collaboration Champion + +**Your Mission**: You've discovered valuable insights about customer segmentation data and want to ensure future analysts can benefit from your knowledge. Make this dataset more discoverable and useful for your team. + +**What You'll Learn**: Documentation best practices, effective tagging strategies, and how to build a collaborative data culture. + +## The Collaboration Multiplier Effect + +Individual discoveries become team assets through effective collaboration: + +**Collaborative Discovery Workflow:** + +1. **Individual Discovery**: Find and explore datasets for specific needs +2. **Document Insights**: Add descriptions, business context, and usage notes +3. **Tag & Classify**: Apply consistent tags and domain classifications +4. **Share Knowledge**: Contribute to team understanding through documentation +5. **Team Benefits**: Enable others to discover and understand data faster +6. 
**Improved Discovery**: Create a self-reinforcing cycle of knowledge sharing + +**Collaboration Impact:** + +``` +Personal Discovery → Team Knowledge → Organizational Asset + ↓ ↓ ↓ +Find what you need Others find it too Enterprise catalog +Document findings Builds on your work Reduces redundancy +Tag for context Improves over time Accelerates innovation +``` + +**The Multiplier Effect**: Your 10 minutes of documentation saves hours for future users. + +## Level 1: Smart Documentation (4 minutes) + +Transform cryptic datasets into self-explanatory resources: + +### Documentation That Actually Helps + + + + +**❌ Typical (Unhelpful) Documentation:** + +``` +Table: customer_seg_v3 +Description: Customer segmentation data +``` + +**Helpful Documentation:** + +``` +Table: customer_seg_v3 +Description: Customer segmentation analysis for marketing campaigns + +Business Purpose: +- Identifies high-value customer segments for targeted marketing +- Updated weekly based on 90-day purchase behavior +- Used by Marketing, Sales, and Customer Success teams + +Key Insights: +- 'Premium' segment represents 15% of customers but 60% of revenue +- 'At Risk' segment requires immediate retention efforts +- Segmentation logic based on RFM analysis (Recency, Frequency, Monetary) + +Usage Notes: +- Use customer_id to join with other customer tables +- segment_score ranges from 1-100 (higher = more valuable) +- last_updated shows when each customer's segment was calculated +``` + + + + +**Use these templates for consistency:** + +
+ +**📊 Analytics Dataset Template:** + +``` +Business Purpose: [What business problem does this solve?] +Key Metrics: [What can you measure with this data?] +Refresh Schedule: [How often is this updated?] +Data Quality: [Known limitations or gotchas] +Common Use Cases: [How do teams typically use this?] +Related Datasets: [What other data works well with this?] +``` + +**🔄 Operational Dataset Template:** + +``` +System Source: [What application generates this data?] +Business Process: [What real-world process does this represent?] +Key Relationships: [How does this connect to other systems?] +SLA Information: [How fresh is this data expected to be?] +Access Patterns: [Who typically needs this data and why?] +``` + +
+ +
+
+ +### 🎮 Interactive Exercise: Documentation Makeover + +
+ +**Your Turn**: Find a poorly documented dataset and give it a makeover. + +**Step 1: Find a Dataset** + +- Search for datasets with minimal descriptions +- Look for technical names without business context +- Choose one that you understand or can research + +**Step 2: Research & Document** + +- What business problem does this solve? +- Who would use this data and why? +- What are the key columns and their meanings? +- Are there any gotchas or limitations? + +**Step 3: Write Helpful Documentation** +Use the templates above to create documentation that would help a new team member understand this dataset in 2 minutes. + +**Success Criteria:** + +- Business purpose is clear +- Key columns are explained +- Usage guidance is provided +- You'd be comfortable with a new hire using this dataset based on your documentation + +
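+
+If you maintain many datasets, the same documentation can be applied programmatically instead of one dataset at a time in the UI. Below is a minimal sketch using the acryl-datahub Python SDK; the dataset name and description are hypothetical stand-ins for whatever you documented in the exercise, and import paths may differ slightly by SDK version.
+
+```python
+# Sketch: set the editable description on one dataset via the DataHub Python SDK.
+from datahub.emitter.mce_builder import make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import EditableDatasetPropertiesClass
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+
+# Hypothetical dataset from the exercise - replace with one from your catalog.
+dataset_urn = make_dataset_urn(platform="postgres", name="public.customer_seg_v3", env="PROD")
+
+description = (
+    "Customer segmentation analysis for marketing campaigns. "
+    "Updated weekly from 90-day purchase behavior; join on customer_id; "
+    "segment_score ranges 1-100 (higher = more valuable)."
+)
+
+emitter.emit(
+    MetadataChangeProposalWrapper(
+        entityUrn=dataset_urn,
+        aspect=EditableDatasetPropertiesClass(description=description),
+    )
+)
+```
+
+The editable description lands in the same field the UI writes to, so teammates see one consistent description either way.
+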
+ +## Level 2: Strategic Tagging (3 minutes) + +Tags are the navigation system for your data catalog. Use them strategically: + +### Tagging Strategy Framework + +
+ +**🏷️ Tag Categories & Examples:** + +| Category | Purpose | Examples | +| -------------------- | -------------------- | ----------------------------------------------------------- | +| **Data Quality** | Signal reliability | `high-quality`, `needs-validation`, `production-ready` | +| **Business Domain** | Organize by function | `marketing`, `finance`, `operations`, `customer-success` | +| **Data Sensitivity** | Privacy & compliance | `pii`, `confidential`, `public`, `gdpr-relevant` | +| **Usage Pattern** | Guide consumption | `real-time`, `batch-processed`, `analytical`, `operational` | +| **Lifecycle Stage** | Indicate status | `active`, `deprecated`, `experimental`, `archived` | + +
+ +### Tagging Best Practices + + + + +**Establish team conventions:** + +
+ +**Good Tag Naming:** + +- Use lowercase with hyphens: `customer-analytics` +- Be specific: `daily-updated` not just `updated` +- Use standard terms: `pii` not `personal-info` +- Include context: `marketing-ready` not just `ready` + +**❌ Avoid These Patterns:** + +- Inconsistent casing: `Customer-Analytics` vs `customer_analytics` +- Vague terms: `good`, `important`, `useful` +- Personal preferences: `johns-favorite`, `team-alpha-data` +- Redundant info: `table-data` (everything in datasets is table data) + +
+ +
+ + +**Create logical tag relationships:** + +**Business Domain Tag Hierarchy:** + +**Marketing Domain:** + +- `marketing-campaigns` - Campaign performance and attribution data +- `marketing-analytics` - Customer behavior and conversion metrics +- `marketing-automation` - Lead scoring and nurturing workflows + +**Finance Domain:** + +- `finance-reporting` - Financial statements and regulatory reports +- `finance-forecasting` - Budget planning and revenue projections +- `finance-compliance` - Audit trails and regulatory compliance data + +**Operations Domain:** + +- `operations-monitoring` - System performance and infrastructure metrics +- `operations-logistics` - Supply chain and fulfillment data +- `operations-support` - Customer service and issue tracking + +**Tag Strategy Best Practices:** + +``` +Domain Level: Broad business area (marketing, finance, operations) + ↓ +Function Level: Specific business function within domain + ↓ +Use Case Level: Specific analytical or operational purpose +``` + +**Example Tag Application:** + +- Dataset: "Customer Campaign Performance Q4 2024" +- Tags: `marketing-campaigns`, `marketing-analytics`, `quarterly-reporting` +- Result: Easily discoverable by marketing team and analysts + +**Benefits:** + +- Easier filtering and discovery +- Consistent team usage +- Scalable organization + + +
+ +### 🎮 Interactive Exercise: Tag Like a Pro + +
+ +**Challenge**: Tag 3 different datasets using strategic tagging. + +**Dataset Types to Find:** + +1. **Customer data** (operational or analytical) +2. **Financial/sales data** (revenue, transactions, etc.) +3. **Product/inventory data** (catalog, usage, etc.) + +**For Each Dataset, Add Tags For:** + +- **Quality level**: How reliable is this data? +- **Business domain**: Which team owns/uses this? +- **Sensitivity**: Any privacy considerations? +- **Usage pattern**: How is this typically consumed? +- **Lifecycle stage**: What's the status of this dataset? + +**Example Tagging:** + +``` +Dataset: customer_segments_weekly +Tags: high-quality, marketing, pii, analytical, production-ready +``` + +**Validation**: Would a new team member understand the dataset's purpose and usage from your tags alone? + +
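+
+Once your team has agreed on conventions, tags can also be applied in bulk with the acryl-datahub Python SDK rather than clicked in one at a time. This is a hedged sketch; the dataset and tag names below are the illustrative ones from the example above.
+
+```python
+# Sketch: apply the example tags from the exercise to a dataset programmatically.
+from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+
+dataset_urn = make_dataset_urn("snowflake", "analytics.customer_segments_weekly", env="PROD")
+
+# Quality, domain, sensitivity, usage pattern, lifecycle - one tag per category.
+tag_names = ["high-quality", "marketing", "pii", "analytical", "production-ready"]
+
+tags_aspect = GlobalTagsClass(
+    tags=[TagAssociationClass(tag=make_tag_urn(name)) for name in tag_names]
+)
+
+# Note: emitting GlobalTags replaces the dataset's existing tag list, so fetch and
+# merge first if you need to preserve tags that were added through the UI.
+emitter.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=tags_aspect))
+```
+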
+ +## Level 3: Knowledge Sharing (3 minutes) + +Turn your discoveries into team assets: + +### Effective Knowledge Sharing Techniques + + + + +**Use DataHub's Q&A features strategically:** + +
+ +**🤔 Ask Good Questions:** + +- "What's the difference between customer_id and user_id in this table?" +- "How often is this data refreshed? I see conflicting information." +- "Are there known data quality issues with the email column?" +- "What's the business logic behind the customer_score calculation?" + +**💡 Provide Helpful Answers:** + +- Be specific and actionable +- Include examples when possible +- Reference related datasets or documentation +- Update your answer if information changes + +**📈 Question Patterns That Help Teams:** + +- Data quality clarifications +- Business logic explanations +- Usage recommendations +- Alternative dataset suggestions + +
+ +
+ + +**Guide future users with recommendations:** + +
+ +**💡 Recommendation Types:** + +**Alternative Datasets:** +"For real-time customer data, consider `customer_events_stream` instead of this daily batch table." + +**Usage Warnings:** +"This table has a 2-hour delay. For time-sensitive analysis, use `customer_realtime_view`." + +**Quality Notes:** +"Email column has ~15% null values. Use `email_verified` flag to filter for valid emails." + +**Best Practices:** +"Join on `customer_uuid` rather than `email` for better accuracy and privacy compliance." + +
+ +
+ + +**Build discovery networks:** + +
+ +**📌 Strategic Bookmarking:** + +- Bookmark datasets you use regularly +- Bookmark high-quality examples for reference +- Bookmark datasets relevant to your domain + +**👀 Smart Following:** + +- Follow datasets critical to your work +- Follow datasets you've contributed documentation to +- Follow datasets in active development + +**🔔 Notification Benefits:** + +- Get alerts when important data changes +- Stay informed about schema updates +- Learn from others' questions and discoveries + +
+ +
+
+ +### Building a Collaborative Culture + +
+ +**🌟 Team Practices That Work:** + +**📅 Regular Data Reviews:** + +- Weekly team check-ins on new datasets +- Monthly data quality discussions +- Quarterly documentation cleanup + +**🎓 Knowledge Sharing:** + +- Document discoveries in team channels +- Share interesting datasets in team meetings +- Create "dataset of the week" highlights + +**🏆 Recognition:** + +- Acknowledge good documentation contributors +- Celebrate data quality improvements +- Share success stories from collaborative discovery + +
+ +## Success Stories: Collaboration in Action + + + + +**Before Collaboration:** + +- Each analyst spent 2-3 hours finding customer data +- Repeated work across team members +- Inconsistent analysis due to different data sources + +**After Implementing Collaboration:** + +- Comprehensive tagging system for marketing data +- Shared documentation with business context +- Team-wide saved searches for common use cases + +**Results:** + +- 70% reduction in data discovery time +- Consistent analysis across team +- New team members productive in days, not weeks + + + + +**Challenge**: Engineering and Marketing teams using different customer datasets + +**Collaboration Solution:** + +- Joint documentation sessions +- Shared tagging conventions +- Cross-team Q&A on dataset differences + +**Outcome:** + +- Clear guidance on when to use each dataset +- Reduced confusion and duplicate analysis +- Better alignment between teams + + + + +## Advanced Collaboration Features + +
+ +**🔄 Automated Collaboration:** + +- Set up alerts for dataset changes +- Use DataHub Actions to notify teams of quality issues +- Integrate with Slack for team notifications + +**📊 Collaboration Analytics:** + +- Track which datasets are most bookmarked +- Identify documentation gaps +- Measure team engagement with data catalog + +**🎯 Targeted Sharing:** + +- Use domain-specific tags for relevant teams +- Create role-based saved searches +- Implement approval workflows for sensitive data + +
+ +## Success Checkpoint + +
+ +**You've mastered collaborative discovery when you can:** + +**Documentation Test**: Write dataset documentation that helps a new team member be productive immediately +**Tagging Test**: Apply consistent, strategic tags that improve discoverability +**Sharing Test**: Contribute questions, answers, or recommendations that benefit the team +**Culture Test**: Establish practices that make collaboration natural and valuable + +**Final Challenge**: +Take a dataset you've worked with and make it 50% more valuable to your team through documentation, tagging, and knowledge sharing. Measure success by asking: "Would this save time for the next person who needs similar data?" + +
+ +## Measuring Collaboration Success + +
+ +**📊 Team Metrics to Track:** + +| Metric | Good Trend | What It Means | +| -------------------------- | ------------- | --------------------------------------- | +| **Documentation Coverage** | ↗️ Increasing | More datasets have helpful descriptions | +| **Tag Consistency** | ↗️ Increasing | Team uses standardized tagging | +| **Q&A Activity** | ↗️ Increasing | Active knowledge sharing | +| **Discovery Time** | ↘️ Decreasing | Faster data finding | +| **Repeat Questions** | ↘️ Decreasing | Better documentation quality | + +
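+
+If you want hard numbers for the documentation-coverage metric rather than a gut feel, a short script can sample your catalog. This is a sketch assuming the acryl-datahub Python SDK and the quickstart GMS endpoint; it only counts editable (UI-entered) descriptions, so descriptions ingested from source systems would need a separate check against DatasetProperties.
+
+```python
+# Sketch: estimate documentation coverage for datasets matching a search term.
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+from datahub.metadata.schema_classes import EditableDatasetPropertiesClass
+
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
+
+documented, total = 0, 0
+for urn in graph.get_urns_by_filter(entity_types=["dataset"], query="customer"):
+    total += 1
+    props = graph.get_aspect(entity_urn=urn, aspect_type=EditableDatasetPropertiesClass)
+    if props and props.description and props.description.strip():
+        documented += 1
+
+if total:
+    print(f"Documentation coverage: {documented}/{total} ({documented / total:.0%})")
+```
+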
+ +## What You've Accomplished + +🎉 **Outstanding work!** You've completed the Data Discovery & Search mastery series: + +### Skills Mastered: + +- **🔍 Advanced Search**: Strategic search approaches with operators and filters +- **📊 Dataset Evaluation**: Rapid quality assessment and decision-making +- **🤝 Collaborative Discovery**: Documentation, tagging, and knowledge sharing + +### Business Impact: + +- **⚡ Speed**: Find relevant data in minutes, not hours +- **🎯 Accuracy**: Make informed decisions about data quality and fit +- **🤝 Team Efficiency**: Share knowledge that benefits everyone +- **📈 Scalability**: Build practices that improve over time + +## What's Next? + +Choose your next learning adventure based on your role and interests: + +
+ +**For Data Analysts:** +→ [Data Lineage & Impact Analysis](../lineage/overview.md) - Understand data dependencies and trace issues + +**For Data Engineers:** +→ Data Ingestion Mastery (coming soon) - Master recipes, profiling, and production patterns + +**For Data Governance Teams:** +→ Data Governance Fundamentals (coming soon) - Ownership, classification, and business glossaries + +**For Everyone:** +→ Data Quality & Monitoring (coming soon) - Set up assertions and monitoring for reliable data + +
+ +## Keep Learning & Contributing + +
+ +**🌟 Stay Engaged:** + +- Share your success stories with the DataHub community +- Contribute to DataHub documentation and tutorials +- Help other users in the DataHub Slack community +- Suggest improvements to DataHub's collaborative features + +**📚 Additional Resources:** + +- [DataHub Community Slack](https://datahub.com/slack) +- [DataHub Documentation](../../) +- [DataHub GitHub](https://github.com/datahub-project/datahub) + +
+ +**Congratulations on becoming a DataHub Discovery Expert!** 🚀 + +Your investment in learning these skills will pay dividends every time you or your teammates need to find and understand data. Keep practicing, keep collaborating, and keep discovering! diff --git a/docs/learn-datahub/discovery/dataset-profiles.md b/docs/learn-datahub/discovery/dataset-profiles.md new file mode 100644 index 00000000000000..899904227a43ff --- /dev/null +++ b/docs/learn-datahub/discovery/dataset-profiles.md @@ -0,0 +1,512 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Understanding Dataset Profiles (20 minutes) + +:::info Tutorial Progress +**Step 2 of 3** | ⏱️ **20 minutes** | [Overview](overview.md) → [Advanced Search](advanced-search.md) → **Dataset Profiles** → [Collaborative Discovery](collaborative-discovery.md) +::: + +Learn to quickly assess data quality, understand schemas, and make informed decisions about whether a dataset meets your analysis needs. Transform from guessing to knowing. + +## 🕵️ Discovery Challenge #2: The Data Detective + +**Your Mission**: The customer dashboard shows suspicious numbers - customer count dropped 50% overnight. You need to evaluate potential data sources to find the root cause. + +**What You'll Learn**: How to rapidly assess data quality, interpret statistics, and identify data issues using DataHub's automated profiling. + +## The Dataset Intelligence Framework + +Every dataset tells a story through its metadata. Learn to read these signals: + +**Dataset Profile Analysis Workflow:** + +1. **Dataset Discovery**: Locate potential datasets through search or browsing +2. **Quick Health Check**: Review freshness, completeness, and quality indicators +3. **Schema Analysis**: Examine column types, constraints, and relationships +4. **Quality Assessment**: Evaluate data distributions, null rates, and anomalies +5. **Usage Validation**: Check access patterns, downstream dependencies, and documentation +6. **Decision**: Determine dataset suitability for your use case + +**Profile Reading Checklist:** + +``` +✓ Last Updated: Within acceptable freshness window? +✓ Row Count: Reasonable size for expected data volume? +✓ Column Quality: Acceptable null rates and distributions? +✓ Schema Stability: Consistent structure over time? +✓ Documentation: Sufficient context and business meaning? +✓ Access Patterns: Evidence of active usage by others? +``` + +## Quick Health Check (2 minutes) + +Before diving deep, get a rapid overview of dataset health: + +### The 30-Second Assessment + +
+ +**🚦 Traffic Light System:** + +| 🟢 Green Light | 🟡 Yellow Light | 🔴 Red Light | +| ------------------ | -------------------- | -------------------- | +| Updated < 24h ago | Updated 1-7 days ago | Updated > 7 days ago | +| Has owner assigned | Owner unclear | No owner | +| Has description | Minimal description | No description | +| Normal row count | Row count changed | Dramatic row changes | + +
+ +### 🎮 Interactive Exercise: Health Check Practice + +
+ +**Find and evaluate 3 customer-related datasets:** + +1. **Open any customer dataset** from your previous search +2. **Look at the header area** - note the key indicators +3. **Fill out this assessment:** + +``` +Dataset Name: ________________ +Last Updated: ________________ +Owner: ______________________ +Row Count: ___________________ +Health Score: 🟢 🟡 🔴 (circle one) +``` + +**Repeat for 2 more datasets and compare results** + +
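+
+To repeat this assessment across many datasets, the same signals can be pulled with the acryl-datahub Python SDK. The sketch below covers only the owner and description rows of the traffic-light table (freshness and row-count trends are easier to read from the dataset profile in the UI) and assumes the quickstart GMS endpoint at http://localhost:8080.
+
+```python
+# Sketch: a scripted version of the 30-second health check (owner + description only).
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+from datahub.metadata.schema_classes import DatasetPropertiesClass, OwnershipClass
+
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
+
+for urn in graph.get_urns_by_filter(entity_types=["dataset"], query="customer"):
+    ownership = graph.get_aspect(entity_urn=urn, aspect_type=OwnershipClass)
+    props = graph.get_aspect(entity_urn=urn, aspect_type=DatasetPropertiesClass)
+
+    has_owner = bool(ownership and ownership.owners)
+    has_description = bool(props and props.description)
+
+    if has_owner and has_description:
+        light = "GREEN"
+    elif has_owner or has_description:
+        light = "YELLOW"
+    else:
+        light = "RED"
+    print(f"{light:6}  owner={has_owner}  description={has_description}  {urn}")
+```
+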
+ +## Schema Deep Dive (8 minutes) + +The schema tells you what data is actually available and how it's structured: + +### Reading the Schema Tab + + + + +**What to look for in each column:** + +
+ +**🔍 Column Name Patterns:** + +- `id`, `uuid`, `key` → Identifiers (good for joins) +- `created_at`, `updated_at` → Timestamps (good for time analysis) +- `email`, `phone`, `address` → PII (privacy considerations) +- `status`, `type`, `category` → Categorical data (good for grouping) +- `amount`, `count`, `score` → Numeric data (good for calculations) + +**📈 Data Type Insights:** + +- `VARCHAR(255)` → Text fields, check for standardization +- `TIMESTAMP` → Time-based analysis possible +- `INTEGER` → Counting and math operations +- `DECIMAL(10,2)` → Monetary values, precise calculations +- `BOOLEAN` → Binary flags and filters + +
+ +
+ + +**Understanding table relationships:** + +
+ +**🔑 Primary Keys:** + +- Usually named `id`, `uuid`, or `[table]_id` +- Unique identifier for each row +- Essential for joins and deduplication + +**🔗 Foreign Keys:** + +- References to other tables +- Shows data relationships +- Enables cross-table analysis + +**📊 Composite Keys:** + +- Multiple columns forming unique identifier +- Common in fact tables and junction tables +- Important for grain understanding + +
+ +**🎮 Try This:** Look at a customer table schema and identify: + +- Primary key column +- Foreign key relationships +- PII columns that need special handling +- Timestamp columns for temporal analysis + +
+ + +**Schema-level quality signals:** + +
+ +**🟢 High Quality Indicators:** + +- Consistent naming conventions +- Comprehensive column descriptions +- Appropriate data types +- Clear primary/foreign key relationships +- Reasonable column count (not too sparse/dense) + +**🔴 Quality Concerns:** + +- Inconsistent naming (camelCase + snake_case) +- Missing column descriptions +- Generic column names (`col1`, `field_a`) +- All VARCHAR types (suggests poor modeling) +- Excessive NULL values in key columns + +
+ +
+
+ +### 🎮 Interactive Exercise: Schema Detective Work + +
+ +**Scenario**: You need to join customer data with order data for analysis. + +**Your Task:** + +1. **Find a customer dataset** and examine its schema +2. **Find an orders dataset** and examine its schema +3. **Identify the join key(s)** - what columns connect these tables? +4. **Assess join feasibility:** + - Are the key columns the same data type? + - Do the column names suggest they're related? + - Are there any data quality concerns? + +**Success Criteria:** + +- Identified clear join path between tables +- Assessed potential data quality issues +- Understand what analysis would be possible + +
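+
+When the two schemas are wide, comparing them by hand gets tedious. The sketch below pulls both schemas through the acryl-datahub Python SDK and lists columns that appear in both tables, flagging type mismatches; the platform and table names are hypothetical placeholders for whichever datasets you picked in the exercise.
+
+```python
+# Sketch: compare two dataset schemas and surface candidate join keys.
+from datahub.emitter.mce_builder import make_dataset_urn
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+from datahub.metadata.schema_classes import SchemaMetadataClass
+
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
+
+
+def field_types(platform: str, name: str) -> dict:
+    """Return {column_name: native_type} for a dataset's schema, or {} if none found."""
+    urn = make_dataset_urn(platform, name, env="PROD")
+    schema = graph.get_aspect(entity_urn=urn, aspect_type=SchemaMetadataClass)
+    return {f.fieldPath: f.nativeDataType for f in schema.fields} if schema else {}
+
+
+customers = field_types("postgres", "public.customers")  # hypothetical table names
+orders = field_types("postgres", "public.orders")
+
+for column in sorted(customers.keys() & orders.keys()):
+    note = "type match" if customers[column] == orders[column] else "TYPE MISMATCH"
+    print(f"{column}: {customers[column]} vs {orders[column]} ({note})")
+```
+
+Shared columns with matching types (for example a `customer_id` in both tables) are your most likely join keys; a shared name with different types is exactly the kind of quality concern the exercise asks you to note.
+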
+ +## Data Statistics & Profiling (7 minutes) + +DataHub's automated profiling reveals data patterns and quality issues: + +### Understanding Profile Statistics + + + + +**Key statistics to interpret:** + +
+ +| Statistic | What It Tells You | Red Flags | +| ---------------------- | ----------------------- | --------------------------------- | +| **Min/Max** | Data range and outliers | Impossible values (negative ages) | +| **Mean/Median** | Central tendency | Large difference = skewed data | +| **Null Count** | Data completeness | High nulls in key fields | +| **Distinct Count** | Data variety | Too few = poor granularity | +| **Standard Deviation** | Data spread | Very high = inconsistent data | + +**🎮 Practice Interpretation:** + +``` +customer_age: Min=18, Max=150, Mean=45, Median=42, Nulls=5% +``` + +**Analysis**: Reasonable age range, slight right skew (mean > median), good completeness + +
+ +
+ + +**Understanding categorical data:** + +
+ +**📊 Value Distribution:** + +- **Top Values**: Most common categories +- **Unique Count**: How many distinct values +- **Null Percentage**: Missing data rate + +**🚨 Quality Signals:** + +- **Good**: Clear categories, low null rate +- **Concerning**: Too many unique values, high null rate +- **Bad**: Inconsistent formatting, obvious data entry errors + +**Example Analysis:** + +``` +customer_status: +- Active: 85% (good - most customers active) +- Inactive: 12% (reasonable churn) +- Pending: 3% (small processing queue) +- Nulls: 0% (excellent - no missing status) +``` + +
+ +
+ + +**Time-based data insights:** + +
+ +**📈 Temporal Patterns:** + +- **Date Range**: How far back does data go? +- **Update Frequency**: Daily, hourly, real-time? +- **Gaps**: Missing time periods? +- **Seasonality**: Regular patterns? + +**🎯 Business Relevance:** + +- **Recent Data**: Good for current analysis +- **Historical Depth**: Enables trend analysis +- **Regular Updates**: Reliable for ongoing monitoring +- **Complete Coverage**: No missing business periods + +
+ +
+
+ +### 🎮 Interactive Exercise: Data Quality Detective + +
+ +**Mystery**: Customer count dropped 50% overnight. Use profiling data to investigate. + +**Investigation Steps:** + +1. **Find customer datasets** updated in the last 2 days +2. **Check row count trends** - look for dramatic changes +3. **Examine key columns** for anomalies: + - Are there unusual null rates? + - Do value distributions look normal? + - Are there data type inconsistencies? + +**Detective Questions:** + +- Which dataset shows the row count drop? +- What columns might explain the change? +- Are there data quality issues that could cause undercounting? + +**Report Your Findings:** + +``` +Suspect Dataset: ________________ +Row Count Change: _______________ +Potential Cause: ________________ +Confidence Level: High/Medium/Low +``` + +
+ +## Usage Patterns & Validation (3 minutes) + +Understand how others use this data to validate your choice: + +### Query History Analysis + +
+ +**📊 Usage Indicators:** + +| Pattern | Interpretation | Decision Impact | +| --------------------- | ------------------------ | ------------------------------- | +| **High Query Volume** | Popular, trusted dataset | Good choice for analysis | +| **Recent Queries** | Actively used, current | Likely up-to-date | +| **Complex Queries** | Rich analytical use | Supports sophisticated analysis | +| **Simple Queries** | Basic lookup use | May lack analytical depth | +| **No Recent Usage** | Potentially stale | Investigate before using | + +
+ +### User Feedback Signals + + + + +**Look for community validation:** + +- **Bookmarks/Follows**: How many users track this dataset? +- **Documentation Quality**: Well-documented = well-used +- **Owner Responsiveness**: Active owners = maintained data +- **Related Datasets**: Part of a larger, maintained ecosystem? + + + + +**User-generated quality signals:** + +- **Tags**: `high-quality`, `production-ready`, `deprecated` +- **Comments**: User experiences and gotchas +- **Issues**: Known problems and limitations +- **Recommendations**: Alternative datasets for similar use cases + + + + +## Making the Go/No-Go Decision + +Synthesize all information into a clear decision: + +### Decision Framework + +
+ +**🎯 Use This Dataset If:** + +- Health check shows green/yellow lights +- Schema matches your analysis needs +- Data quality statistics look reasonable +- Usage patterns indicate active maintenance +- You can contact the owner if needed + +**⚠️ Investigate Further If:** + +- 🟡 Some quality concerns but dataset is unique +- 🟡 Usage is low but data looks comprehensive +- 🟡 Owner is unclear but data seems current + +**❌ Skip This Dataset If:** + +- 🔴 Multiple red flags in health check +- 🔴 Schema doesn't support your use case +- 🔴 Serious data quality issues +- 🔴 No recent usage and no owner contact +- 🔴 Better alternatives are available + +
+ +### 🎮 Final Exercise: Complete Dataset Evaluation + +
+ +**Challenge**: Evaluate 2 customer datasets and choose the better one for marketing analysis. + +**Evaluation Scorecard:** + +``` +Dataset A: ________________ Dataset B: ________________ + +Health Check: ⭐⭐⭐⭐⭐ Health Check: ⭐⭐⭐⭐⭐ +Schema Quality: ⭐⭐⭐⭐⭐ Schema Quality: ⭐⭐⭐⭐⭐ +Data Quality: ⭐⭐⭐⭐⭐ Data Quality: ⭐⭐⭐⭐⭐ +Usage Patterns: ⭐⭐⭐⭐⭐ Usage Patterns: ⭐⭐⭐⭐⭐ +Total Score: ___/20 Total Score: ___/20 + +Winner: Dataset ___ +Reason: ________________________ +``` + +**Validation**: Can you justify your choice to a colleague in 30 seconds? + +
+ +## Pro Tips for Efficient Evaluation + +
+ +**⚡ Speed Techniques:** + +- Develop a mental checklist for rapid assessment +- Use browser tabs to compare multiple datasets +- Focus on deal-breakers first (freshness, schema fit) + +**🎯 Accuracy Boosters:** + +- Always check sample data when available +- Cross-reference with lineage to understand data flow +- Contact owners for clarification on edge cases + +**🤝 Team Efficiency:** + +- Document your evaluation criteria for consistency +- Share findings with teammates to avoid duplicate work +- Create team standards for "good enough" data quality + +
+ +## Success Checkpoint + +
+ +**You've mastered dataset evaluation when you can:** + +**Speed Test**: Complete health check + schema review in under 5 minutes +**Quality Test**: Identify 3 potential data quality issues from profiling stats +**Decision Test**: Make confident go/no-go decisions with clear justification +**Communication Test**: Explain dataset suitability to stakeholders + +**Final Validation:** +Choose the best customer dataset for a marketing campaign analysis. Justify your choice in 3 bullet points covering health, schema, and quality. + +
+ +## Common Evaluation Pitfalls + + + + +**Problem**: Waiting for perfect data that doesn't exist + +**Solution**: + +- Define "good enough" criteria upfront +- Focus on fitness for purpose, not perfection +- Consider data improvement as part of your project + + + + +**Problem**: Making decisions based only on names and descriptions + +**Solution**: + +- Always check actual schema and statistics +- Look at sample data when available +- Verify assumptions with data owners + + + + +**Problem**: Evaluating datasets in isolation without considering alternatives + +**Solution**: + +- Always compare 2-3 options when possible +- Consider combining multiple datasets +- Check lineage for upstream/downstream alternatives + + + + +## What You've Learned + +🎉 **Excellent work!** You can now rapidly assess dataset quality and make informed decisions: + +- **Health Assessment**: Quick evaluation of dataset reliability +- **Schema Intelligence**: Understanding structure and relationships +- **Quality Analysis**: Interpreting statistics and profiling data +- **Usage Validation**: Leveraging community knowledge +- **Decision Framework**: Systematic go/no-go evaluation + +--- + +**Next**: Now that you can find and evaluate data, let's learn how to [collaborate and share knowledge](collaborative-discovery.md) with your team → diff --git a/docs/learn-datahub/discovery/overview.md b/docs/learn-datahub/discovery/overview.md new file mode 100644 index 00000000000000..8fb837d8f4fb17 --- /dev/null +++ b/docs/learn-datahub/discovery/overview.md @@ -0,0 +1,260 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; +import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode'; + +# Data Discovery & Search (45 minutes) + +:::tip Prerequisites +Complete the [DataHub Quickstart](../quickstart/overview.md) tutorial first to have DataHub running with sample data. +::: + +## What You'll Master + +Transform from basic DataHub user to discovery expert by mastering advanced search techniques, understanding dataset profiles, and leveraging collaborative features. + +**Learning Outcomes:** + +- **Advanced Search Mastery**: Use operators, filters, and saved searches like a pro +- **Dataset Intelligence**: Read and interpret automatically generated data profiles +- **Collaborative Discovery**: Leverage social features to crowdsource data knowledge +- **Search Strategy**: Develop systematic approaches for different discovery scenarios + +**Enterprise Data Discovery Framework:** + + + +**Discovery Navigation Strategy**: + +1. **Start with Business Need** (requirements gathering) +2. **Apply Search Strategy** (targeted discovery) +3. **Filter and Refine**: What platforms, what domains? + - Platform filters → Focus on relevant data systems + - Domain filters → Narrow to business area + - Entity type → Tables, dashboards, or pipelines +4. **Evaluate Data Quality**: Is this the right data? + - Check data freshness and update patterns + - Review schema compatibility with analysis needs + - Assess lineage depth and data reliability +5. 
**Plan Integration**: How to access and use + - Verify permissions and access controls + - Gather connection details and usage patterns + - Check tags and glossary terms for context + +**Professional Approach**: This 5-step discovery method mirrors the systematic approach used in lineage analysis - ensuring you find the right data efficiently while understanding its full context. + +## Interactive Tutorial Structure + +This hands-on tutorial uses **real search scenarios** you'll encounter daily: + +
+ +| Step | Scenario | Time | Interactive Elements | +| ---- | ----------------------------------------------------- | ------ | ----------------------------------------- | +| 1 | [Advanced Search Techniques](advanced-search.md) | 15 min | Live search examples, Interactive filters | +| 2 | [Understanding Dataset Profiles](dataset-profiles.md) | 20 min | Profile interpretation, Quality analysis | +| 3 | [Collaborative Discovery](collaborative-discovery.md) | 10 min | Documentation exercises, Tagging practice | + +
+ +**Total Time: 45 minutes** + +## Real-World Discovery Scenarios + +Throughout this tutorial, you'll solve these common data challenges: + +:::info Discovery Challenge #1: The New Analyst +**Scenario**: You're new to RetailCorp and need to find customer segmentation data for a marketing campaign. You don't know the exact table names or where the data lives. + +**Skills**: Exploratory search, filtering, schema analysis +::: + +:::info Discovery Challenge #2: The Data Detective +**Scenario**: The customer dashboard shows suspicious numbers. You need to trace back through the data pipeline to find the source of the issue. + +**Skills**: Lineage navigation, data quality assessment, root cause analysis +::: + +:::info Discovery Challenge #3: The Collaboration Champion +**Scenario**: You've discovered valuable insights about a dataset and want to share knowledge with your team for future users. + +**Skills**: Documentation, tagging, collaborative features +::: + +## Interactive Learning Features + +This tutorial leverages Docusaurus's interactive capabilities: + + + + +**Live Search Practice**: Try real searches in your DataHub instance +**Interactive Filters**: Step-by-step filter application +**Profile Analysis**: Guided interpretation of data statistics +**Collaboration Simulation**: Practice documentation and tagging + + + + +**Knowledge Checks**: Quick quizzes to verify understanding +**Practical Validation**: Confirm you can perform key tasks +**Scenario Completion**: Solve real discovery challenges +**Skill Assessment**: Rate your confidence with each technique + + + + +**Cheat Sheets**: Quick reference for search operators +**Best Practices**: Pro tips from experienced users +**Troubleshooting**: Common issues and solutions +**Advanced Techniques**: Power user shortcuts + + + + +## Prerequisites Check + +Before starting, ensure you have: + +
+ +- [ ] **DataHub running locally** at http://localhost:9002 +- [ ] **Sample data ingested** (from quickstart tutorial) +- [ ] **Basic familiarity** with DataHub navigation +- [ ] **15 minutes** of focused time per section + +
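+
+If you prefer to verify these prerequisites from a terminal, the short sketch below does roughly the same check as the tip that follows. It assumes the acryl-datahub Python package is installed and that the quickstart GMS API is on port 8080 (the UI runs on 9002); import paths may vary slightly by SDK version.
+
+```python
+# Sketch: confirm the sample data is searchable before starting the tutorial.
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))  # GMS, not the UI port
+
+matches = list(graph.get_urns_by_filter(entity_types=["dataset"], query="customer"))
+print(f"Found {len(matches)} customer-related datasets")
+assert matches, "No results - re-check that the quickstart sample data was ingested"
+```
+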
+ +:::tip Quick Setup Verification +Test your setup by searching for "customer" in DataHub. You should see several results from the sample data. +::: + +## Learning Path Integration + +**Coming from:** [DataHub Quickstart](../quickstart/overview.md) - You understand basic navigation and have sample data + +**Going to:** Choose your next adventure based on your role: + +- **Data Engineers**: Data Ingestion Mastery (coming soon) +- **Analysts**: [Data Lineage & Impact Analysis](../lineage/overview.md) +- **Governance Teams**: Data Governance Fundamentals (coming soon) + +## Success Metrics + +By the end of this tutorial, you'll be able to: + +
+ +**Speed**: Find relevant datasets in under 2 minutes +**Accuracy**: Identify the right data source for your analysis needs +**Insight**: Quickly assess data quality and freshness +**Collaboration**: Effectively document and share data knowledge + +
+ +## Interactive Demo Preview + +Here's a taste of what you'll learn: + + + + +``` +Search: "customer" +Results: 47 datasets found +``` + + + + +``` +Search: name:customer* AND platform:postgres AND hasOwners:true +Results: 3 highly relevant datasets found +Filters: PostgreSQL, Has Documentation, Updated Last 7 Days +``` + + + + +``` +Search: (customer OR user) AND (segment* OR cohort*) AND NOT test* +Saved Search: "Customer Segmentation Data" +Smart Filters: Production Only, High Quality, Well Documented +Results: 1 perfect match found in 15 seconds +``` + + + + +--- + +**Ready to become a DataHub discovery expert?** Let's start with [Advanced Search Techniques](advanced-search.md) → diff --git a/docs/learn-datahub/governance/business-glossary.md b/docs/learn-datahub/governance/business-glossary.md new file mode 100644 index 00000000000000..9cb155adab60f6 --- /dev/null +++ b/docs/learn-datahub/governance/business-glossary.md @@ -0,0 +1,306 @@ +# Business Glossary + + + +## Creating Consistent Business Language + +**Time Required**: 12 minutes + +### The Business Language Challenge + +Your organization uses terms like "customer," "revenue," and "conversion" across different teams, but everyone has slightly different definitions. The marketing team's "active user" differs from the product team's definition, leading to: + +- **Conflicting reports** with different numbers for the same metric +- **Wasted time** in meetings clarifying what terms mean +- **Poor decision-making** based on misunderstood data +- **Reduced trust** in data and analytics + +**Real-World Impact**: Your executive team received two different "monthly revenue" reports with a $2M discrepancy because Finance and Sales defined "recognized revenue" differently. + +### Understanding Business Glossaries + +A business glossary provides standardized definitions for business terms, ensuring everyone speaks the same data language: + +
+ +
+ +**Glossary Benefits**: + +- **Consistent Definitions**: Single source of truth for business terms +- **Improved Communication**: Teams use standardized language +- **Better Data Discovery**: Find data using business terminology +- **Regulatory Compliance**: Clear definitions for audit requirements + +### Exercise 1: Create Core Business Terms + +Start by defining your organization's most important business concepts: + +#### Step 1: Access the Glossary + +1. **Navigate to "Glossary"** in DataHub's main menu +2. **Click "Create Term"** to add your first business term +3. **Review existing terms** to avoid duplicates + +#### Step 2: Define "Active Customer" + +Create a standardized definition for one of your most important terms: + +1. **Term Name**: "Active Customer" +2. **Definition**: "A customer who has made at least one purchase or engaged with our platform within the last 90 days" +3. **Business Context**: "Used across Marketing, Product, and Finance teams for consistent customer reporting" +4. **Calculation Logic**: "WHERE last_activity_date >= CURRENT_DATE - 90" +5. **Related Terms**: Link to "Customer," "Engagement," "Retention" +6. **Owner**: Assign to your Customer Analytics team + +#### Step 3: Add Financial Terms + +Create definitions for key financial metrics: + +**Revenue Recognition**: + +- **Definition**: "Revenue recorded when goods are delivered or services are performed, following GAAP standards" +- **Business Rules**: "Subscription revenue recognized monthly; one-time purchases at delivery" +- **Calculation**: "SUM(recognized_amount) WHERE recognition_date <= report_date" + +**Customer Lifetime Value (CLV)**: + +- **Definition**: "Predicted total revenue from a customer over their entire relationship with the company" +- **Formula**: "Average Order Value × Purchase Frequency × Customer Lifespan" +- **Usage**: "Used for customer acquisition cost analysis and marketing budget allocation" + +### Exercise 2: Link Terms to Datasets + +Connect your business terms to actual data assets: + +#### Step 1: Navigate to Dataset + +1. **Open the customer analytics dataset** (e.g., "fct_users_created") +2. **Go to the "Properties" tab** +3. **Find the "Glossary Terms" section** + +#### Step 2: Add Relevant Terms + +1. **Click "Add Terms"** +2. **Search for "Active Customer"** and select it +3. **Add "Customer Lifetime Value"** if the dataset contains CLV calculations +4. **Add "Revenue Metric"** for any revenue-related fields +5. **Save the associations** + +#### Step 3: Column-Level Term Assignment + +For specific columns, add more granular terms: + +- `customer_id` column → "Customer Identifier" +- `registration_date` column → "Customer Acquisition Date" +- `last_login_date` column → "Customer Activity Date" +- `total_spent` column → "Customer Value" + +### Exercise 3: Create Term Hierarchies + +Organize terms into logical hierarchies for better navigation: + +#### Step 1: Create Term Categories + +Set up high-level categories using DataHub's glossary hierarchy: + + + +#### Step 2: Implement Hierarchies + +1. **Create parent terms** for each category +2. **Link child terms** to their parents +3. **Add cross-references** between related terms +4. **Document relationships** in term descriptions + +### Exercise 4: Establish Glossary Governance + +Set up processes to maintain glossary quality: + +#### Step 1: Assign Term Stewards + +1. 
**For each business domain**, assign term stewards: + + - Customer terms → Customer Success Manager + - Financial terms → Finance Business Analyst + - Product terms → Product Manager + - Marketing terms → Marketing Operations + +2. **Define steward responsibilities**: + - Review and approve new terms + - Update definitions when business rules change + - Resolve conflicts between similar terms + +#### Step 2: Create Review Processes + +1. **Quarterly term reviews**: + + - Verify definitions are still accurate + - Update terms based on business changes + - Archive obsolete terms + +2. **New term approval workflow**: + - Propose new terms through formal process + - Business stakeholder review and approval + - Technical validation of term usage + +### Understanding Glossary Impact + +A well-maintained business glossary delivers: + +**Improved Data Literacy**: + +- Business users understand data meaning +- Reduced time spent clarifying definitions +- Increased confidence in data-driven decisions + +**Better Collaboration**: + +- Consistent language across teams +- Faster onboarding of new team members +- More productive data discussions + +**Enhanced Data Discovery**: + +- Find datasets using business terminology +- Understand data context without technical expertise +- Discover related data through term relationships + +### Advanced Glossary Features + +#### 1. Term Lineage + +Track how business terms relate to data lineage: + +- See which datasets contribute to a business metric +- Understand impact of data changes on business terms +- Trace business definitions to source systems + +#### 2. Automated Term Detection + +Use DataHub's capabilities to: + +- Automatically suggest terms for new datasets +- Detect when datasets match existing term definitions +- Alert when term usage becomes inconsistent + +#### 3. Integration with BI Tools + +Connect your glossary to: + +- Business intelligence dashboards +- Reporting tools +- Data visualization platforms + +### Measuring Glossary Success + +Track these metrics to measure glossary adoption: + +- **Term Coverage**: Percentage of datasets with glossary terms +- **Term Usage**: How often terms are referenced +- **Definition Consistency**: Alignment across different uses +- **User Engagement**: Active glossary users and contributions +- **Business Impact**: Reduction in definition-related confusion + +### Best Practices for Business Glossaries + +#### 1. Start with High-Impact Terms + +Focus on terms that: + +- Appear in executive reports +- Are used across multiple teams +- Have caused confusion in the past +- Are required for compliance + +#### 2. Keep Definitions Business-Focused + +- Use language business users understand +- Avoid technical jargon +- Include business context and usage +- Provide concrete examples + +#### 3. Maintain Glossary Quality + +- Regular reviews and updates +- Clear ownership and stewardship +- Version control for definition changes +- Feedback mechanisms for users + +#### 4. Promote Adoption + +- Training sessions for business users +- Integration with existing workflows +- Success stories and use cases +- Executive sponsorship and support + +### Next Steps + +With a comprehensive business glossary in place, you're ready to implement automated governance policies that enforce your data standards at scale. 
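+
+As a small step toward that automation, the term-to-dataset links you created in Exercise 2 can also be applied in code. The sketch below uses the acryl-datahub Python SDK; the dataset URN and term name are illustrative, and writing the GlossaryTerms aspect this way replaces any terms already attached to the dataset, so merge with the existing aspect if you need to preserve them.
+
+```python
+# Sketch: attach the "Active Customer" glossary term to a dataset programmatically.
+import time
+
+from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import (
+    AuditStampClass,
+    GlossaryTermAssociationClass,
+    GlossaryTermsClass,
+)
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+
+# Illustrative URN - use the dataset you linked in Exercise 2.
+dataset_urn = make_dataset_urn("hive", "fct_users_created", env="PROD")
+term_urn = make_term_urn("ActiveCustomer")  # the term must already exist in the glossary
+
+terms_aspect = GlossaryTermsClass(
+    terms=[GlossaryTermAssociationClass(urn=term_urn)],
+    auditStamp=AuditStampClass(time=int(time.time() * 1000), actor="urn:li:corpuser:datahub"),
+)
+
+emitter.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=terms_aspect))
+```
+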
+ + diff --git a/docs/learn-datahub/governance/data-classification.md b/docs/learn-datahub/governance/data-classification.md new file mode 100644 index 00000000000000..f0cfd49e4ffffb --- /dev/null +++ b/docs/learn-datahub/governance/data-classification.md @@ -0,0 +1,271 @@ +# Data Classification + + + +## Protecting Sensitive Data Through Classification + +**Time Required**: 15 minutes + +### The Classification Challenge + +Your company handles customer PII, financial data, and proprietary business information across hundreds of datasets. Without proper classification, you can't: + +- **Comply with regulations** like GDPR, CCPA, or SOX +- **Implement appropriate security controls** for different data types +- **Respond to data subject requests** or audit requirements +- **Prevent accidental exposure** of sensitive information + +**Real-World Scenario**: During a recent audit, your team couldn't quickly identify which datasets contained PII, leading to a 2-week manual review process and potential compliance penalties. + +### Understanding Data Classification Levels + +DataHub supports industry-standard classification levels: + +
+ + + +
+ +**Classification Levels**: + +- **🔴 Restricted**: PII, financial data, trade secrets (highest protection) +- **🟡 Confidential**: Internal business data, customer insights +- **🔵 Internal**: General business information, operational data +- **🟢 Public**: Marketing materials, published reports + +### Exercise 1: Implement PII Detection + +Set up automated detection of personally identifiable information: + +#### Step 1: Enable PII Classification + +1. **Navigate to Settings** → **Classification** +2. **Enable "Automatic PII Detection"** +3. **Configure detection patterns** for: + - Email addresses (`*email*`, `*e_mail*`) + - Phone numbers (`*phone*`, `*mobile*`) + - Social Security Numbers (`*ssn*`, `*social_security*`) + - Credit card numbers (`*card*`, `*payment*`) + +#### Step 2: Review Detected PII + +1. **Go to the "Classification" dashboard** +2. **Review automatically detected PII fields** +3. **Verify accuracy** of the detection +4. **Manually classify** any missed sensitive fields + +#### Step 3: Apply PII Tags + +For the customer dataset: + +1. **Open the dataset profile** +2. **Navigate to the Schema tab** +3. **For each PII column**, add appropriate tags: + - `email` column → Add "PII" and "Contact-Info" tags + - `phone` column → Add "PII" and "Contact-Info" tags + - `address` column → Add "PII" and "Location-Data" tags + +### Exercise 2: Set Up Classification Rules + +Create automated rules to classify data based on patterns: + +#### Create Classification Rules + +1. **Go to Settings** → **Classification Rules** +2. **Create new rule**: "Financial Data Detection" + + - **Pattern**: Column names containing `*amount*`, `*price*`, `*cost*`, `*revenue*` + - **Classification**: "Confidential" + - **Tags**: "Financial", "Sensitive" + +3. **Create new rule**: "Customer Data Detection" + - **Pattern**: Table names containing `*customer*`, `*user*`, `*client*` + - **Classification**: "Restricted" + - **Tags**: "Customer-Data", "High-Privacy" + +#### Test Classification Rules + +1. **Run classification** on sample datasets +2. **Review results** in the Classification dashboard +3. **Adjust rules** based on accuracy +4. **Schedule regular re-classification** to catch new data + +### Exercise 3: Implement Data Sensitivity Levels + +Apply consistent sensitivity labeling across your data landscape: + +#### Step 1: Define Sensitivity Framework + +Create a company-wide sensitivity framework: + +``` +Sensitivity Level | Data Types | Access Controls | Examples +-----------------|------------|-----------------|---------- +Restricted | PII, PHI, Financial | Role-based, Encrypted | SSN, Credit Cards +Confidential | Business Critical | Department-based | Revenue, Strategy +Internal | Operational | Employee Access | Logs, Metrics +Public | Marketing | Open Access | Press Releases +``` + +#### Step 2: Apply Sensitivity Labels + +1. **Navigate to each critical dataset** +2. **Add sensitivity tags**: + + - Customer data → "Restricted" + - Financial reports → "Confidential" + - System logs → "Internal" + - Marketing content → "Public" + +3. **Document classification rationale** in dataset descriptions + +### Exercise 4: Set Up Compliance Monitoring + +Monitor classification compliance across your data landscape: + +#### Create Compliance Dashboard + +1. **Go to Analytics** → **Governance Metrics** +2. 
**Create dashboard** with these metrics: + - Percentage of datasets classified + - Number of PII fields identified + - Compliance score by data domain + - Classification coverage trends + +#### Set Up Compliance Alerts + +1. **Configure alerts** for: + - New datasets without classification + - PII detected in unclassified data + - Changes to restricted data schemas + - Access to sensitive data outside business hours + +### Understanding Classification Impact + +Proper data classification enables: + +**Regulatory Compliance**: + +- Quick identification of data subject to regulations +- Automated compliance reporting +- Audit trail for data handling + +**Risk Management**: + +- Appropriate security controls for different data types +- Incident response prioritization +- Data breach impact assessment + +**Access Control**: + +- Role-based access to sensitive data +- Automated access reviews +- Principle of least privilege enforcement + +### Advanced Classification Techniques + +#### 1. Machine Learning-Based Classification + +Use DataHub's ML capabilities to: + +- Analyze data content patterns +- Identify sensitive data in unstructured fields +- Continuously improve classification accuracy + +#### 2. Column-Level Classification + +Apply granular classification: + +- Different sensitivity levels within the same table +- Field-specific access controls +- Detailed compliance mapping + +#### 3. Dynamic Classification + +Implement classification that adapts to: + +- Data content changes +- Business context evolution +- Regulatory requirement updates + +### Measuring Classification Success + +Track these key metrics: + +- **Classification Coverage**: Percentage of datasets classified +- **PII Detection Accuracy**: True positives vs false positives +- **Compliance Score**: Adherence to classification policies +- **Time to Classify**: Speed of classifying new datasets +- **Access Violations**: Unauthorized access to classified data + +### Best Practices for Data Classification + +#### 1. Start with High-Risk Data + +Prioritize classification of: + +- Customer PII +- Financial information +- Healthcare data +- Intellectual property + +#### 2. Automate Where Possible + +Use automated detection for: + +- Common PII patterns +- Standard data types +- Regulatory data categories + +#### 3. Regular Review and Updates + +- Quarterly classification reviews +- Updates for new data sources +- Refinement of classification rules + +#### 4. Training and Awareness + +- Educate data teams on classification importance +- Provide clear classification guidelines +- Regular training on new regulations + +### Next Steps + +With data properly classified, you're ready to create a business glossary that provides consistent definitions and context for your data assets. + + diff --git a/docs/learn-datahub/governance/governance-policies.md b/docs/learn-datahub/governance/governance-policies.md new file mode 100644 index 00000000000000..4fa50cf417d055 --- /dev/null +++ b/docs/learn-datahub/governance/governance-policies.md @@ -0,0 +1,332 @@ +# Governance Policies + + + +## Automating Governance at Scale + +**Time Required**: 11 minutes + +### The Policy Automation Challenge + +Your organization now has ownership, classification, and glossary terms in place, but governance still requires manual oversight. 
Without automated policies, you face: + +- **Inconsistent enforcement** of data standards across teams +- **Manual reviews** that don't scale with data growth +- **Policy violations** discovered too late to prevent impact +- **Compliance gaps** that create regulatory risk + +**Real-World Scenario**: A developer accidentally deployed a new dataset containing PII without proper classification or approval, exposing sensitive customer data for 3 days before manual review caught the issue. + +### Understanding DataHub Policies + +DataHub policies automate governance enforcement through configurable rules that monitor, alert, and control data operations: + +**Policy Types**: + +- **🔒 Access Policies**: Control who can view or modify data +- **📋 Metadata Policies**: Enforce required metadata standards +- **🚨 Quality Policies**: Monitor data quality and trigger alerts +- **Approval Policies**: Require reviews for sensitive operations +- **📊 Compliance Policies**: Ensure regulatory requirement adherence + +### Exercise 1: Create Metadata Compliance Policies + +Ensure all datasets meet your organization's metadata standards: + +#### Step 1: Access Policy Management + +1. **Navigate to Settings** → **Policies** +2. **Click "Create Policy"** to start building your first automated rule +3. **Select "Metadata Policy"** as the policy type + +#### Step 2: Create "Required Ownership" Policy + +Build a policy that ensures all datasets have assigned owners: + +**Policy Configuration**: + +- **Name**: "Required Dataset Ownership" +- **Description**: "All datasets must have at least one technical owner assigned" +- **Scope**: All datasets in production domains +- **Rule**: `ownership.owners.length >= 1 AND ownership.owners[].type == "TECHNICAL_OWNER"` +- **Action**: Block dataset publication without ownership +- **Notification**: Alert data governance team + +#### Step 3: Create "PII Classification" Policy + +Ensure PII data is properly classified: + +**Policy Configuration**: + +- **Name**: "PII Data Classification Required" +- **Description**: "Datasets containing PII must be classified as Restricted" +- **Trigger**: When PII tags are detected +- **Rule**: `tags.contains("PII") IMPLIES classification == "RESTRICTED"` +- **Action**: Require data steward approval +- **Escalation**: Auto-escalate to privacy team after 24 hours + +### Exercise 2: Implement Access Control Policies + +Control who can access sensitive data based on classification: + +#### Step 1: Create Role-Based Access Policy + +**Policy Configuration**: + +- **Name**: "Restricted Data Access Control" +- **Description**: "Only authorized roles can access restricted classification data" +- **Scope**: Datasets with "Restricted" classification +- **Allowed Roles**: + - Data Stewards + - Privacy Team + - Designated Business Owners +- **Action**: Block unauthorized access attempts +- **Logging**: Log all access attempts for audit + +#### Step 2: Set Up Time-Based Access + +For highly sensitive data, implement time-based restrictions: + +**Policy Configuration**: + +- **Name**: "After-Hours Restricted Access" +- **Description**: "Restricted data access limited to business hours" +- **Schedule**: Monday-Friday, 8 AM - 6 PM local time +- **Exceptions**: Emergency access with manager approval +- **Override**: Security team can grant temporary access + +### Exercise 3: Create Data Quality Policies + +Automatically monitor and enforce data quality standards: + +#### Step 1: Schema Change Policy + +Prevent breaking changes to critical datasets: + +**Policy 
Configuration**: + +- **Name**: "Critical Dataset Schema Protection" +- **Description**: "Schema changes to critical datasets require approval" +- **Scope**: Datasets tagged as "Critical" or "Production" +- **Monitored Changes**: + - Column deletions + - Data type changes + - Primary key modifications +- **Approval Required**: Technical owner + business owner +- **Notification**: Alert downstream consumers of pending changes + +#### Step 2: Data Freshness Policy + +Ensure data meets freshness requirements: + +**Policy Configuration**: + +- **Name**: "Data Freshness SLA" +- **Description**: "Critical datasets must be updated within SLA windows" +- **SLA Definitions**: + - Customer data: 4 hours + - Financial data: 1 hour + - Analytics data: 24 hours +- **Action**: Alert owners when SLA is breached +- **Escalation**: Page on-call engineer for critical breaches + +### Exercise 4: Implement Compliance Automation + +Automate compliance with regulatory requirements: + +#### Step 1: GDPR Compliance Policy + +Ensure GDPR compliance for EU customer data: + +**Policy Configuration**: + +- **Name**: "GDPR Data Processing Compliance" +- **Description**: "EU customer data must meet GDPR requirements" +- **Scope**: Datasets containing EU customer PII +- **Requirements**: + - Legal basis documented + - Data retention period defined + - Data processing purpose specified + - Privacy impact assessment completed +- **Monitoring**: Track data subject requests and processing activities + +#### Step 2: SOX Compliance Policy + +Ensure financial data meets SOX requirements: + +**Policy Configuration**: + +- **Name**: "SOX Financial Data Controls" +- **Description**: "Financial datasets must have SOX-compliant controls" +- **Requirements**: + - Segregation of duties in data access + - Change management approval workflows + - Audit trail for all modifications + - Regular access reviews +- **Reporting**: Generate SOX compliance reports quarterly + +### Exercise 5: Set Up Policy Monitoring and Alerting + +Create comprehensive monitoring for policy compliance: + +#### Step 1: Policy Dashboard + +1. **Create governance dashboard** with key metrics: + + - Policy compliance percentage + - Active policy violations + - Resolution time trends + - Compliance by data domain + +2. **Set up real-time monitoring**: + - Policy violation alerts + - Compliance trend analysis + - Exception tracking and reporting + +#### Step 2: Automated Remediation + +Configure automatic responses to policy violations: + +**Immediate Actions**: + +- Block non-compliant operations +- Quarantine problematic datasets +- Revoke inappropriate access +- Generate incident tickets + +**Escalation Procedures**: + +- Notify data owners within 15 minutes +- Escalate to data governance team after 1 hour +- Executive notification for critical violations +- Automatic compliance reporting + +### Understanding Policy Impact + +Automated governance policies provide: + +**Consistent Enforcement**: + +- Policies applied uniformly across all data +- No manual oversight gaps +- 24/7 monitoring and enforcement + +**Proactive Risk Management**: + +- Issues caught before they impact business +- Automatic remediation of common problems +- Reduced compliance risk + +**Scalable Governance**: + +- Governance that grows with your data +- Reduced manual effort for routine checks +- Focus governance team on strategic initiatives + +### Advanced Policy Features + +#### 1. 
Machine Learning-Enhanced Policies + +Use ML to improve policy effectiveness: + +- **Anomaly Detection**: Identify unusual data access patterns +- **Risk Scoring**: Automatically assess compliance risk +- **Predictive Alerts**: Warn of potential policy violations + +#### 2. Policy Templates + +Create reusable policy templates for: + +- Industry-specific compliance (HIPAA, PCI-DSS) +- Common governance patterns +- Organizational standards + +#### 3. Policy Testing and Simulation + +Before deploying policies: + +- **Test policies** against historical data +- **Simulate impact** of new policy rules +- **Gradual rollout** with monitoring + +### Measuring Policy Success + +Track these key metrics: + +- **Policy Compliance Rate**: Percentage of data assets meeting policies +- **Violation Resolution Time**: Speed of addressing policy violations +- **False Positive Rate**: Accuracy of policy detection +- **Coverage**: Percentage of data covered by policies +- **Business Impact**: Reduction in compliance incidents + +### Best Practices for Governance Policies + +#### 1. Start Simple and Iterate + +- Begin with high-impact, low-complexity policies +- Gather feedback and refine rules +- Gradually add more sophisticated policies + +#### 2. Balance Automation and Human Oversight + +- Automate routine compliance checks +- Require human approval for complex decisions +- Provide override mechanisms for exceptions + +#### 3. Ensure Policy Transparency + +- Document policy rationale and business impact +- Provide clear guidance for compliance +- Regular communication about policy changes + +#### 4. Regular Policy Review + +- Quarterly review of policy effectiveness +- Update policies based on business changes +- Archive obsolete or redundant policies + +### Governance Maturity Assessment + +Evaluate your organization's governance maturity: + +**Level 1 - Basic**: Manual processes, reactive governance +**Level 2 - Managed**: Some automation, defined processes +**Level 3 - Defined**: Comprehensive policies, proactive monitoring +**Level 4 - Quantitatively Managed**: Metrics-driven optimization +**Level 5 - Optimizing**: Continuous improvement, predictive governance + +### Congratulations! + +You've successfully implemented a comprehensive data governance framework using DataHub. Your organization now has: + +**Clear Ownership**: Accountability for every data asset +**Proper Classification**: Risk-appropriate protection for sensitive data +**Consistent Language**: Standardized business terminology +**Automated Policies**: Scalable governance enforcement + +### Next Steps in Your Governance Journey + +1. **Expand Coverage**: Apply governance to additional data domains +2. **Advanced Analytics**: Implement governance metrics and reporting +3. **Integration**: Connect governance to your broader data platform +4. **Culture**: Build a data-driven governance culture across teams + +Your data governance foundation is now ready to support your organization's growth and ensure compliance at scale. + +## Continue Learning + +Ready to explore more DataHub capabilities? 
Check out these related tutorials: + +- [Data Quality & Monitoring](../quality/overview.md) - Ensure data reliability + + diff --git a/docs/learn-datahub/governance/overview.md b/docs/learn-datahub/governance/overview.md new file mode 100644 index 00000000000000..a1b3c691a22cb3 --- /dev/null +++ b/docs/learn-datahub/governance/overview.md @@ -0,0 +1,189 @@ +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; +import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode'; + +# Data Governance Fundamentals + + + +## Professional Data Governance Journey + +**Time Required**: 50 minutes | **Skill Level**: Intermediate + +### Your Challenge: Establishing Data Governance at Scale + +You're a **Data Governance Lead** at a growing technology company. Your organization has hundreds of datasets across multiple platforms, but lacks consistent ownership, classification, and business context. Leadership wants to implement proper data governance to ensure compliance, reduce risk, and improve data quality. + +**The Business Impact**: Without proper governance, your company faces: + +- **Compliance Risks**: Inability to track PII and sensitive data +- **Data Quality Issues**: No clear ownership for data problems +- **Business Confusion**: Teams can't understand what data means +- **Operational Inefficiency**: Time wasted searching for the right data + +### What You'll Learn + +This tutorial series walks you through implementing comprehensive data governance using DataHub's governance features: + +#### Chapter 1: Ownership Management (12 minutes) + +**Business Challenge**: No clear accountability for data quality and maintenance +**Your Journey**: + +- Assign technical and business owners to critical datasets +- Set up ownership notifications and responsibilities +- Create ownership hierarchies for different data domains + **Organizational Outcome**: Clear accountability and faster issue resolution + +#### Chapter 2: Data Classification (15 minutes) + +**Business Challenge**: Sensitive data scattered across systems without proper labeling +**Your Journey**: + +- Implement PII detection and classification +- Apply sensitivity labels (Public, Internal, Confidential, Restricted) +- Set up automated classification rules + **Organizational Outcome**: Compliance readiness and risk reduction + +#### Chapter 3: Business Glossary (12 minutes) + +**Business Challenge**: Business terms used inconsistently across teams and systems +**Your Journey**: + +- Create standardized business definitions +- Link glossary terms to datasets and columns +- Establish term hierarchies and relationships + **Organizational Outcome**: Consistent business language and improved data understanding + +#### Chapter 4: Governance Policies (11 minutes) + +**Business Challenge**: Manual governance processes that don't scale +**Your Journey**: + +- Set up automated governance policies +- Configure approval workflows for sensitive data +- Implement data access controls and monitoring + **Organizational Outcome**: Scalable governance that grows with your organization + +### DataHub Governance in Action + +See how proper governance transforms your data assets from unmanaged to enterprise-ready: + +
+ + + +
+ +**Governance Benefits Demonstrated**: + +- **Clear Ownership**: Every dataset has assigned business and technical owners +- **Proper Classification**: Tags indicate sensitivity levels and compliance requirements +- **Business Context**: Glossary terms provide standardized definitions +- **Quality Assurance**: Health indicators show data reliability + +### Governance in Practice: End-to-End Data Flow + +See how governance controls are applied throughout a complete data pipeline: + + + +**Governance Flow Analysis**: + +- **Source Control**: Raw data properly classified as PII/Restricted with clear ownership +- **Processing Governance**: Validation jobs ensure quality and compliance during transformation +- **Output Classification**: Analytics data appropriately tagged and documented for business use +- **Access Control**: Executive dashboards have appropriate sensitivity levels for broad access + +### Interactive Learning Experience + +Each chapter includes: + +- **Real Governance Scenarios**: Based on actual enterprise challenges +- **Hands-on Exercises**: Using DataHub's sample data and governance features +- **Best Practice Guidance**: Industry-standard approaches to data governance +- **Measurable Outcomes**: Clear success metrics for each governance initiative + +### Prerequisites + +- Completed [DataHub Quickstart](../quickstart/overview.md) +- Basic understanding of data management concepts +- Access to DataHub instance with sample data + +### Ready to Begin? + +Start your data governance journey by establishing clear ownership and accountability for your organization's data assets. + + diff --git a/docs/learn-datahub/governance/ownership-management.md b/docs/learn-datahub/governance/ownership-management.md new file mode 100644 index 00000000000000..877da8da538e49 --- /dev/null +++ b/docs/learn-datahub/governance/ownership-management.md @@ -0,0 +1,190 @@ +# Ownership Management + + + +## Establishing Clear Data Ownership + +**Time Required**: 12 minutes + +### The Ownership Challenge + +Your organization has critical datasets like customer information, financial transactions, and product analytics, but when data quality issues arise, nobody knows who to contact. Teams waste hours trying to find the right person to fix problems or answer questions about data. + +**Real-World Impact**: A recent customer complaint about incorrect billing took 3 days to resolve because the team couldn't identify who owned the billing data pipeline. + +### Understanding DataHub Ownership Types + +DataHub supports multiple ownership types to reflect real organizational structures: + +
+ +
+ +**Ownership Types Explained**: + +- **👨‍💻 Technical Owner**: Responsible for data pipeline maintenance, schema changes, and technical issues +- **👔 Business Owner**: Accountable for data accuracy, business rules, and stakeholder communication +- **🛡️ Data Steward**: Ensures data quality, compliance, and governance standards +- **📊 Data Owner**: Ultimate accountability for data asset (often a senior business leader) + +### Exercise 1: Assign Dataset Owners + +Let's establish ownership for your organization's key datasets: + +#### Step 1: Navigate to Dataset Ownership + +1. **Open DataHub** and search for "fct_users_created" +2. **Click on the dataset** to open its profile page +3. **Go to the "Properties" tab** and find the "Ownership" section +4. **Click "Add Owners"** to begin assignment + +#### Step 2: Add Technical Owner + +1. **Select "Technical Owner"** from the ownership type dropdown +2. **Enter the email**: `john.doe@company.com` +3. **Add justification**: "Maintains the user analytics ETL pipeline" +4. **Click "Add"** to save + +#### Step 3: Add Business Owner + +1. **Click "Add Owners"** again +2. **Select "Business Owner"** +3. **Enter the email**: `sarah.smith@company.com` +4. **Add justification**: "Accountable for user metrics accuracy and business requirements" +5. **Click "Add"** to save + +### Exercise 2: Set Up Ownership Notifications + +Configure automatic notifications so owners are alerted to important events: + +#### Configure Owner Notifications + +1. **Go to Settings** → **Notifications** +2. **Enable "Dataset Quality Alerts"** for Technical Owners +3. **Enable "Schema Change Notifications"** for Business Owners +4. **Set up "Data Incident Alerts"** for Data Stewards + +**What This Achieves**: When data quality issues occur, the right people are automatically notified based on their ownership role. + +### Exercise 3: Create Ownership Hierarchies + +For large organizations, establish ownership hierarchies by domain: + +#### Domain-Based Ownership Structure + +**Customer Domain:** + +- **Technical Owners**: Data Engineering Team (infrastructure, pipelines, technical maintenance) +- **Business Owners**: Customer Success Team (business requirements, use cases) +- **Data Stewards**: Customer Data Governance (quality, compliance, documentation) +- **Data Owner**: VP Customer Experience (strategic decisions, access approvals) + +**Financial Domain:** + +- **Technical Owners**: Financial Systems Team (ERP integration, data processing) +- **Business Owners**: Finance Team (reporting requirements, business rules) +- **Data Stewards**: Financial Data Governance (regulatory compliance, audit trails) +- **Data Owner**: CFO (strategic oversight, regulatory accountability) + +#### Implement Domain Ownership + +1. **Navigate to "Domains"** in DataHub +2. **Create "Customer Domain"** if it doesn't exist +3. **Add datasets** to the appropriate domain +4. 
**Assign domain-level owners** who oversee all datasets in that domain + +### Understanding Ownership Impact + +With proper ownership in place, your organization gains: + +**Faster Issue Resolution**: + +- Data quality problems get routed to the right technical owner +- Business questions go directly to the business owner +- Average resolution time drops from days to hours + +**Clear Accountability**: + +- Each dataset has designated responsible parties +- Ownership information is visible to all data consumers +- No more "orphaned" datasets without clear ownership + +**Improved Data Quality**: + +- Owners receive proactive alerts about their data +- Regular ownership reviews ensure assignments stay current +- Quality metrics are tied to specific owners + +### Best Practices for Ownership Management + +#### 1. Start with Critical Datasets + +Focus on your most important data assets first: + +- Customer data +- Financial transactions +- Product analytics +- Regulatory reporting data + +#### 2. Use Multiple Ownership Types + +Don't rely on just one owner per dataset: + +- Technical owners for operational issues +- Business owners for accuracy and requirements +- Data stewards for governance and compliance + +#### 3. Regular Ownership Reviews + +Set up quarterly reviews to: + +- Verify ownership assignments are current +- Update owners when people change roles +- Add ownership to newly discovered datasets + +#### 4. Document Ownership Responsibilities + +Create clear expectations for each ownership type: + +- Response time commitments +- Escalation procedures +- Quality standards + +### Measuring Ownership Success + +Track these metrics to measure the impact of your ownership program: + +- **Mean Time to Resolution (MTTR)** for data issues +- **Percentage of datasets with assigned owners** +- **Owner response rate** to data quality alerts +- **User satisfaction** with data issue resolution + +### Next Steps + +Now that you've established clear ownership, you're ready to implement data classification to identify and protect sensitive information. + + diff --git a/docs/learn-datahub/ingestion/overview.md b/docs/learn-datahub/ingestion/overview.md new file mode 100644 index 00000000000000..5bb1e14d0df0b5 --- /dev/null +++ b/docs/learn-datahub/ingestion/overview.md @@ -0,0 +1,195 @@ +# Data Ingestion Mastery + + + +## Professional Data Integration at Scale + +**Time Required**: 60 minutes | **Skill Level**: Advanced + +### Your Challenge: Scaling Metadata Management + +You're a **Senior Data Engineer** at a rapidly growing organization. Your data landscape includes 50+ data sources across cloud and on-premises systems, with new sources added weekly. Your current metadata management approach is becoming unsustainable: + +- **Manual documentation** that's always outdated +- **Inconsistent metadata** across different systems +- **No automated discovery** of schema changes or new datasets +- **Limited visibility** into data lineage and dependencies + +**The Business Impact**: Your data team spends 30% of their time answering "where is this data?" questions, and a recent compliance audit revealed significant gaps in data documentation, putting the organization at regulatory risk. 
+ +### What You'll Learn + +This tutorial series teaches you to implement enterprise-grade metadata ingestion using DataHub's advanced capabilities: + +#### Chapter 1: Recipe Fundamentals (15 minutes) + +**Business Challenge**: Inconsistent and manual metadata collection across diverse data sources +**Your Journey**: + +- Master DataHub recipe configuration for different source types +- Implement authentication and connection management +- Configure metadata extraction filters and transformations + **Organizational Outcome**: Standardized, automated metadata collection across all data sources + +#### Chapter 2: Stateful Ingestion (15 minutes) + +**Business Challenge**: Full re-ingestion causing performance issues and unnecessary processing +**Your Journey**: + +- Implement incremental metadata updates +- Configure change detection and delta processing +- Optimize ingestion performance for large-scale environments + **Organizational Outcome**: Efficient metadata updates that scale with organizational growth + +#### Chapter 3: Data Profiling (15 minutes) + +**Business Challenge**: Limited understanding of actual data content and quality patterns +**Your Journey**: + +- Enable automated data profiling and statistics collection +- Configure custom profiling rules for business-specific metrics +- Implement profiling for different data types and sources + **Organizational Outcome**: Deep insights into data content, quality, and usage patterns + +#### Chapter 4: Advanced Patterns (15 minutes) + +**Business Challenge**: Complex enterprise requirements that basic ingestion can't handle +**Your Journey**: + +- Implement custom transformers and processors +- Configure advanced lineage extraction +- Set up multi-environment metadata management + **Organizational Outcome**: Sophisticated metadata management that handles enterprise complexity + +### Interactive Learning Experience + +Each chapter includes: + +- **Real Enterprise Scenarios**: Based on actual large-scale metadata challenges +- **Hands-on Configuration**: Working with DataHub's ingestion framework +- **Performance Optimization**: Techniques for production-scale deployments +- **Troubleshooting Guidance**: Common issues and resolution strategies + +### Understanding Ingestion Architecture + +DataHub's ingestion framework provides enterprise-grade capabilities: + +
+ + + +
+ +**Key Ingestion Capabilities**: + +- **🔌 Universal Connectors**: 50+ pre-built connectors for popular data systems +- **⚡ High Performance**: Optimized for large-scale enterprise environments +- **🔄 Incremental Updates**: Stateful ingestion for efficient metadata synchronization +- **📊 Automated Profiling**: Deep data content analysis and quality metrics +- **🎯 Flexible Configuration**: Customizable extraction, transformation, and loading + +### Ingestion Framework Components + +**Core Components**: + +- **Sources**: Connectors for different data systems (Snowflake, BigQuery, Kafka, etc.) +- **Recipes**: Configuration files that define ingestion behavior +- **Transformers**: Processors that modify metadata during ingestion +- **Sinks**: Destinations for processed metadata (typically DataHub) +- **State Management**: Tracking of ingestion progress and changes + +**Enterprise Features**: + +- **Authentication Management**: Secure credential handling and rotation +- **Error Handling**: Robust failure recovery and retry mechanisms +- **Monitoring**: Comprehensive ingestion observability and alerting +- **Scheduling**: Flexible timing and dependency management +- **Scaling**: Distributed processing for large environments + +### Prerequisites + +- Completed [DataHub Quickstart](../quickstart/overview.md) +- Understanding of data architecture and metadata concepts +- Access to DataHub CLI and sample data sources +- Familiarity with YAML configuration and command-line tools +- Basic knowledge of data systems (databases, streaming platforms, etc.) + +### Ingestion Maturity Levels + +**Level 1 - Basic**: Manual metadata entry, ad-hoc documentation +**Level 2 - Automated**: Scheduled ingestion, basic source coverage +**Level 3 - Optimized**: Stateful ingestion, profiling, performance tuning +**Level 4 - Advanced**: Custom transformers, complex lineage, multi-environment +**Level 5 - Intelligent**: ML-driven optimization, predictive metadata management + +### Common Ingestion Challenges + +**Technical Challenges**: + +- **Scale**: Processing metadata from hundreds of data sources +- **Performance**: Minimizing ingestion time and resource usage +- **Reliability**: Handling network issues, authentication failures, and source changes +- **Complexity**: Managing diverse source types with different metadata models + +**Organizational Challenges**: + +- **Governance**: Ensuring consistent metadata standards across teams +- **Security**: Managing credentials and access controls securely +- **Change Management**: Adapting to evolving data infrastructure +- **Cost Optimization**: Balancing metadata completeness with resource costs + +### Success Metrics + +**Technical Metrics**: + +- **Ingestion Coverage**: Percentage of data sources with automated metadata collection +- **Ingestion Performance**: Time and resources required for metadata updates +- **Data Freshness**: Lag between source changes and metadata updates +- **Error Rate**: Percentage of successful vs. failed ingestion runs + +**Business Metrics**: + +- **Time to Discovery**: Speed of finding relevant data assets +- **Metadata Completeness**: Percentage of assets with comprehensive metadata +- **User Adoption**: Active usage of metadata for data discovery and governance +- **Compliance Readiness**: Ability to respond to audit and regulatory requirements + +### Ready to Begin? + +Start your ingestion mastery journey by learning the fundamentals of DataHub recipes and how to configure them for different data sources. 
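+
+Before diving into Chapter 1, here is a minimal sketch of what a recipe-driven ingestion run looks like when triggered from Python. The Snowflake connection values are placeholders, and the exact source option names depend on your connector and DataHub version — treat this as an illustration of the source → sink structure rather than a copy-paste configuration.
+
+```python
+# Minimal sketch: a recipe expressed as a Python dict and run with the DataHub SDK.
+# All connection values are placeholders; option names vary by connector version.
+from datahub.ingestion.run.pipeline import Pipeline
+
+pipeline = Pipeline.create(
+    {
+        "source": {
+            "type": "snowflake",
+            "config": {
+                "account_id": "my_snowflake_account",  # placeholder
+                "username": "datahub_reader",  # placeholder
+                "password": "${SNOWFLAKE_PASSWORD}",  # resolved from the environment
+                "warehouse": "COMPUTE_WH",  # placeholder
+            },
+        },
+        "sink": {
+            "type": "datahub-rest",
+            "config": {"server": "http://localhost:8080"},
+        },
+    }
+)
+
+pipeline.run()
+pipeline.raise_from_status()  # surface ingestion errors instead of failing silently
+```
+
+The equivalent YAML recipe can be run with `datahub ingest -c recipe.yml`; the chapters that follow build on this same source → sink structure with filters, stateful ingestion, and profiling options.
+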
+ + diff --git a/docs/learn-datahub/lineage/impact-analysis.md b/docs/learn-datahub/lineage/impact-analysis.md new file mode 100644 index 00000000000000..0687922a2a4c2a --- /dev/null +++ b/docs/learn-datahub/lineage/impact-analysis.md @@ -0,0 +1,642 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; +import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode'; + +# Performing Impact Analysis (15 minutes) + +:::info Tutorial Progress +**Step 2 of 3** | **15 minutes** | [Overview](overview.md) → [Reading Lineage](reading-lineage.md) → **Impact Analysis** → [Troubleshooting](troubleshooting.md) +::: + + + +**The Critical Decision**: The enterprise analytics migration is approved, but now you need to answer the CEO's question: _"What exactly will be affected, and how do we minimize business risk?"_ This is where impact analysis transforms from guesswork into science. + +**Your Mission**: Learn to perform systematic impact analysis that quantifies risk, prioritizes changes, and creates bulletproof migration plans. + +## What You'll Master + +By the end of this step, you'll be able to: + +- **Quantify downstream impact** with business metrics and risk scores +- **Create stakeholder reports** that clearly communicate change effects +- **Develop rollback strategies** based on lineage dependencies +- **Coordinate cross-team changes** using lineage insights + +## The Impact Analysis Framework + +Professional impact analysis follows a systematic 5-step process: + +**Impact Analysis Process:** + +1. **Scope Definition** → Define what's changing and why +2. **Downstream Mapping** → Identify all affected systems and stakeholders +3. **Risk Assessment** → Quantify business impact and technical risks +4. **Stakeholder Analysis** → Understand who needs to be involved +5. **Mitigation Planning** → Develop rollback and contingency strategies + +## Step 1: Scope Definition + +Before analyzing impact, clearly define what's changing: + +
+ +**Change Scope Template**: + +``` +System/Dataset: ________________________ +Change Type: ___________________________ +Timeline: ______________________________ +Technical Details: _____________________ +Business Justification: ________________ +``` + +**Common Change Types**: + +- **System Migration**: Moving from one platform to another +- **Schema Changes**: Adding, removing, or modifying columns +- **Performance Optimization**: Changing processing logic or infrastructure +- **Security Updates**: Access control or data classification changes +- **Deprecation**: Retiring old systems or datasets + +### Impact Analysis in Action + +Here's a real-world example showing how changes cascade through your data ecosystem: + + + +**Impact Assessment**: This migration affects 15+ downstream systems, including production ML models serving 1M+ customers daily. The health indicators show critical dependencies that require careful coordination. + +
+ +### TechFlow Migration Example + +Let's apply this to our scenario: + + + + +**System/Dataset**: `customer_analytics_pipeline` (Hive tables) +**Change Type**: Platform migration (Hive → Snowflake) +**Timeline**: 48-hour maintenance window, next weekend +**Technical Details**: + +- Migrate 5 core tables: `customers`, `orders`, `customer_metrics`, `daily_summaries`, `customer_segments` +- Preserve all existing schemas and data +- Update connection strings in downstream systems + +**Business Justification**: + +- 10x performance improvement for customer analytics +- $50K/month cost savings +- Enable real-time customer insights + + + + +**High-Risk Elements**: + +- **Customer-facing dashboards**: Sales team uses these daily +- **Automated reports**: CEO gets weekly customer metrics +- **ML pipelines**: Customer segmentation models depend on this data +- **API endpoints**: Mobile app queries customer data directly + +**Timing Risks**: + +- **Weekend migration**: Limited support staff available +- **Monday morning**: Sales team needs dashboards for weekly planning +- **Month-end**: Customer reporting deadline approaching + +**Technical Risks**: + +- **Data format differences**: Snowflake vs. Hive SQL variations +- **Performance changes**: Query patterns may need optimization +- **Connection failures**: Downstream systems need configuration updates + + + + +## Step 2: Downstream Mapping + +Use DataHub's lineage to systematically map all affected systems: + +### The Downstream Discovery Method + +**Starting Point**: Your changing dataset (`customer_analytics_pipeline`) + +**Discovery Process**: + +1. **Open the dataset** in DataHub +2. **Navigate to Lineage tab** +3. **Switch to downstream view** (right side of lineage graph) +4. **Document each downstream connection**: + +
+ +**Downstream Impact Template**: + +| System | Type | Business Impact | Technical Owner | Update Required | +| ------------------ | ------------ | --------------------------- | ---------------- | ------------------- | +| Customer Dashboard | BI Tool | High - Sales team daily use | @sarah-analytics | Connection string | +| Weekly Reports | Automated | High - CEO visibility | @john-reporting | SQL query updates | +| ML Pipeline | Data Science | Medium - Model retraining | @alex-ml | Data source config | +| Mobile API | Application | High - Customer app | @dev-team | Database connection | +| Data Warehouse | Storage | Low - Archive only | @data-ops | Monitoring updates | + +
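+
+If you'd rather build this inventory programmatically than click through the UI, a rough sketch using DataHub's Python client is shown below. It assumes the `searchAcrossLineage` GraphQL query available in recent DataHub versions; the exact field names may differ in yours, and the dataset URN is illustrative.
+
+```python
+# Sketch: list downstream entities of a dataset via DataHub's GraphQL API.
+# Server address and dataset URN are illustrative placeholders.
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
+
+DOWNSTREAMS = """
+query downstreams($urn: String!) {
+  searchAcrossLineage(
+    input: { urn: $urn, direction: DOWNSTREAM, query: "*", start: 0, count: 50 }
+  ) {
+    total
+    searchResults {
+      degree
+      entity { urn type }
+    }
+  }
+}
+"""
+
+dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,customer_analytics_pipeline,PROD)"
+result = graph.execute_graphql(DOWNSTREAMS, variables={"urn": dataset_urn})
+
+# Each hit is a candidate row for the impact template above.
+for hit in result["searchAcrossLineage"]["searchResults"]:
+    entity = hit["entity"]
+    print(f'{hit["degree"]} hop(s) away: {entity["type"]} {entity["urn"]}')
+```
+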
+ +### Interactive Exercise: Downstream Mapping + +
+ +**Your Challenge**: Map downstream impact for TechFlow's user analytics + +**Step 1**: Open `fct_users_created` in your DataHub instance +**Step 2**: Navigate to the Lineage tab +**Step 3**: Identify all downstream connections +**Step 4**: Fill out the impact template: + +``` +Downstream System 1: ___________________ +Business Impact: _______________________ +Technical Owner: _______________________ +Update Required: _______________________ + +Downstream System 2: ___________________ +Business Impact: _______________________ +Technical Owner: _______________________ +Update Required: _______________________ +``` + +**Success Criteria**: You've identified at least 3 downstream systems and assessed their business impact. + +
+ +## Step 3: Risk Assessment + +Transform your downstream map into quantified risk scores: + +### The Risk Scoring Matrix + + + + +**Impact Scale (1-5)**: + +- **5 - Critical**: Customer-facing, revenue-impacting, or regulatory +- **4 - High**: Executive reporting, key business processes +- **3 - Medium**: Team productivity, internal analytics +- **2 - Low**: Development tools, experimental systems +- **1 - Minimal**: Archive, backup, or deprecated systems + +**Business Impact Factors**: + +- **User Count**: How many people depend on this system? +- **Revenue Impact**: Does this directly affect sales or billing? +- **Compliance**: Are there regulatory or audit requirements? +- **Operational Criticality**: Is this needed for daily operations? + + + + +**Complexity Scale (1-5)**: + +- **5 - Very Complex**: Custom code, multiple integrations, legacy systems +- **4 - Complex**: Requires specialized knowledge, multiple teams +- **3 - Moderate**: Standard configurations, documented processes +- **2 - Simple**: Well-understood, single team ownership +- **1 - Trivial**: Automated, self-service, or minimal changes + +**Technical Factors**: + +- **Integration Complexity**: How many systems need updates? +- **Code Changes**: Are application changes required? +- **Testing Requirements**: How extensive is validation needed? +- **Rollback Difficulty**: How easy is it to undo changes? + + + + +**Risk Score Formula**: + +``` +Risk Score = Business Impact × Technical Complexity × Urgency Factor +``` + +**Urgency Factors**: + +- **1.5x**: Tight deadline (< 1 week) +- **1.2x**: Normal timeline (1-4 weeks) +- **1.0x**: Flexible timeline (> 1 month) + +**Risk Categories**: + +- **20-25**: 🔴 **Critical Risk** - Executive approval required +- **15-19**: 🟡 **High Risk** - Detailed mitigation plan needed +- **10-14**: 🟢 **Medium Risk** - Standard change process +- **5-9**: 🔵 **Low Risk** - Routine change management +- **1-4**: ⚪ **Minimal Risk** - Proceed with standard testing + + + + +### Risk Assessment Exercise + +
+ +**TechFlow Customer Analytics Migration Risk Assessment**: + +| Downstream System | Business Impact | Technical Complexity | Risk Score | Category | +| ----------------- | --------------- | -------------------- | ---------- | ----------- | +| Sales Dashboard | 5 (Critical) | 3 (Moderate) | 22.5 | 🔴 Critical | +| CEO Reports | 4 (High) | 2 (Simple) | 12 | 🟢 Medium | +| ML Pipeline | 3 (Medium) | 4 (Complex) | 18 | 🟡 High | +| Mobile API | 5 (Critical) | 3 (Moderate) | 22.5 | 🔴 Critical | +| Archive System | 1 (Minimal) | 1 (Trivial) | 1.5 | ⚪ Minimal | + +**Analysis**: 2 Critical Risk systems require executive approval and detailed rollback plans. + +
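+
+Because the formula and thresholds are fixed, the scoring itself is easy to automate once you've gathered impact and complexity ratings. The sketch below reproduces the TechFlow table using plain Python — the inputs are the hand-assessed example values above, not anything pulled from DataHub.
+
+```python
+# Sketch: compute risk scores using the formula from Step 3.
+# Inputs are the TechFlow example ratings, entered by hand.
+def risk_score(business_impact: int, technical_complexity: int, urgency: float = 1.0) -> float:
+    """Risk Score = Business Impact x Technical Complexity x Urgency Factor."""
+    return business_impact * technical_complexity * urgency
+
+
+def risk_category(score: float) -> str:
+    if score >= 20:
+        return "Critical"
+    if score >= 15:
+        return "High"
+    if score >= 10:
+        return "Medium"
+    if score >= 5:
+        return "Low"
+    return "Minimal"
+
+
+URGENCY_TIGHT_DEADLINE = 1.5  # migration window is less than a week away
+
+systems = {
+    "Sales Dashboard": (5, 3),
+    "CEO Reports": (4, 2),
+    "ML Pipeline": (3, 4),
+    "Mobile API": (5, 3),
+    "Archive System": (1, 1),
+}
+
+for name, (impact, complexity) in systems.items():
+    score = risk_score(impact, complexity, URGENCY_TIGHT_DEADLINE)
+    print(f"{name}: {score:g} -> {risk_category(score)}")
+```
+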
+ +## Step 4: Stakeholder Analysis + +Identify who needs to be involved in the change: + +### Stakeholder Mapping Framework + +
+ +**Stakeholder Categories**: + +**🎯 Primary Stakeholders** (Directly affected): + +- **Data Consumers**: Teams using the affected data +- **System Owners**: Technical teams responsible for downstream systems +- **Business Users**: People whose work depends on the data + +**🤝 Secondary Stakeholders** (Coordination required): + +- **Infrastructure Teams**: Platform and DevOps support +- **Security Teams**: Access control and compliance +- **Project Management**: Timeline and resource coordination + +**📢 Communication Stakeholders** (Keep informed): + +- **Executive Leadership**: High-level impact awareness +- **Customer Support**: Potential user impact preparation +- **Documentation Teams**: Update procedures and guides + +
+ +### Communication Strategy + + + + +**Technical Impact Report Template**: + +```markdown +## System Change Impact: Customer Analytics Migration + +### Technical Changes Required + +- **Connection Updates**: Update database connection strings +- **Query Modifications**: Adapt SQL for Snowflake syntax +- **Testing Requirements**: Validate data accuracy and performance +- **Rollback Plan**: Revert connection strings if issues occur + +### Timeline + +- **Preparation**: This week - update configurations +- **Migration**: Weekend - 48-hour window +- **Validation**: Monday morning - verify all systems + +### Support Contacts + +- **Migration Lead**: @data-engineering-team +- **Emergency Contact**: @on-call-engineer +``` + + + + +**Business Impact Summary Template**: + +```markdown +## Customer Analytics Platform Upgrade + +### What's Changing + +We're upgrading our customer analytics platform to improve performance and reduce costs. + +### Business Benefits + +- **10x faster** customer reports and dashboards +- **$50K monthly savings** in infrastructure costs +- **Real-time insights** for better customer service + +### What You Need to Know + +- **When**: Next weekend (48-hour maintenance window) +- **Impact**: Brief downtime Saturday evening, normal service by Monday +- **Your Action**: No action required - all systems will work as before + +### Questions? + +Contact: @data-team or @project-manager +``` + + + + +**Executive Impact Brief Template**: + +```markdown +## Executive Brief: Customer Analytics Migration + +### Strategic Impact + +- **Business Value**: $600K annual savings + 10x performance improvement +- **Risk Assessment**: 2 critical systems identified, mitigation plans in place +- **Timeline**: 48-hour weekend migration, normal operations by Monday + +### Risk Mitigation + +- **Rollback Plan**: 4-hour recovery time if issues occur +- **Testing Strategy**: Comprehensive validation before go-live +- **Support Coverage**: 24/7 engineering support during migration + +### Success Metrics + +- **Zero customer impact**: No service disruptions +- **Performance targets**: 10x improvement in dashboard load times +- **Cost savings**: $50K monthly reduction starting next month + +### Approval Required + +Proceed with migration: [ ] Yes [ ] No +Executive Sponsor: **\*\***\_\_\_\_**\*\*** +``` + + + + +## Step 5: Mitigation Planning + +Develop comprehensive plans to minimize risk: + +### The Mitigation Strategy Framework + +
+ +**Risk Mitigation Categories**: + +**🛡️ Preventive Measures** (Avoid problems): + +- **Comprehensive testing**: Validate all connections before go-live +- **Staged rollout**: Migrate non-critical systems first +- **Communication plan**: Ensure all stakeholders are prepared +- **Documentation updates**: Keep all procedures current + +**🚨 Detective Measures** (Catch problems early): + +- **Monitoring alerts**: Set up notifications for system failures +- **Health checks**: Automated validation of data flow +- **User feedback channels**: Quick reporting of issues +- **Performance monitoring**: Track system response times + +**🔧 Corrective Measures** (Fix problems quickly): + +- **Rollback procedures**: Detailed steps to revert changes +- **Emergency contacts**: 24/7 support team availability +- **Escalation paths**: Clear decision-making authority +- **Communication templates**: Pre-written status updates + +
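+
+The "Health checks" detective measure above is a good candidate for automation. Below is a minimal, self-contained sketch of a row-count comparison for the TechFlow migration; the two query helpers are hypothetical stand-ins for whatever Hive and Snowflake clients your team already uses, and the table list and tolerance are illustrative.
+
+```python
+# Sketch: automated health check comparing row counts between the legacy
+# and migrated systems. The query helpers are placeholders to be replaced
+# with your actual Hive and Snowflake client calls.
+TABLES = ["customers", "orders", "customer_metrics", "daily_summaries", "customer_segments"]
+TOLERANCE = 0.001  # allow 0.1% drift for data still in flight
+
+
+def run_hive_query(sql: str) -> int:
+    # Placeholder: replace with your Hive/Beeline client call.
+    return 1_000_000
+
+
+def run_snowflake_query(sql: str) -> int:
+    # Placeholder: replace with your Snowflake connector call.
+    return 1_000_000
+
+
+def check_row_counts() -> list[str]:
+    failures = []
+    for table in TABLES:
+        old = run_hive_query(f"SELECT COUNT(*) FROM {table}")
+        new = run_snowflake_query(f"SELECT COUNT(*) FROM {table}")
+        drift = abs(old - new) / max(old, 1)
+        if drift > TOLERANCE:
+            failures.append(f"{table}: hive={old} snowflake={new} ({drift:.2%} drift)")
+    return failures
+
+
+if __name__ == "__main__":
+    problems = check_row_counts()
+    if problems:
+        # In practice this would page the migration on-call or open an incident.
+        print("HEALTH CHECK FAILED:\n" + "\n".join(problems))
+    else:
+        print("Row counts match within tolerance for all migrated tables.")
+```
+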
+ +### Rollback Strategy Development + +**Critical Success Factor**: Every change needs a tested rollback plan. + + + + +**Rollback Decision Matrix**: + +| Issue Type | Rollback Trigger | Recovery Time | Decision Authority | +| ------------------ | ------------------------- | ------------- | --------------------- | +| Data Corruption | Any data inconsistency | 2 hours | Data Engineering Lead | +| Performance Issues | >50% slower than baseline | 4 hours | Technical Manager | +| System Failures | Any critical system down | 1 hour | On-call Engineer | +| User Complaints | >10 user reports | 6 hours | Product Manager | + +**Rollback Procedure Template**: + +```bash +# Emergency Rollback: Customer Analytics Migration +# Decision Authority: [Name] [Contact] +# Estimated Time: 4 hours + +1. Stop new data processing +2. Revert connection strings to original Hive system +3. Restart downstream applications +4. Validate data flow restoration +5. Notify stakeholders of rollback completion +``` + + + + +**Pre-Migration Testing Checklist**: + +**Data Validation**: + +- [ ] Row counts match between old and new systems +- [ ] Sample data comparison (10% random sample) +- [ ] Schema validation (all columns present and correct types) +- [ ] Data freshness verification (latest timestamps match) + +**System Integration Testing**: + +- [ ] All downstream connections work with new system +- [ ] Query performance meets or exceeds baseline +- [ ] Authentication and authorization function correctly +- [ ] Monitoring and alerting systems recognize new platform + +**User Acceptance Testing**: + +- [ ] Key dashboards load correctly with new data source +- [ ] Reports generate successfully with expected data +- [ ] API endpoints return correct responses +- [ ] Mobile app functions normally with new backend + + + + +**Migration Success Criteria**: + +**Technical Metrics**: + +- **Zero data loss**: 100% data integrity maintained +- **Performance improvement**: >5x faster query response times +- **Uptime target**: 99.9% availability during migration +- **Error rate**: <0.1% failed requests + +**Business Metrics**: + +- **User satisfaction**: <5 user complaints about system changes +- **Productivity impact**: No measurable decrease in team efficiency +- **Cost savings**: Achieve projected $50K monthly reduction +- **Timeline adherence**: Complete migration within 48-hour window + +**Validation Timeline**: + +- **Immediate** (0-4 hours): System connectivity and basic functionality +- **Short-term** (1-7 days): Performance validation and user feedback +- **Medium-term** (1-4 weeks): Cost savings realization and stability +- **Long-term** (1-3 months): Full business value achievement + + + + +## Real-World Impact Analysis Exercise + +
+ +**Your Challenge**: Perform a complete impact analysis for a system change + +**Scenario**: TechFlow wants to add a new `customer_lifetime_value` column to the `customers` table. This requires updating the ETL job and potentially affects all downstream systems. + +**Your Task**: Complete the 5-step impact analysis: + +**Step 1 - Scope Definition**: + +``` +System/Dataset: customers table +Change Type: Schema addition (new column) +Timeline: 2-week implementation +Technical Details: Add CLV calculation to nightly ETL +Business Justification: Enable customer segmentation for marketing +``` + +**Step 2 - Downstream Mapping**: +Use DataHub to identify all systems that consume the `customers` table and document them. + +**Step 3 - Risk Assessment**: +Score each downstream system using the Business Impact × Technical Complexity formula. + +**Step 4 - Stakeholder Analysis**: +Identify who needs to be involved and create appropriate communication plans. + +**Step 5 - Mitigation Planning**: +Develop testing strategy and rollback procedures. + +**Success Criteria**: You've created a comprehensive impact analysis that could be presented to stakeholders for approval. + +
+ +## Success Checkpoint + +
+ +**You've mastered impact analysis when you can:** + +**Planning Skills**: + +- Complete the 5-step impact analysis framework for any system change +- Quantify risk using business impact and technical complexity scores +- Create stakeholder-appropriate communication plans +- Develop comprehensive rollback strategies + +**Analysis Skills**: + +- Map downstream dependencies using DataHub lineage +- Assess business impact across different user types and use cases +- Identify critical path dependencies and single points of failure +- Prioritize changes based on risk scores and business value + +**Communication Skills**: + +- Present technical impact to business stakeholders clearly +- Create executive summaries that enable informed decision-making +- Coordinate cross-team changes using lineage insights +- Document mitigation plans that teams can execute confidently + +**Final Validation**: +Choose a real system change in your organization and perform a complete impact analysis using the framework you've learned. + +
+ +## What You've Accomplished + +🎉 **Outstanding work!** You've transformed from basic lineage viewing to expert-level impact analysis: + +- **Systematic approach**: You can now analyze any system change methodically +- **Risk quantification**: You understand how to score and prioritize risks +- **Stakeholder management**: You can communicate impact to any audience +- **Mitigation planning**: You're prepared for both success and failure scenarios + +:::tip Mark Your Progress +Check off "Performing Impact Analysis" in the progress tracker above! You're ready to troubleshoot lineage issues. +::: + +--- + +**Next**: Complete your lineage mastery by learning [lineage troubleshooting techniques](troubleshooting.md) → diff --git a/docs/learn-datahub/lineage/overview.md b/docs/learn-datahub/lineage/overview.md new file mode 100644 index 00000000000000..f3e818c82a7385 --- /dev/null +++ b/docs/learn-datahub/lineage/overview.md @@ -0,0 +1,231 @@ +import TutorialProgress from '@site/src/components/TutorialProgress'; +import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode'; + +# Data Lineage & Impact Analysis (40 minutes) + +**From Beginner to Expert**: You've learned basic lineage in the quickstart, but production data environments are complex beasts. This series transforms you into a lineage expert who can navigate multi-system architectures, perform systematic impact analysis, and troubleshoot the most challenging data pipeline issues. + +## Your Advanced Data Challenge + +**Meet the Scenario**: You're the senior data engineer at a growing technology company, and leadership has announced a major system migration. Your job is to assess the impact of moving the customer analytics pipeline from the legacy system to a new cloud platform. One wrong move could break customer-facing dashboards used by the entire sales team. + +**The Stakes**: + +- **15+ downstream systems** depend on customer analytics +- **$2M+ in revenue** tracked through affected dashboards +- **48-hour migration window** - no room for errors +- **Your reputation** as the data reliability expert + +**Your Mission**: Master advanced lineage analysis to plan, execute, and validate this critical migration without breaking anything. + +### Enterprise Migration Challenge + +Here's the complex data pipeline you'll be analyzing throughout this tutorial series: + + + +**Migration Complexity**: This seemingly simple 4-node pipeline actually has 15+ downstream dependencies, cross-platform transformations, and business-critical dashboards that cannot afford downtime. 
+ +**Enterprise Lineage Analysis Framework:** + + + +**Architecture Components**: + +- **Source Systems**: Raw data, databases, APIs, files +- **Transformation Layers**: ETL/ELT processes, data pipelines, business logic, quality checks +- **Target Systems**: Analytics/reports, dashboards, ML models, data products + +**Lineage Analysis Capabilities:** + +- **Upstream Tracing**: Follow data back to its original sources +- **Downstream Impact**: Identify all systems affected by changes +- **Transformation Logic**: Understand how data is processed and modified +- **Dependency Mapping**: Visualize critical data relationships +- **Change Impact Assessment**: Predict effects of schema or pipeline changes + +## Learning Path Overview + + + +## What You'll Master + +### **Reading Lineage Graphs** (15 minutes) + +**From**: Basic lineage viewing +**To**: Expert multi-hop navigation across complex architectures + +**You'll Learn**: + +- Navigate 5+ hop lineage paths efficiently +- Interpret different node types (datasets, jobs, applications) +- Understand transformation logic through connections +- Identify critical paths in data infrastructure + +**Real Scenario**: Trace revenue calculation errors through a 7-system pipeline spanning Kafka → Spark → Snowflake → dbt → Looker. + +### **Performing Impact Analysis** (15 minutes) + +**From**: "What uses this data?" +**To**: Systematic impact assessment with risk scoring + +**You'll Learn**: + +- Quantify downstream impact with business metrics +- Create change impact reports for stakeholders +- Develop rollback strategies based on lineage +- Coordinate cross-team changes using lineage insights + +**Real Scenario**: Plan the customer analytics migration by mapping all 15 downstream dependencies and creating a risk-ranked rollout plan. + +### **Lineage Troubleshooting** (10 minutes) + +**From**: "Why is lineage missing?" +**To**: Proactive lineage quality management + +**You'll Learn**: + +- Debug missing lineage connections +- Improve lineage accuracy through configuration +- Handle edge cases and manual processes +- Establish lineage monitoring and validation + +**Real Scenario**: Investigate why the new ML pipeline isn't showing up in lineage and fix the ingestion configuration. 
+ +## Prerequisites + +**Required Knowledge**: + +- Completed [DataHub Quickstart](../quickstart/overview.md) (basic lineage understanding) +- Familiarity with data pipelines and ETL concepts +- Basic understanding of SQL and data transformations + +**Technical Setup**: + +- DataHub instance with sample data (from quickstart) +- Access to lineage views and dataset details +- Ability to navigate the DataHub UI confidently + +**Time Commitment**: 40 minutes of focused learning with hands-on exercises + +## Learning Approach + +**Scenario-Driven**: Every concept is taught through the lens of the enterprise migration challenge + +**Hands-On Practice**: Interactive exercises using your actual DataHub instance with sample data + +**Real-World Applications**: Techniques you'll use immediately in production environments + +**Team-Ready Skills**: Learn to communicate lineage insights to both technical and business stakeholders + +## Success Outcomes + +By completing this series, you'll be able to: + +**Technical Mastery**: + +- Navigate any lineage graph, no matter how complex +- Perform comprehensive impact analysis for system changes +- Troubleshoot and improve lineage quality +- Use lineage for root cause analysis and debugging + +**Business Impact**: + +- Reduce system change risks through proper impact assessment +- Accelerate troubleshooting with systematic lineage analysis +- Improve cross-team coordination using lineage insights +- Build confidence in data reliability and change management + +**Career Growth**: + +- Become the go-to expert for complex data pipeline analysis +- Lead system migrations and architecture changes confidently +- Mentor junior team members on lineage best practices +- Contribute to data governance and reliability initiatives + +## Ready to Begin? + +**Your journey to lineage mastery starts now**. Each tutorial builds on the previous one, taking you from basic lineage reading to expert-level impact analysis and troubleshooting. + +**Start with**: [Reading Lineage Graphs](reading-lineage.md) - Learn to navigate complex data flows like a pro + +--- + +**Pro Tip**: Keep your DataHub instance open in another tab. You'll be using it extensively throughout these tutorials for hands-on practice with the sample data. diff --git a/docs/learn-datahub/lineage/reading-lineage.md b/docs/learn-datahub/lineage/reading-lineage.md new file mode 100644 index 00000000000000..045720ab688f6b --- /dev/null +++ b/docs/learn-datahub/lineage/reading-lineage.md @@ -0,0 +1,427 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; +import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard'; +import DataHubLineageNode, { DataHubLineageFlow, SampleLineageFlows } from '@site/src/components/DataHubLineageNode'; +import ProcessFlow, { DataHubWorkflows } from '@site/src/components/ProcessFlow'; + +# Reading Lineage Graphs (15 minutes) + +:::info Tutorial Progress +**Step 1 of 3** | **15 minutes** | [Overview](overview.md) → **Reading Lineage** → [Impact Analysis](impact-analysis.md) → [Troubleshooting](troubleshooting.md) +::: + + + +**The Expert's Challenge**: You've mastered basic lineage in the quickstart, but now you're facing a complex production scenario. The customer dashboard is showing inconsistent numbers, and you need to trace through a multi-hop data pipeline spanning 5 different systems to find the root cause. 
+ +**Your Mission**: Learn to read complex lineage graphs like a seasoned data engineer, understanding every connection, transformation, and dependency in your data ecosystem. + +## What You'll Master + +By the end of this step, you'll be able to: + +- **Navigate multi-hop lineage** across complex data architectures +- **Interpret different node types** (datasets, jobs, applications) +- **Understand transformation logic** through lineage connections +- **Identify critical paths** in your data infrastructure + +## The Lineage Reading Framework + +Professional data engineers follow a systematic approach to lineage analysis: + + + +## Level 1: Understanding Node Types + +Every element in a lineage graph tells a specific story: + + + + +**Tables, Views, and Files**: + +- **Raw Tables**: Source system data (often rectangular nodes) +- **Analytical Views**: Processed, business-ready data +- **🔄 Materialized Views**: Pre-computed results for performance +- **📁 File Assets**: CSV, Parquet, JSON files in data lakes + +**Visual Cues in DataHub**: + +
+ + + +
+ +- **Platform logos**: Each node shows the actual platform logo and type +- **Health indicators**: Color-coded dots show data quality status +- **Node highlighting**: Selected or problematic nodes are visually emphasized + +**Reading Strategy**: Start with the dataset causing issues, then trace backward to find the source. + +
+ + +**Data Processing Elements**: + +- **🔄 ETL Jobs**: Extract, Transform, Load processes +- **🐍 Python Scripts**: Custom data processing logic +- **dbt Models**: Data transformation workflows +- **⚡ Spark Jobs**: Large-scale data processing + +**Connection Patterns**: + +- **Solid lines**: Direct data dependencies +- **Dashed lines**: Indirect or inferred relationships +- **Arrows**: Direction of data flow (always follows the arrows!) + +**Analysis Technique**: Jobs between datasets show _how_ data is transformed, not just _that_ it flows. + + + + +**Business Applications**: + +- **BI Dashboards**: Looker, Tableau, PowerBI reports +- **🤖 ML Models**: Training and inference pipelines +- **📱 Applications**: Customer-facing features +- **📧 Automated Reports**: Scheduled business reports + +**Business Impact Indicators**: + +- **User-facing systems**: High business impact if broken +- **Internal tools**: Important for operations but lower external impact +- **Experimental systems**: Can often tolerate temporary issues + + +
+ +## Level 2: Multi-Hop Navigation + +Real production lineage often spans multiple systems and transformations: + +### The 6-Hop Analysis Method + +**Scenario**: Customer dashboard shows wrong revenue numbers. Let's trace it: + + + +**Navigation Strategy**: + +1. **Start at the problem** (executive dashboard) +2. **Follow arrows backward** (upstream direction) +3. **Document each hop**: What system, what transformation? + - Dashboard ← Chart ← View ← Table ← Job ← Raw Table +4. **Identify the break point**: Where does data look wrong? + - Critical ETL job failure affecting downstream data +5. **Focus investigation**: Drill into the problematic hop + - Expand columns to see field-level transformations + - Check tags and glossary terms for context + +### Interactive Exercise: Multi-Hop Tracing + +
+ +**Your Challenge**: Find the root cause of data quality issues + +**Step 1**: Open any complex dataset in your DataHub instance +**Step 2**: Click "View Lineage" to see the full graph +**Step 3**: Apply the 5-hop analysis method: + +**5-Hop Lineage Analysis Example:** + +``` +← Hop 5 ← Hop 4 ← Hop 3 ← Hop 2 ← Hop 1 Current Dataset +Raw Source → Data Ingestion → Validation → ETL Process → Final Transform → fct_users_created +(HDFS Files) (Kafka Stream) (Quality Check) (Business Logic) (Aggregation) (Analytics Table) +``` + +**Analysis Questions for Each Hop:** + +1. **Hop 1**: What was the last transformation applied? +2. **Hop 2**: What business logic was implemented? +3. **Hop 3**: What quality checks were performed? +4. **Hop 4**: How was the data originally ingested? +5. **Hop 5**: What is the ultimate source system? + +**Professional Lineage Reading Strategy:** + +1. **Start at the Target**: Begin with the dataset you're investigating +2. **Work Backwards**: Follow each upstream connection systematically +3. **Document Each Hop**: Note the transformation type and business purpose +4. **Identify Critical Points**: Mark systems that could cause widespread impact +5. **Validate Understanding**: Confirm your analysis with data owners when possible + +**Analysis Questions**: + +- Which hop has the most complex transformation? +- Where would you focus if data was missing? +- Which systems are most critical to this pipeline? + +
+ +## Level 3: Understanding Transformation Logic + +The connections between nodes reveal how data is processed: + +### Reading Connection Types + + + + +**One-to-One Relationships**: + +``` +Raw Customer Data → Customer Analytics Table +``` + +**What this means**: Direct processing, usually filtering, aggregation, or enrichment + +**Many-to-One Relationships**: + +``` +Orders + Customers + Products → Sales Analytics +``` + +**What this means**: Data joining and consolidation + +**Analysis Approach**: Look for SQL logic, dbt models, or ETL job definitions to understand the exact transformation. + + + + +**Fan-Out Patterns**: + +``` +Raw Events → [Processing Job] → Multiple Analytics Tables +``` + +**Business Meaning**: One source feeding multiple business use cases + +**Fan-In Patterns**: + +``` +Multiple Sources → [ETL Job] → Single Data Warehouse Table +``` + +**Business Meaning**: Data consolidation from various systems + +**🚨 Risk Assessment**: Fan-out = high impact if source breaks; Fan-in = complex debugging if output is wrong + + + + +**Batch Processing Indicators**: + +- **Daily/Hourly jobs**: Look for time-based naming (daily_sales, hourly_events) +- **Scheduled dependencies**: Jobs that run in sequence +- **Lag indicators**: How fresh is each step in the pipeline? + +**Real-Time Processing Indicators**: + +- **Streaming connections**: Kafka topics, event streams +- **Near real-time**: Minimal processing delay +- **Continuous updates**: Always-fresh data + +**⚡ Performance Insight**: Understand processing schedules to set proper expectations for data freshness. + + + + +## Level 4: Critical Path Analysis + +Identify the most important connections in your data ecosystem: + +### The Critical Path Method + +**High-Impact Paths**: + +- **Customer-facing dashboards** ← Highest priority +- **Revenue reporting** ← Business critical +- **Compliance reporting** ← Regulatory requirement +- **Operational monitoring** ← System health + +**Dependency Mapping**: + +1. **Single points of failure**: One dataset feeding many critical systems +2. **Bottleneck jobs**: Processing that everything depends on +3. **Cross-platform bridges**: Connections between different systems + +### Interactive Exercise: Critical Path Identification + +
+ +**Scenario**: You're responsible for data reliability at TechFlow Analytics + +**Your Task**: Using lineage, identify the top 3 most critical data assets + +**Analysis Framework**: + +``` +Asset Name: ________________________ +Downstream Dependencies: ____________ +Business Impact (1-10): _____________ +Failure Risk (1-10): _______________ +Critical Score: ____________________ +``` + +**Success Criteria**: You can explain why these 3 assets deserve the most monitoring and protection. + +
+ +## Pro Tips for Lineage Reading + +
+ +**Speed Techniques**: + +- **Start broad, then narrow**: Use overview mode first, then zoom into problem areas +- **Follow the business logic**: Revenue flows are usually well-documented and critical +- **Use platform knowledge**: Understand your organization's data architecture patterns + +**Accuracy Boosters**: + +- **Verify with owners**: Lineage might miss manual processes or external dependencies +- **Check recency**: When was lineage last updated? Stale lineage can mislead +- **Cross-reference documentation**: Combine lineage with technical docs and business context + +**🤝 Team Efficiency**: + +- **Document your findings**: Share critical path analysis with your team +- **Create lineage maps**: Visual summaries for non-technical stakeholders +- **Establish monitoring**: Set up alerts for critical path failures + +
+ +## Success Checkpoint + +
+ +**You've mastered lineage reading when you can:** + +**Speed Test**: Trace a 5-hop lineage path in under 3 minutes +**Comprehension Test**: Identify all node types and transformation patterns +**Analysis Test**: Determine the critical path for any business process +**Communication Test**: Explain lineage findings to both technical and business stakeholders + +**Final Validation**: +Choose a complex dataset in your DataHub instance and create a complete lineage analysis including: + +- All upstream dependencies (at least 3 hops) +- Transformation logic at each step +- Critical path assessment +- Potential failure points + +
+ +## What You've Learned + +**Excellent progress!** You can now read lineage graphs like a professional data engineer: + +- **Multi-hop navigation**: Trace complex data flows across systems +- **Node type recognition**: Understand datasets, jobs, and applications +- **Transformation analysis**: Interpret how data changes through processing +- **Critical path identification**: Focus on what matters most for business + +:::tip Mark Your Progress +Check off "Reading Lineage Graphs" in the progress tracker above! You're ready to perform impact analysis. +::: + +--- + +**Next**: Now that you can read lineage expertly, let's learn how to [perform systematic impact analysis](impact-analysis.md) → diff --git a/docs/learn-datahub/lineage/troubleshooting.md b/docs/learn-datahub/lineage/troubleshooting.md new file mode 100644 index 00000000000000..bfdad38be3e90e --- /dev/null +++ b/docs/learn-datahub/lineage/troubleshooting.md @@ -0,0 +1,738 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import InteractiveDiagram from '@site/src/components/InteractiveDiagram'; +import TutorialProgress from '@site/src/components/TutorialProgress'; + +# Lineage Troubleshooting (10 minutes) + +:::info Tutorial Progress +**Step 3 of 3** | **10 minutes** | [Overview](overview.md) → [Reading Lineage](reading-lineage.md) → [Impact Analysis](impact-analysis.md) → **Troubleshooting** +::: + + + +**The Mystery**: Three weeks after the TechFlow migration, you notice something troubling. The new ML pipeline that processes customer segments isn't showing up in DataHub's lineage graph. The data team is asking questions, and you need to figure out why this critical connection is missing. + +**Your Mission**: Master the art of lineage troubleshooting - from diagnosing missing connections to proactively improving lineage quality across your entire data ecosystem. + +## What You'll Master + +By the end of this step, you'll be able to: + +- **Diagnose missing lineage** using systematic debugging techniques +- **Fix ingestion issues** that cause incomplete lineage capture +- **Handle edge cases** like manual processes and external dependencies +- **Establish monitoring** to maintain lineage quality over time + +## The Lineage Troubleshooting Framework + +Professional lineage debugging follows a systematic approach: + + + +## Common Lineage Issues + +Understanding the most frequent problems helps you troubleshoot faster: + +
+ +**🔍 Missing Connections** (60% of issues): + +- New systems not yet configured for metadata ingestion +- Changes in connection strings or authentication +- Processing jobs that don't emit lineage metadata +- Manual data movement processes + +**📊 Incomplete Metadata** (25% of issues): + +- Partial schema information from source systems +- Missing column-level lineage in transformations +- Outdated metadata from infrequent ingestion runs +- Custom applications without metadata instrumentation + +**⚡ Performance Problems** (10% of issues): + +- Lineage graphs too complex to render efficiently +- Ingestion jobs timing out on large metadata volumes +- UI responsiveness issues with deep lineage paths +- Memory constraints during lineage computation + +**🔄 Stale Information** (5% of issues): + +- Metadata not refreshed after system changes +- Cached lineage information showing old connections +- Ingestion schedules not aligned with data pipeline changes +- Manual metadata updates not propagated + +
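+
+Before working through the framework step by step, a quick command-line check can tell you whether DataHub holds any lineage at all for the suspect dataset. This is an optional sketch: it assumes the `datahub` CLI is installed and configured against your instance (for example via `datahub init`), and the URN shown is the TechFlow example, so substitute your own.
+
+```bash
+# Does DataHub have any upstream lineage recorded for the ML model dataset?
+# Assumes `pip install acryl-datahub` and a CLI configured for your instance;
+# the URN below is an example from the TechFlow scenario.
+datahub get \
+  --urn "urn:li:dataset:(urn:li:dataPlatform:mlflow,ml_customer_model,PROD)" \
+  --aspect upstreamLineage
+```
+
+An empty result usually means the connection was never ingested (a missing-connection problem), while lineage that exists but looks out of date points instead at the stale-information category above.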
+ +## Step 1: Identify the Gap + +Systematic gap identification prevents wasted troubleshooting effort: + +### The Gap Analysis Method + + + + +**Gap Documentation Template**: + +``` +Missing Connection: ________________________ +Expected Source: ___________________________ +Expected Target: ___________________________ +Business Process: __________________________ +Technical Implementation: ___________________ +Last Known Working: ________________________ +``` + +**TechFlow ML Pipeline Example**: + +``` +Missing Connection: Customer segments → ML training pipeline +Expected Source: customer_segments (Snowflake table) +Expected Target: ml_customer_model (MLflow model) +Business Process: Nightly model retraining using latest customer data +Technical Implementation: Python script with Snowflake connector +Last Known Working: Never appeared in DataHub lineage +``` + + + + +**Missing Lineage Impact**: + +**Business Impact**: + +- **Incomplete dependency mapping**: Can't assess full impact of customer data changes +- **Risk management gaps**: ML model dependencies not visible to data governance +- **Troubleshooting delays**: Root cause analysis missing critical connections +- **Compliance concerns**: Audit trail incomplete for customer data usage + +**Technical Impact**: + +- **Change management risk**: Schema changes might break ML pipeline unknowingly +- **Monitoring gaps**: No alerts if upstream customer data quality degrades +- **Documentation inconsistency**: Technical architecture docs don't match reality +- **Team coordination issues**: ML team not notified of customer data changes + + + + +**Troubleshooting Priority Matrix**: + +| Business Impact | Technical Complexity | Priority | Action Timeline | +| --------------- | -------------------- | ------------ | ------------------- | +| High | Low | 🔴 Critical | Fix within 24 hours | +| High | High | 🟡 Important | Fix within 1 week | +| Medium | Low | 🟢 Standard | Fix within 2 weeks | +| Medium | High | 🔵 Planned | Fix within 1 month | +| Low | Any | ⚪ Backlog | Fix when convenient | + +**TechFlow ML Pipeline**: High business impact (compliance risk) + Medium complexity = 🟡 Important (1 week timeline) + + + + +## Step 2: Check Data Sources + +Most lineage issues stem from ingestion configuration problems: + +### Ingestion Diagnostics Checklist + +
+ +**🔍 Source System Verification**: + +- [ ] **System connectivity**: Can DataHub reach the source system? +- [ ] **Authentication**: Are credentials valid and permissions sufficient? +- [ ] **Metadata availability**: Does the source system expose lineage information? +- [ ] **Recent changes**: Have there been system updates or migrations? + +**📊 Ingestion Configuration**: + +- [ ] **Recipe accuracy**: Is the ingestion recipe configured correctly? +- [ ] **Scheduling**: Is the ingestion running on the expected schedule? +- [ ] **Scope coverage**: Are all relevant databases/schemas included? +- [ ] **Lineage extraction**: Is lineage extraction enabled in the recipe? + +**⚡ Execution Status**: + +- [ ] **Recent runs**: Has ingestion executed successfully recently? +- [ ] **Error logs**: Are there any ingestion failures or warnings? +- [ ] **Data volume**: Is the expected amount of metadata being ingested? +- [ ] **Processing time**: Are ingestion jobs completing within expected timeframes? + +
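+
+To work through the execution-status items by hand, it helps to re-run the suspect ingestion from the CLI and keep its output; the log searches in Step 3 below assume a capture like this. A minimal sketch, assuming the `datahub` CLI is installed and `snowflake_recipe.yml` is a placeholder for the recipe you are debugging:
+
+```bash
+# Confirm the source plugin is installed, then re-run ingestion and keep the log.
+# `snowflake_recipe.yml` is a placeholder - point it at your own recipe file.
+datahub check plugins | grep -i snowflake
+datahub ingest -c snowflake_recipe.yml 2>&1 | tee ingestion.log
+```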
+ +### Interactive Diagnostics Exercise + +
+ +**Your Challenge**: Diagnose the TechFlow ML pipeline lineage gap + +**Step 1 - Source System Check**: + +``` +ML Pipeline System: Python + MLflow + Snowflake +Expected Metadata: Job definitions, data dependencies, model artifacts +Current Status: ________________________________ +Issues Found: __________________________________ +``` + +**Step 2 - Ingestion Configuration**: + +``` +Ingestion Recipe: ______________________________ +Last Successful Run: ___________________________ +Lineage Extraction Enabled: ____________________ +Scope Includes ML Systems: _____________________ +``` + +**Step 3 - Gap Analysis**: + +``` +Root Cause Hypothesis: _________________________ +Confidence Level (1-10): _______________________ +Next Troubleshooting Step: _____________________ +``` + +
+ +## Step 3: Validate Ingestion + +Deep-dive into ingestion mechanics to find the root cause: + +### Ingestion Debugging Techniques + + + + +**Log Investigation Strategy**: + +**Error Pattern Recognition**: + +```bash +# Common error patterns to search for +grep -i "lineage" ingestion.log +grep -i "connection" ingestion.log +grep -i "timeout" ingestion.log +grep -i "permission" ingestion.log +grep -i "schema" ingestion.log +``` + +**Success Indicators**: + +```bash +# Positive signals in logs +grep "Successfully processed" ingestion.log +grep "Lineage extracted" ingestion.log +grep "Metadata ingested" ingestion.log +``` + +**TechFlow ML Pipeline Investigation**: + +``` +Expected Log Entry: "Successfully extracted lineage from ml_training_job" +Actual Log Entry: "Warning: No lineage metadata found for Python scripts" +Root Cause: Python ML scripts don't emit DataHub-compatible lineage +``` + + + + +**Metadata Completeness Check**: + +**Dataset Metadata**: + +- **Schema information**: Are all columns and types captured? +- **Ownership data**: Are dataset owners properly identified? +- **Custom properties**: Are business-relevant attributes included? +- **Platform details**: Is the source system correctly identified? + +**Lineage Metadata**: + +- **Job information**: Are transformation jobs captured as entities? +- **Input/output mapping**: Are data dependencies clearly defined? +- **Temporal information**: Are processing schedules and frequencies captured? +- **Column-level lineage**: Are field-level transformations tracked? + +**Validation Queries**: + +```sql +-- Check if ML pipeline datasets exist +SELECT * FROM metadata_aspect +WHERE urn LIKE '%ml_customer_model%'; + +-- Verify lineage relationships +SELECT * FROM metadata_aspect +WHERE aspect = 'datasetLineage' +AND urn LIKE '%customer_segments%'; +``` + + + + +**Recipe Optimization**: + +**Lineage Extraction Settings**: + +```yaml +# Enhanced lineage extraction configuration +source: + type: "snowflake" + config: + # Enable comprehensive lineage extraction + include_table_lineage: true + include_view_lineage: true + include_column_lineage: true + + # Capture custom SQL and stored procedures + include_usage_statistics: true + sql_parser_use_external_process: true + + # Extended metadata capture + profiling: + enabled: true + include_field_null_count: true + include_field_min_value: true + include_field_max_value: true +``` + +**Custom Lineage Injection**: + +```python +# For systems that don't auto-emit lineage +from datahub.emitter.mce_builder import make_lineage_mce +from datahub.emitter.rest_emitter import DatahubRestEmitter + +# Create custom lineage for ML pipeline +lineage_mce = make_lineage_mce( + upstream_urns=["urn:li:dataset:(urn:li:dataPlatform:snowflake,customer_segments,PROD)"], + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:mlflow,ml_customer_model,PROD)" +) + +emitter = DatahubRestEmitter(gms_server="http://localhost:8080") +emitter.emit_mce(lineage_mce) +``` + + + + +## Step 4: Handle Edge Cases + +Real-world data pipelines often include scenarios that standard ingestion can't capture: + +### Common Edge Cases and Solutions + +
+ +**🔧 Manual Data Processes**: + +- **Problem**: Excel files, manual data entry, ad-hoc scripts +- **Solution**: Custom metadata emission or documentation-based lineage +- **Implementation**: Create "virtual" datasets representing manual processes + +**🔄 External System Dependencies**: + +- **Problem**: Third-party APIs, vendor data feeds, external databases +- **Solution**: Proxy datasets or external system connectors +- **Implementation**: Document external dependencies as DataHub entities + +**⚡ Real-time Processing**: + +- **Problem**: Streaming pipelines, event-driven architectures, microservices +- **Solution**: Event-based lineage capture or instrumentation +- **Implementation**: Custom lineage emission from application code + +**🎯 Complex Transformations**: + +- **Problem**: Multi-step ETL, custom business logic, conditional processing +- **Solution**: Job-level lineage with detailed transformation documentation +- **Implementation**: Enhanced metadata with transformation descriptions + +
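+
+The resolution framework below covers the first three cases with concrete code. For complex transformations specifically, one option is to represent the multi-step job as an explicit DataHub job entity with its own inputs and outputs; a minimal sketch using the Python SDK might look like the following. The orchestrator, job, and dataset names are illustrative, and DataHub's Airflow and dbt integrations emit this kind of job-level lineage automatically, so hand-written emission is only needed for custom pipelines.
+
+```python
+# Sketch: represent a custom multi-step transformation as a DataHub job entity
+# with explicit input/output lineage. Assumes `pip install acryl-datahub` and a
+# DataHub instance at http://localhost:8080; all names below are examples.
+from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import DataJobInputOutputClass
+
+job_urn = make_data_job_urn(
+    orchestrator="airflow", flow_id="customer_scoring", job_id="build_segments"
+)
+
+io_aspect = DataJobInputOutputClass(
+    inputDatasets=[
+        make_dataset_urn("snowflake", "analytics.customer_segments", "PROD"),
+    ],
+    outputDatasets=[
+        make_dataset_urn("snowflake", "analytics.customer_scores", "PROD"),
+    ],
+)
+
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+emitter.emit(MetadataChangeProposalWrapper(entityUrn=job_urn, aspect=io_aspect))
+```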
+ +### Edge Case Resolution Framework + + + + +**Documentation-Based Lineage**: + +```python +# Create lineage for manual Excel process +from datahub.emitter.mce_builder import make_dataset_urn, make_lineage_mce + +# Define the manual process as a "dataset" +manual_process_urn = make_dataset_urn( + platform="manual", + name="monthly_customer_review_excel", + env="PROD" +) + +# Create lineage from automated data to manual process +lineage_mce = make_lineage_mce( + upstream_urns=["urn:li:dataset:(urn:li:dataPlatform:snowflake,customer_segments,PROD)"], + downstream_urn=manual_process_urn +) + +# Add custom properties to explain the manual process +properties = { + "process_description": "Monthly customer review conducted by business team", + "frequency": "Monthly", + "owner": "customer_success_team", + "documentation_url": "https://wiki.company.com/customer-review-process" +} +``` + +**Benefits**: + +- Complete lineage visibility including manual steps +- Documentation of business processes in technical lineage +- Compliance and audit trail for manual data handling + + + + +**Proxy Dataset Approach**: + +```python +# Create proxy for external API data source +external_api_urn = make_dataset_urn( + platform="external_api", + name="customer_enrichment_service", + env="PROD" +) + +# Document the external dependency +external_properties = { + "api_endpoint": "https://api.customerdata.com/v2/enrichment", + "update_frequency": "Real-time", + "data_provider": "CustomerData Inc.", + "sla": "99.9% uptime", + "contact": "support@customerdata.com" +} + +# Create lineage showing external data flow +lineage_mce = make_lineage_mce( + upstream_urns=[external_api_urn], + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,enriched_customers,PROD)" +) +``` + +**Benefits**: + +- Visibility into external data dependencies +- Risk assessment for third-party data sources +- Contact information for external data issues + + + + +**Code-Level Lineage Emission**: + +```python +# Instrument ML training pipeline +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetLineageType +from datahub.metadata.schema_classes import DatasetLineageClass + +def train_customer_model(): + # Your ML training code here + input_data = load_customer_segments() + model = train_model(input_data) + save_model(model) + + # Emit lineage metadata + emitter = DatahubRestEmitter(gms_server="http://localhost:8080") + + lineage = DatasetLineageClass( + upstreams=[ + { + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,customer_segments,PROD)", + "type": DatasetLineageType.TRANSFORMED + } + ] + ) + + model_urn = "urn:li:dataset:(urn:li:dataPlatform:mlflow,ml_customer_model,PROD)" + emitter.emit_mcp( + MetadataChangeProposalWrapper( + entityType="dataset", + entityUrn=model_urn, + aspectName="datasetLineage", + aspect=lineage + ) + ) +``` + +**Benefits**: + +- Real-time lineage updates as code executes +- Accurate capture of dynamic data dependencies +- Integration with application deployment pipelines + + + + +## Step 5: Implement Monitoring + +Proactive lineage quality management prevents future troubleshooting: + +### Lineage Quality Monitoring Framework + +
+ +**📊 Quality Metrics**: + +- **Coverage**: Percentage of data assets with complete lineage +- **Freshness**: How recently lineage information was updated +- **Accuracy**: Validation of lineage against known data flows +- **Completeness**: Presence of both upstream and downstream connections + +**🚨 Alert Conditions**: + +- **Missing lineage**: New datasets without any lineage connections +- **Stale metadata**: Lineage not updated within expected timeframe +- **Broken connections**: Previously connected systems showing gaps +- **Ingestion failures**: Metadata extraction jobs failing repeatedly + +**🔄 Maintenance Tasks**: + +- **Regular validation**: Quarterly review of critical data lineage +- **Configuration updates**: Adjust ingestion recipes as systems evolve +- **Documentation sync**: Keep manual lineage documentation current +- **Team training**: Ensure new team members understand lineage practices + +
+ +### Monitoring Implementation + + + + +**Lineage Quality Dashboard**: + +```sql +-- Lineage coverage metrics +SELECT + platform, + COUNT(*) as total_datasets, + COUNT(CASE WHEN has_upstream_lineage THEN 1 END) as with_upstream, + COUNT(CASE WHEN has_downstream_lineage THEN 1 END) as with_downstream, + ROUND(100.0 * COUNT(CASE WHEN has_upstream_lineage THEN 1 END) / COUNT(*), 2) as upstream_coverage_pct +FROM dataset_lineage_summary +GROUP BY platform +ORDER BY upstream_coverage_pct DESC; + +-- Stale lineage detection +SELECT + dataset_urn, + last_lineage_update, + DATEDIFF(CURRENT_DATE, last_lineage_update) as days_since_update +FROM dataset_metadata +WHERE DATEDIFF(CURRENT_DATE, last_lineage_update) > 7 +ORDER BY days_since_update DESC; +``` + +**Automated Alerts**: + +```python +# Lineage quality monitoring script +def check_lineage_quality(): + critical_datasets = [ + "customer_segments", + "fct_users_created", + "ml_customer_model" + ] + + for dataset in critical_datasets: + lineage_age = get_lineage_age(dataset) + if lineage_age > 7: # days + send_alert(f"Stale lineage for {dataset}: {lineage_age} days old") + + if not has_upstream_lineage(dataset): + send_alert(f"Missing upstream lineage for {dataset}") +``` + + + + +**Quarterly Lineage Review Process**: + +**Review Checklist**: + +- [ ] **Critical path validation**: Verify lineage for top 10 most important datasets +- [ ] **New system integration**: Ensure recently added systems appear in lineage +- [ ] **Accuracy spot checks**: Validate 5% random sample against known data flows +- [ ] **Documentation updates**: Sync lineage with architecture documentation + +**Validation Template**: + +``` +Dataset: ___________________________________ +Expected Upstream Count: ____________________ +Actual Upstream Count: ______________________ +Expected Downstream Count: __________________ +Actual Downstream Count: ____________________ +Discrepancies Found: ________________________ +Action Required: ____________________________ +Validation Date: ____________________________ +Reviewer: ___________________________________ +``` + + + + +**Lineage Governance Framework**: + +**Roles and Responsibilities**: + +- **Data Engineers**: Ensure new pipelines emit proper lineage metadata +- **Analytics Engineers**: Validate lineage for dbt models and transformations +- **Data Platform Team**: Maintain ingestion infrastructure and monitoring +- **Data Governance**: Review lineage completeness for compliance requirements + +**Process Integration**: + +- **Code Review**: Include lineage validation in data pipeline code reviews +- **Deployment Gates**: Require lineage metadata before production deployment +- **Incident Response**: Use lineage for root cause analysis and impact assessment +- **Architecture Reviews**: Validate lineage against system design documents + +**Training and Documentation**: + +- **Onboarding**: Include lineage best practices in new team member training +- **Playbooks**: Document troubleshooting procedures for common lineage issues +- **Best Practices**: Maintain guidelines for lineage metadata emission +- **Tool Training**: Regular sessions on DataHub lineage features and capabilities + + + + +## Success Checkpoint + +
+ +**You've mastered lineage troubleshooting when you can:** + +**Diagnostic Skills**: + +- Systematically identify and categorize lineage gaps +- Debug ingestion issues using logs and configuration analysis +- Validate metadata completeness and accuracy +- Prioritize troubleshooting efforts based on business impact + +**Technical Skills**: + +- Configure ingestion recipes for optimal lineage extraction +- Implement custom lineage emission for edge cases +- Handle manual processes and external system dependencies +- Instrument applications for real-time lineage updates + +**Operational Skills**: + +- Establish monitoring and alerting for lineage quality +- Create validation processes for ongoing lineage accuracy +- Integrate lineage governance into team workflows +- Train teams on lineage best practices and troubleshooting + +**Final Validation**: +Identify a lineage gap in your organization and resolve it using the systematic troubleshooting framework you've learned. + +
+ +## Mission Accomplished: Lineage Mastery Complete! + +🎉 **Congratulations!** You've completed the entire Data Lineage & Impact Analysis series and achieved expert-level proficiency: + +**🔍 Reading Lineage Graphs**: Navigate any complexity with confidence +**📊 Performing Impact Analysis**: Systematically assess and communicate change risks +**🔧 Lineage Troubleshooting**: Diagnose and resolve any lineage quality issue + +**Your New Capabilities**: + +- **Lead system migrations** with comprehensive impact analysis +- **Troubleshoot data issues** using lineage-driven root cause analysis +- **Improve data governance** through complete lineage visibility +- **Mentor teams** on lineage best practices and troubleshooting techniques + +**Real-World Impact**: You're now equipped to handle the most complex data lineage challenges in production environments, from multi-system migrations to compliance audits to incident response. + +:::tip Mark Your Progress +Check off "Lineage Troubleshooting" in the progress tracker above! You've completed the entire lineage mastery series! 🎉 +::: + +--- + +**🎯 Ready for More?** Continue your DataHub expertise journey with: + +- **Data Governance Fundamentals (coming soon)** - Master ownership, classification, and business glossary +- **Data Quality & Monitoring (coming soon)** - Learn assertions, health dashboards, and incident management +- **Data Ingestion Mastery (coming soon)** - Deep dive into recipes, stateful ingestion, and profiling diff --git a/docs/learn-datahub/overview.md b/docs/learn-datahub/overview.md new file mode 100644 index 00000000000000..1854210e5d6859 --- /dev/null +++ b/docs/learn-datahub/overview.md @@ -0,0 +1,227 @@ +--- +title: "Learn DataHub" +--- + +# Learn DataHub + +Master DataHub through a comprehensive professional development journey. Follow a realistic business scenario as you progress from basic data discovery to advanced governance and compliance management. + +## Professional Data Management Journey + +**Your Role**: You're a data professional tasked with implementing enterprise-grade metadata management. This tutorial series follows realistic scenarios that data teams encounter when establishing DataHub in production environments. + +**The Business Context**: A growing technology company with data distributed across multiple platforms - Kafka for streaming, Hive for analytics, HDFS for storage. The organization needs to transition from ad-hoc data usage to systematic data governance and discovery. + +**Your Objective**: Implement DataHub to solve real data management challenges: discovery bottlenecks, compliance requirements, quality issues, and system integration complexity. + +--- + +## Chapter 1: Foundation (30 minutes) + +### DataHub Quickstart + +**The Challenge**: You need to quickly assess the organization's data landscape and locate specific user engagement metrics for an executive presentation. The data exists across multiple systems, but there's no centralized metadata management. 
+ +**Your Implementation**: + +- [Overview](quickstart/overview) - Understanding the business requirements +- [Setup DataHub](quickstart/setup) (5 min) - Deploy the metadata platform locally +- [First Ingestion](quickstart/first-ingestion) (10 min) - Connect multi-platform data sources +- [Discovery Basics](quickstart/discovery-basics) (10 min) - Implement systematic data discovery +- [Your First Lineage](quickstart/first-lineage) (5 min) - Analyze data dependencies and quality + +**Outcome**: Establish DataHub as the central metadata repository and demonstrate its value for data discovery and governance. + +--- + +## Chapter 2: Scaling Discovery (45 minutes) + +### Data Discovery & Search + +**The Challenge**: Three months later, the organization has grown to 50+ datasets across 8 platforms. New team members spend days trying to find the right data, and analysts frequently use incorrect or outdated datasets for reports. + +**Business Impact**: + +- **Time Waste**: Data scientists spend 60% of their time searching for data instead of analyzing +- **Inconsistent Metrics**: Different teams calculate customer metrics differently, leading to conflicting reports +- **Compliance Risk**: Teams unknowingly use datasets containing PII without proper approvals + +**DataHub Solution**: Implement systematic data discovery that enables self-service analytics while maintaining governance controls. + +**Your Journey**: + +- **Advanced Search Techniques** (15 min) - Enable teams to find data using business terms, not technical names +- **Understanding Dataset Profiles** (20 min) - Provide rich context so users choose the right data confidently +- **Collaborative Discovery** (10 min) - Build institutional knowledge through documentation and Q&A + +**Organizational Outcome**: Reduce data discovery time from days to minutes, while ensuring teams use trusted, well-documented datasets. + +--- + +## Chapter 3: Managing Dependencies (40 minutes) + +### Data Lineage & Impact Analysis + +**The Challenge**: The organization's customer analytics pipeline needs a major upgrade to support real-time personalization. However, this pipeline feeds 15+ downstream systems including customer-facing dashboards, ML models, and regulatory reports. + +**Business Impact**: + +- **Change Risk**: Modifying core data without understanding dependencies could break critical business processes +- **Coordination Overhead**: Manual impact assessment requires weeks of meetings across multiple teams +- **Incident Response**: When issues occur, root cause analysis takes hours without clear data flow visibility + +**DataHub Solution**: Use comprehensive lineage tracking to plan changes confidently and respond to incidents quickly. + +**Your Journey**: + +- **Reading Lineage Graphs** (15 min) - Navigate complex data flows spanning multiple systems and teams +- **Performing Impact Analysis** (15 min) - Systematically assess risks and coordinate changes across stakeholders +- **Lineage Troubleshooting** (10 min) - Ensure lineage accuracy for reliable decision-making + +**Organizational Outcome**: Execute complex data migrations with zero business disruption and reduce incident response time by 75%. + +--- + +## Chapter 4: Establishing Governance (50 minutes) + +### Data Governance Fundamentals + +**The Challenge**: The organization is preparing for SOC 2 compliance and a potential acquisition. Auditors need clear data ownership, classification, and business definitions. 
Currently, critical datasets have unclear ownership and inconsistent business terminology. + +**Business Impact**: + +- **Compliance Gaps**: Inability to demonstrate data stewardship and access controls +- **Business Confusion**: Same terms mean different things to different teams (e.g., "active customer") +- **Accountability Issues**: When data quality problems occur, no clear owner to resolve them + +**DataHub Solution**: Implement systematic data governance that scales with organizational growth. + +**Your Journey**: + +- **Ownership & Stewardship** (15 min) - Establish clear accountability for every critical dataset +- **Classification & Tagging** (20 min) - Organize data by sensitivity, domain, and business purpose +- **Business Glossary Management** (15 min) - Create shared vocabulary that aligns technical and business teams + +**Organizational Outcome**: Pass compliance audits confidently and accelerate cross-team collaboration through shared understanding. + +--- + +## Chapter 5: Ensuring Reliability (45 minutes) + +### Data Quality & Monitoring + +**The Challenge**: Organizational growth has led to data quality issues affecting customer experience. Revenue dashboards show inconsistent numbers, ML models receive corrupted training data, and customer support can't trust the data they see. + +**Business Impact**: + +- **Revenue Impact**: Incorrect pricing data led to $50K in lost revenue last quarter +- **Customer Experience**: Personalization algorithms fail due to poor data quality +- **Executive Confidence**: Leadership questions all data-driven decisions due to past inaccuracies + +**DataHub Solution**: Implement proactive data quality management that prevents issues before they impact business operations. + +**Your Journey**: + +- **Setting Up Data Assertions** (20 min) - Automated quality checks that catch issues immediately +- **Data Health Dashboard** (15 min) - Centralized monitoring that provides early warning of problems +- **Incident Management** (10 min) - Systematic response processes that minimize business impact + +**Organizational Outcome**: Achieve 99.9% data reliability and restore executive confidence in data-driven decisions. + +--- + +## Chapter 6: Platform Mastery (60 minutes) + +### Data Ingestion Mastery + +**The Challenge**: The organization is acquiring two companies with different data architectures. You need to integrate 20+ new data sources while maintaining performance and ensuring consistent metadata quality across all systems. + +**Business Impact**: + +- **Integration Complexity**: Manual metadata management doesn't scale to hundreds of datasets +- **Performance Degradation**: Naive ingestion approaches overwhelm DataHub and source systems +- **Metadata Quality**: Inconsistent metadata leads to poor user experience and governance gaps + +**DataHub Solution**: Implement production-grade ingestion patterns that scale efficiently and maintain high metadata quality. + +**Your Journey**: + +- **Understanding Recipes** (20 min) - Configure ingestion for complex, heterogeneous environments +- **Stateful Ingestion Patterns** (20 min) - Optimize for performance and minimize resource usage +- **Data Profiling & Enrichment** (20 min) - Automatically generate rich metadata that enhances discoverability + +**Organizational Outcome**: Successfully integrate acquired companies' data with zero performance impact and improved metadata quality. 
+ +--- + +## Chapter 7: Compliance & Privacy (35 minutes) + +### Privacy & Compliance + +**The Challenge**: The organization operates in healthcare and finance sectors, requiring GDPR, HIPAA, and SOX compliance. Regulators need proof of data handling practices, and privacy teams need to track PII across all systems. + +**Business Impact**: + +- **Regulatory Risk**: Fines up to 4% of revenue for GDPR violations +- **Audit Overhead**: Manual compliance reporting takes weeks of effort quarterly +- **Privacy Breaches**: Inability to locate and protect sensitive data across systems + +**DataHub Solution**: Implement automated compliance workflows that provide continuous regulatory readiness. + +**Your Journey**: + +- **PII Detection & Classification** (15 min) - Automatically identify and classify sensitive data across all systems +- **Compliance Forms & Workflows** (20 min) - Streamline regulatory reporting and audit preparation + +**Organizational Outcome**: Achieve continuous compliance readiness and reduce audit preparation time by 90%. + +--- + +## Tutorial Structure + +Each tutorial follows a consistent, practical format: + +**Learning Objectives**: Clear outcomes you'll achieve +**Time Estimates**: Realistic completion times +**Hands-on Exercises**: Real scenarios with sample data +**Success Checkpoints**: Verify your progress +**What's Next**: Logical progression to related topics + +## Learning Paths + +### Complete Professional Journey (Recommended) + +Follow the full narrative from startup to enterprise-scale data management: +**Chapters 1-7** → Experience the complete organizational transformation + +### Role-Focused Paths + +**Data Analysts & Scientists** +**Chapters 1-3** → Master discovery, search, and lineage analysis for confident data usage + +**Data Engineers & Platform Teams** +**Chapters 1, 3, 5-6** → Focus on technical implementation, quality, and ingestion mastery + +**Data Governance & Compliance Teams** +**Chapters 1, 4, 7** → Establish governance frameworks and compliance processes + +**Leadership & Strategy Teams** +**Chapter overviews only** → Understand business value and organizational impact + +## Getting Help + +**During tutorials:** + +- Each page includes troubleshooting sections +- Common issues and solutions are documented +- Links to relevant documentation sections + +**Community support:** + +- [DataHub Slack Community](https://datahub.com/slack) +- [Full Documentation](../) +- [GitHub Issues](https://github.com/datahub-project/datahub/issues) + +--- + +**Ready to start learning?** Begin with the [DataHub Quickstart](quickstart/overview) → diff --git a/docs/learn-datahub/privacy/overview.md b/docs/learn-datahub/privacy/overview.md new file mode 100644 index 00000000000000..15679db6925148 --- /dev/null +++ b/docs/learn-datahub/privacy/overview.md @@ -0,0 +1,184 @@ +# Privacy & Compliance + + + +## Professional Privacy Protection at Scale + +**Time Required**: 35 minutes | **Skill Level**: Advanced + +### Your Challenge: Comprehensive Privacy Management + +You're a **Privacy Engineering Lead** at a global technology company. Your organization processes personal data from millions of users across multiple jurisdictions, subject to GDPR, CCPA, and other privacy regulations. 
Current privacy management is fragmented and reactive: + +- **Manual PII discovery** that misses sensitive data in new systems +- **Inconsistent privacy controls** across different data platforms +- **Slow response** to data subject requests and regulatory inquiries +- **Limited visibility** into personal data processing activities + +**The Business Impact**: A recent privacy audit revealed untracked personal data in 15 different systems, resulting in a $2.8M regulatory fine and significant remediation costs. Leadership demands a proactive, comprehensive privacy management approach. + +### What You'll Learn + +This tutorial series teaches you to implement enterprise-grade privacy protection using DataHub's privacy and compliance features: + +#### Chapter 1: PII Detection (12 minutes) + +**Business Challenge**: Hidden personal data creating compliance risks across the organization +**Your Journey**: + +- Implement automated PII discovery across all data systems +- Configure intelligent classification for different types of personal data +- Set up continuous monitoring for new PII in data pipelines + **Organizational Outcome**: Complete visibility into personal data across your data landscape + +#### Chapter 2: Privacy Controls (12 minutes) + +**Business Challenge**: Inconsistent privacy protection and access controls for personal data +**Your Journey**: + +- Implement data minimization and purpose limitation controls +- Configure automated privacy impact assessments +- Set up consent management and data retention policies + **Organizational Outcome**: Systematic privacy protection aligned with regulatory requirements + +#### Chapter 3: Compliance Workflows (11 minutes) + +**Business Challenge**: Manual compliance processes that can't scale with regulatory demands +**Your Journey**: + +- Automate data subject request fulfillment (access, deletion, portability) +- Implement regulatory reporting and audit trail generation +- Set up cross-border data transfer compliance monitoring + **Organizational Outcome**: Efficient compliance operations that reduce regulatory risk and operational overhead + +### Interactive Learning Experience + +Each chapter includes: + +- **Real Privacy Scenarios**: Based on actual regulatory compliance challenges +- **Hands-on Implementation**: Using DataHub's privacy management features +- **Regulatory Alignment**: Mapping to GDPR, CCPA, and other privacy laws +- **Audit Preparation**: Building evidence for regulatory compliance + +### Understanding Privacy Compliance Impact + +Privacy violations carry severe consequences: + +- **GDPR Fines**: Up to 4% of global annual revenue or €20M (whichever is higher) +- **CCPA Penalties**: Up to $7,500 per violation for intentional violations +- **Reputational Damage**: Loss of customer trust and competitive advantage +- **Operational Disruption**: Emergency remediation and system changes + +**Privacy-by-Design Benefits**: + +- **Regulatory Compliance**: Proactive adherence to privacy laws +- **Risk Reduction**: Early identification and mitigation of privacy risks +- **Operational Efficiency**: Automated compliance processes +- **Customer Trust**: Transparent and responsible data handling + +### DataHub Privacy Features Overview + +DataHub provides comprehensive privacy management through: + +
+ + + +
+ +**Key Privacy Capabilities**: + +- **🔍 Automated PII Discovery**: ML-powered detection of personal data across all systems +- **🛡️ Privacy Controls**: Automated enforcement of data minimization and purpose limitation +- **📋 Compliance Automation**: Streamlined data subject request fulfillment +- **📊 Privacy Analytics**: Comprehensive reporting and audit trail generation +- **🌍 Cross-Border Compliance**: Monitoring and controls for international data transfers + +### Privacy Regulatory Landscape + +**Major Privacy Regulations**: + +- **GDPR (EU)**: Comprehensive data protection with strict consent and rights requirements +- **CCPA (California)**: Consumer privacy rights including access, deletion, and opt-out +- **LGPD (Brazil)**: Brazilian data protection law similar to GDPR +- **PIPEDA (Canada)**: Privacy protection for personal information in commercial activities +- **Sector-Specific**: HIPAA (healthcare), FERPA (education), GLBA (financial services) + +**Common Privacy Requirements**: + +- **Lawful Basis**: Legal justification for processing personal data +- **Data Minimization**: Collecting only necessary personal data +- **Purpose Limitation**: Using data only for stated purposes +- **Storage Limitation**: Retaining data only as long as necessary +- **Individual Rights**: Access, rectification, erasure, portability, and objection + +### Prerequisites + +- Completed [Data Governance Fundamentals](../governance/overview.md) +- Understanding of privacy regulations (GDPR, CCPA, etc.) +- Access to DataHub instance with sample personal data +- Familiarity with data classification and governance concepts +- Basic knowledge of privacy engineering principles + +### Privacy Maturity Assessment + +**Level 1 - Reactive**: Manual privacy processes, compliance gaps +**Level 2 - Managed**: Basic privacy controls, some automation +**Level 3 - Proactive**: Comprehensive privacy program, systematic controls +**Level 4 - Optimized**: Advanced privacy engineering, predictive compliance +**Level 5 - Privacy-by-Design**: Privacy embedded in all data processes + +### Success Metrics + +**Compliance Metrics**: + +- **PII Discovery Coverage**: Percentage of systems with automated PII detection +- **Data Subject Request Response Time**: Speed of fulfilling privacy requests +- **Privacy Violation Rate**: Number of privacy incidents and regulatory findings +- **Audit Readiness**: Time required to respond to regulatory inquiries + +**Operational Metrics**: + +- **Privacy Assessment Automation**: Percentage of automated privacy impact assessments +- **Consent Management Coverage**: Tracking of consent across data processing activities +- **Cross-Border Transfer Compliance**: Adherence to international data transfer requirements +- **Privacy Training Completion**: Staff awareness and competency in privacy practices + +### Ready to Begin? + +Start your privacy compliance journey by implementing automated PII detection that provides complete visibility into personal data across your organization. 
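+
+If you want a concrete starting point before Chapter 1, automated classification can be switched on directly in an ingestion recipe so that PII detection runs while metadata is extracted. The snippet below is a rough sketch for a Snowflake source: the connection values are placeholders, and the available classification options vary by connector and CLI version, so check the ingestion and classification docs for your release before relying on it.
+
+```yaml
+# Sketch: enable automated classification (PII detection) during ingestion.
+# Connection details are placeholders; classification options differ by
+# connector and CLI version - verify against the docs for your release.
+source:
+  type: snowflake
+  config:
+    account_id: "<your-account>"
+    warehouse: "COMPUTE_WH"
+    username: "${SNOWFLAKE_USER}"
+    password: "${SNOWFLAKE_PASS}"
+    profiling:
+      enabled: true # optional: column profiling complements classification
+    classification:
+      enabled: true
+      classifiers:
+        - type: datahub # DataHub's built-in info-type classifier
+```
+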
+ + diff --git a/docs/learn-datahub/quality/data-assertions.md b/docs/learn-datahub/quality/data-assertions.md new file mode 100644 index 00000000000000..b517709300e10f --- /dev/null +++ b/docs/learn-datahub/quality/data-assertions.md @@ -0,0 +1,367 @@ +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; + +# Data Assertions + + + +## Building Automated Data Quality Checks + +**Time Required**: 15 minutes + +### The Assertion Challenge + +Your data pipelines are processing customer transactions, but you're discovering quality issues after they've already impacted business operations: + +- **Missing customer IDs** causing failed order processing +- **Negative transaction amounts** appearing in financial reports +- **Duplicate records** inflating customer metrics +- **Stale data** making real-time dashboards unreliable + +**Real-World Impact**: Last week, a batch of transactions with null customer IDs caused the customer service system to crash, resulting in 4 hours of downtime and frustrated customers. + +### Understanding DataHub Assertions + +Assertions are automated quality checks that continuously validate your data against business rules: + +
+ +
+ +**Assertion Types**: + +- **Completeness**: Ensure required fields are not null or empty +- **Uniqueness**: Validate primary keys and unique constraints +- **Range Validation**: Check numeric values fall within expected bounds +- **Freshness**: Verify data is updated within acceptable time windows +- **Referential Integrity**: Ensure foreign key relationships are valid +- **Custom Rules**: Implement business-specific validation logic + +### Exercise 1: Create Completeness Assertions + +Ensure critical fields always contain valid data: + +#### Step 1: Navigate to Assertions + +1. **Open DataHub** and search for "fct_users_created" +2. **Click on the dataset** to open its profile page +3. **Go to the "Quality" tab** and click "Add Assertion" +4. **Select "Column Assertion"** to validate specific fields + +#### Step 2: Create Customer ID Completeness Check + +**Assertion Configuration**: + +- **Name**: "Customer ID Required" +- **Description**: "All records must have a valid customer_id" +- **Column**: `customer_id` +- **Type**: "Not Null" +- **Severity**: "Error" (blocks downstream processing) +- **Schedule**: "Every 15 minutes" + +**SQL Logic**: + +```sql +SELECT COUNT(*) as null_count +FROM fct_users_created +WHERE customer_id IS NULL + OR customer_id = '' +``` + +**Success Criteria**: `null_count = 0` + +#### Step 3: Add Email Validation + +**Assertion Configuration**: + +- **Name**: "Valid Email Format" +- **Description**: "Email addresses must follow standard format" +- **Column**: `email` +- **Type**: "Custom SQL" +- **Severity**: "Warning" + +**SQL Logic**: + +```sql +SELECT COUNT(*) as invalid_emails +FROM fct_users_created +WHERE email IS NOT NULL + AND email NOT REGEXP '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' +``` + +**Success Criteria**: `invalid_emails = 0` + +### Exercise 2: Implement Range Validations + +Validate that numeric values fall within business-acceptable ranges: + +#### Step 1: Transaction Amount Validation + +For financial data, create bounds checking: + +**Assertion Configuration**: + +- **Name**: "Valid Transaction Amount" +- **Description**: "Transaction amounts must be positive and reasonable" +- **Column**: `transaction_amount` +- **Type**: "Range Check" +- **Min Value**: 0.01 (no zero or negative transactions) +- **Max Value**: 100000.00 (flag unusually large transactions) +- **Severity**: "Error" + +#### Step 2: Date Range Validation + +Ensure dates are realistic and current: + +**Assertion Configuration**: + +- **Name**: "Recent Transaction Date" +- **Description**: "Transaction dates should be within the last 2 years" +- **Column**: `transaction_date` +- **Type**: "Custom SQL" +- **Severity**: "Warning" + +**SQL Logic**: + +```sql +SELECT COUNT(*) as invalid_dates +FROM customer_transactions +WHERE transaction_date < CURRENT_DATE - INTERVAL '2 years' + OR transaction_date > CURRENT_DATE + INTERVAL '1 day' +``` + +### Exercise 3: Create Uniqueness Assertions + +Prevent duplicate records that can skew analytics: + +#### Step 1: Primary Key Uniqueness + +**Assertion Configuration**: + +- **Name**: "Unique Transaction ID" +- **Description**: "Each transaction must have a unique identifier" +- **Column**: `transaction_id` +- **Type**: "Uniqueness" +- **Severity**: "Error" +- **Action**: "Block pipeline on failure" + +#### Step 2: Business Key Uniqueness + +For composite business keys: + +**Assertion Configuration**: + +- **Name**: "Unique Customer-Date Combination" +- **Description**: "One transaction per customer per day (business rule)" +- 
**Type**: "Custom SQL" +- **Severity**: "Warning" + +**SQL Logic**: + +```sql +SELECT customer_id, DATE(transaction_date) as txn_date, COUNT(*) as duplicate_count +FROM customer_transactions +GROUP BY customer_id, DATE(transaction_date) +HAVING COUNT(*) > 1 +``` + +### Exercise 4: Implement Freshness Checks + +Ensure data is updated according to business requirements: + +#### Step 1: Data Freshness Assertion + +**Assertion Configuration**: + +- **Name**: "Customer Data Freshness" +- **Description**: "Customer data must be updated within 4 hours" +- **Type**: "Freshness" +- **Column**: `last_updated_timestamp` +- **Max Age**: "4 hours" +- **Severity**: "Error" + +#### Step 2: Partition Freshness + +For partitioned tables: + +**SQL Logic**: + +```sql +SELECT MAX(partition_date) as latest_partition +FROM customer_transactions +WHERE partition_date >= CURRENT_DATE - INTERVAL '1 day' +``` + +**Success Criteria**: `latest_partition >= CURRENT_DATE` + +### Exercise 5: Custom Business Rule Assertions + +Implement organization-specific validation logic: + +#### Step 1: Customer Lifecycle Validation + +**Business Rule**: "Customers must have a registration date before their first transaction" + +**Assertion Configuration**: + +- **Name**: "Valid Customer Lifecycle" +- **Description**: "Registration must precede first transaction" +- **Type**: "Custom SQL" +- **Severity**: "Error" + +**SQL Logic**: + +```sql +SELECT COUNT(*) as lifecycle_violations +FROM customer_transactions ct +JOIN customer_profiles cp ON ct.customer_id = cp.customer_id +WHERE ct.transaction_date < cp.registration_date +``` + +#### Step 2: Revenue Recognition Rules + +**Business Rule**: "Subscription revenue must be recognized monthly" + +**SQL Logic**: + +```sql +SELECT COUNT(*) as recognition_errors +FROM revenue_transactions +WHERE product_type = 'subscription' + AND recognition_method != 'monthly' +``` + +### Understanding Assertion Results + +DataHub provides comprehensive assertion monitoring: + +**Assertion Status Indicators**: + +- **Passing**: All validation rules met +- **Warning**: Minor issues detected, investigate soon +- **Failing**: Critical issues found, immediate attention required +- **Paused**: Assertion temporarily disabled +- 🔄 **Running**: Currently executing validation + +**Assertion History**: + +- Track assertion results over time +- Identify patterns in quality issues +- Measure quality improvement trends +- Generate compliance reports + +### Best Practices for Data Assertions + +#### 1. Start with Critical Business Rules + +Focus on assertions that protect: + +- Revenue calculations +- Customer data integrity +- Regulatory compliance requirements +- Downstream system dependencies + +#### 2. Use Appropriate Severity Levels + +- **Error**: Critical issues that must block processing +- **Warning**: Issues that need investigation but don't stop pipelines +- **Info**: Monitoring checks for trend analysis + +#### 3. Optimize Assertion Performance + +- Use efficient SQL queries +- Leverage table statistics when possible +- Schedule assertions based on data update frequency +- Consider sampling for large datasets + +#### 4. 
Provide Clear Context + +- Write descriptive assertion names and descriptions +- Document business rationale for each rule +- Include remediation guidance +- Link to relevant business stakeholders + +### Measuring Assertion Effectiveness + +Track these key metrics: + +- **Assertion Coverage**: Percentage of critical columns with assertions +- **Pass Rate**: Percentage of assertions passing over time +- **Detection Speed**: Time from data issue to assertion failure +- **False Positive Rate**: Assertions failing due to rule issues +- **Business Impact Prevention**: Issues caught before affecting operations + +### Advanced Assertion Techniques + +#### 1. Statistical Assertions + +Monitor data distributions and detect anomalies: + +```sql +-- Detect unusual spikes in transaction volume +SELECT COUNT(*) as daily_transactions +FROM customer_transactions +WHERE DATE(transaction_date) = CURRENT_DATE +HAVING COUNT(*) > ( + SELECT AVG(daily_count) * 2 + FROM daily_transaction_stats + WHERE date >= CURRENT_DATE - INTERVAL '30 days' +) +``` + +#### 2. Cross-Dataset Assertions + +Validate consistency across related datasets: + +```sql +-- Ensure customer counts match between systems +SELECT ABS( + (SELECT COUNT(DISTINCT customer_id) FROM crm_customers) - + (SELECT COUNT(DISTINCT customer_id) FROM billing_customers) +) as customer_count_diff +HAVING customer_count_diff <= 10 -- Allow small variance +``` + +#### 3. Time-Series Assertions + +Monitor trends and seasonal patterns: + +```sql +-- Detect unusual drops in daily active users +SELECT current_dau, previous_week_avg +FROM ( + SELECT + COUNT(DISTINCT user_id) as current_dau, + LAG(COUNT(DISTINCT user_id), 7) OVER (ORDER BY date) as previous_week_avg + FROM user_activity + WHERE date = CURRENT_DATE +) +WHERE current_dau < previous_week_avg * 0.8 -- 20% drop threshold +``` + +### Next Steps + +With automated assertions in place, you're ready to build comprehensive quality monitoring dashboards that provide real-time visibility into your data health. + + diff --git a/docs/learn-datahub/quality/incident-management.md b/docs/learn-datahub/quality/incident-management.md new file mode 100644 index 00000000000000..54b74d9b1fe4a1 --- /dev/null +++ b/docs/learn-datahub/quality/incident-management.md @@ -0,0 +1,363 @@ +# Incident Management + + + +## Rapid Response to Data Quality Issues + +**Time Required**: 10 minutes + +### The Incident Response Challenge + +Your quality monitoring is detecting issues, but your response process is still chaotic: + +- **Delayed notifications** mean issues impact business before teams respond +- **Unclear ownership** leads to finger-pointing instead of resolution +- **Manual escalation** processes that don't scale with your data growth +- **No systematic learning** from incidents to prevent recurrence + +**Real-World Impact**: A data quality issue in the customer segmentation pipeline caused the marketing team to send promotional emails to churned customers, resulting in negative brand impact and a 2-day emergency response to identify and fix the root cause. + +### Understanding Incident Management + +Systematic incident management transforms chaotic fire-fighting into structured, efficient response: + +
+ +
+ +**Incident Management Components**: + +- **🚨 Automated Detection**: Intelligent alerting based on quality thresholds +- **📋 Structured Response**: Standardized workflows for different incident types +- **SLA Management**: Time-bound response and resolution commitments +- **Impact Assessment**: Business impact evaluation and prioritization +- **🔄 Post-Incident Review**: Learning and improvement processes + +### Exercise 1: Set Up Incident Detection + +Configure intelligent alerting that triggers appropriate response levels: + +#### Step 1: Define Incident Severity Levels + +**Severity Classification**: + +- **🔴 Critical (P0)**: Complete data unavailability or major accuracy issues affecting revenue/customers +- **🟡 High (P1)**: Significant quality degradation affecting business operations +- **🟠 Medium (P2)**: Quality issues affecting specific use cases or reports +- **🟢 Low (P3)**: Minor quality issues with workarounds available + +#### Step 2: Configure Automated Detection Rules + +**Critical Incident Triggers**: + +```sql +-- Critical: Customer data pipeline failure +SELECT COUNT(*) as missing_records +FROM customer_daily_summary +WHERE date = CURRENT_DATE +HAVING COUNT(*) = 0; -- No records for today = P0 incident + +-- Critical: Financial data accuracy issue +SELECT COUNT(*) as revenue_discrepancy +FROM revenue_reconciliation +WHERE ABS(system_a_total - system_b_total) > 10000 -- $10K+ discrepancy + AND reconciliation_date = CURRENT_DATE; +``` + +**High Priority Triggers**: + +```sql +-- High: Significant data freshness delay +SELECT MAX(last_updated) as latest_update +FROM critical_datasets +WHERE dataset_name = 'customer_transactions' +HAVING latest_update < CURRENT_TIMESTAMP - INTERVAL '2 hours'; + +-- High: Assertion failure rate spike +SELECT failure_rate +FROM ( + SELECT COUNT(CASE WHEN status = 'FAIL' THEN 1 END) * 100.0 / COUNT(*) as failure_rate + FROM assertion_results + WHERE created_at >= CURRENT_TIMESTAMP - INTERVAL '1 hour' +) +WHERE failure_rate > 15; -- >15% failure rate = High priority +``` + +### Exercise 2: Create Response Workflows + +Build structured response processes for different incident types: + +#### Step 1: Critical Incident Response Workflow + +**P0 Incident Response (Target: 15 minutes)**: + +1. **Immediate Actions (0-5 minutes)**: + + - Automated page to on-call engineer + - Create incident ticket with severity P0 + - Notify stakeholders via Slack #data-incidents + - Activate incident bridge/war room + +2. **Assessment Phase (5-15 minutes)**: + + - Confirm incident scope and business impact + - Identify affected systems and downstream dependencies + - Assign incident commander + - Begin impact mitigation + +3. **Resolution Phase (15+ minutes)**: + - Implement immediate fixes or workarounds + - Monitor for resolution confirmation + - Communicate status updates every 30 minutes + - Document actions taken + +#### Step 2: Automated Incident Creation + +**Incident Ticket Template**: + +``` +Title: [P0] Customer Transaction Pipeline Failure - [Timestamp] + +INCIDENT DETAILS: +- Severity: P0 (Critical) +- Detected: [Automated Detection System] +- Affected System: Customer Transaction Pipeline +- Business Impact: Customer-facing applications unable to process payments + +TECHNICAL DETAILS: +- Failed Assertion: "Customer ID Completeness Check" +- Error Rate: 100% (0/1000 records passing) +- Last Successful Run: [Timestamp] +- Affected Records: ~50,000 transactions + +IMMEDIATE ACTIONS REQUIRED: +1. Investigate data source connectivity +2. 
Check upstream system status +3. Implement emergency data bypass if needed +4. Notify customer service team of potential impact + +STAKEHOLDERS: +- Incident Commander: [On-call Engineer] +- Technical Owner: payments.team@company.com +- Business Owner: customer.success@company.com +- Executive Sponsor: [VP Engineering] (for P0 incidents) +``` + +### Exercise 3: Implement Escalation Procedures + +Create automatic escalation when response targets are missed: + +#### Step 1: Time-Based Escalation + +**Escalation Timeline**: + +- **15 minutes**: No acknowledgment → Escalate to backup on-call +- **30 minutes**: No progress update → Notify engineering manager +- **1 hour**: Unresolved P0 → Escalate to VP Engineering +- **2 hours**: Unresolved P0 → Executive notification + +#### Step 2: Impact-Based Escalation + +**Business Impact Escalation**: + +``` +Revenue Impact > $100K/hour → Immediate C-level notification +Customer-facing system down → Product team involvement +Regulatory data affected → Compliance team notification +Security implications → Security team involvement +``` + +### Exercise 4: Set Up Communication Protocols + +Ensure stakeholders receive appropriate information at the right time: + +#### Step 1: Stakeholder Communication Matrix + +**Communication Channels by Severity**: + +- **P0**: Slack #data-incidents + Email + Phone/Page +- **P1**: Slack #data-quality + Email +- **P2**: Slack #data-quality + Daily summary email +- **P3**: Weekly quality report + +#### Step 2: Status Update Templates + +**Incident Status Update Template**: + +``` +🚨 INCIDENT UPDATE - [Incident ID] - [Time] + +STATUS: [Investigating/Mitigating/Resolved] +IMPACT: [Brief business impact description] +PROGRESS: [What has been done since last update] +NEXT STEPS: [Immediate actions planned] +ETA: [Expected resolution time] +WORKAROUND: [Temporary solutions available] + +Technical Details: [Link to detailed technical updates] +Questions: Contact [Incident Commander] in #data-incidents +``` + +### Exercise 5: Implement Post-Incident Reviews + +Learn from incidents to prevent recurrence: + +#### Step 1: Post-Incident Review Process + +**Review Timeline**: + +- **P0/P1**: Within 48 hours of resolution +- **P2**: Within 1 week of resolution +- **P3**: Monthly batch review + +**Review Agenda**: + +1. **Incident Timeline**: Detailed chronology of events +2. **Root Cause Analysis**: Technical and process factors +3. **Response Effectiveness**: What worked well and what didn't +4. **Action Items**: Specific improvements to prevent recurrence +5. **Process Updates**: Changes to monitoring, alerting, or procedures + +#### Step 2: Root Cause Analysis Framework + +**5 Whys Analysis Example**: + +``` +Problem: Customer segmentation data contained churned customers + +Why 1: Why did churned customers appear in active segments? +→ The churn detection job failed to update customer status + +Why 2: Why did the churn detection job fail? +→ The upstream CRM system had a schema change + +Why 3: Why didn't we detect the schema change? +→ We don't have schema change monitoring on the CRM system + +Why 4: Why don't we have schema change monitoring? +→ It wasn't considered critical for this data source + +Why 5: Why wasn't it considered critical? 
+→ We lack a systematic approach to assessing data source criticality + +ROOT CAUSE: Missing systematic data source risk assessment +ACTION ITEM: Implement data source criticality framework and monitoring +``` + +### Understanding Incident Metrics + +**Response Metrics**: + +- **Mean Time to Detection (MTTD)**: Time from issue occurrence to detection +- **Mean Time to Acknowledgment (MTTA)**: Time from detection to human response +- **Mean Time to Resolution (MTTR)**: Time from detection to full resolution +- **Escalation Rate**: Percentage of incidents requiring escalation + +**Business Impact Metrics**: + +- **Revenue Impact**: Financial cost of data quality incidents +- **Customer Impact**: Number of customers affected by incidents +- **SLA Compliance**: Adherence to response time commitments +- **Repeat Incidents**: Percentage of incidents that are recurring issues + +### Advanced Incident Management + +#### 1. Predictive Incident Detection + +Use machine learning to predict incidents before they occur: + +```sql +-- Identify leading indicators of quality incidents +WITH quality_trends AS ( + SELECT + dataset_name, + date, + quality_score, + LAG(quality_score, 1) OVER (PARTITION BY dataset_name ORDER BY date) as prev_score, + LAG(quality_score, 7) OVER (PARTITION BY dataset_name ORDER BY date) as week_ago_score + FROM daily_quality_scores +) +SELECT + dataset_name, + quality_score, + CASE + WHEN quality_score < prev_score * 0.95 AND quality_score < week_ago_score * 0.90 + THEN 'HIGH_RISK' + WHEN quality_score < prev_score * 0.98 AND quality_score < week_ago_score * 0.95 + THEN 'MEDIUM_RISK' + ELSE 'LOW_RISK' + END as incident_risk +FROM quality_trends +WHERE date = CURRENT_DATE + AND incident_risk != 'LOW_RISK'; +``` + +#### 2. Automated Remediation + +Implement self-healing responses for common issues: + +- **Data Refresh**: Automatically retry failed data loads +- **Fallback Data**: Switch to backup data sources during outages +- **Circuit Breakers**: Temporarily disable problematic data flows +- **Auto-Scaling**: Increase resources during processing spikes + +#### 3. Cross-Team Coordination + +Integrate with broader incident management: + +- **ServiceNow Integration**: Link data incidents to IT service management +- **PagerDuty Coordination**: Align with infrastructure incident response +- **Slack Workflows**: Automate cross-team communication +- **Jira Integration**: Track incident resolution as development work + +### Incident Management Best Practices + +#### 1. Prepare for Success + +- **Runbooks**: Document common incident types and responses +- **Training**: Regular incident response drills and training +- **Tools**: Ensure all responders have access to necessary systems +- **Communication**: Pre-established channels and contact lists + +#### 2. Focus on Resolution + +- **Triage Effectively**: Prioritize based on business impact, not technical complexity +- **Communicate Clearly**: Regular updates reduce stakeholder anxiety +- **Document Everything**: Detailed logs enable effective post-incident analysis +- **Learn Continuously**: Every incident is an opportunity to improve + +#### 3. 
Build Resilience + +- **Redundancy**: Multiple detection methods and backup systems +- **Graceful Degradation**: Systems that fail safely with reduced functionality +- **Quick Recovery**: Automated recovery procedures where possible +- **Continuous Improvement**: Regular review and enhancement of processes + +### Next Steps + +With robust incident management in place, you're ready to implement quality automation that prevents issues before they occur and reduces the need for manual intervention. + + diff --git a/docs/learn-datahub/quality/overview.md b/docs/learn-datahub/quality/overview.md new file mode 100644 index 00000000000000..7ac0cb1662040a --- /dev/null +++ b/docs/learn-datahub/quality/overview.md @@ -0,0 +1,155 @@ +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; + +# Data Quality & Monitoring + + + +## Professional Data Quality Management + +**Time Required**: 45 minutes | **Skill Level**: Intermediate + +### Your Challenge: Ensuring Data Reliability at Scale + +You're a **Data Platform Engineer** at a fast-growing company. Your data pipelines process millions of records daily, feeding critical business dashboards, ML models, and customer-facing applications. However, data quality issues are becoming frequent: + +- **Executive dashboards** showing incorrect revenue numbers +- **ML models** making poor predictions due to data drift +- **Customer applications** failing due to missing or malformed data +- **Compliance reports** containing inaccurate information + +**The Business Impact**: A recent data quality incident caused the executive team to make a $5M investment decision based on incorrect customer churn metrics, highlighting the critical need for proactive data quality management. + +### What You'll Learn + +This tutorial series teaches you to implement comprehensive data quality monitoring using DataHub's quality management features: + +#### Chapter 1: Data Assertions (15 minutes) + +**Business Challenge**: No early warning system for data quality problems +**Your Journey**: + +- Create automated data quality checks (completeness, uniqueness, range validation) +- Set up custom business rule assertions +- Configure assertion scheduling and execution + **Organizational Outcome**: Proactive detection of data quality issues before they impact business + +#### Chapter 2: Quality Monitoring (12 minutes) + +**Business Challenge**: Reactive approach to data quality management +**Your Journey**: + +- Build comprehensive quality dashboards +- Set up real-time quality monitoring +- Create quality scorecards for different data domains + **Organizational Outcome**: Continuous visibility into data health across the organization + +#### Chapter 3: Incident Management (10 minutes) + +**Business Challenge**: Slow response to data quality incidents +**Your Journey**: + +- Implement automated incident detection and alerting +- Set up escalation procedures for critical quality failures +- Create incident response workflows + **Organizational Outcome**: Rapid resolution of data quality issues with minimal business impact + +#### Chapter 4: Quality Automation (8 minutes) + +**Business Challenge**: Manual quality processes that don't scale +**Your Journey**: + +- Automate quality validation in data pipelines +- Set up quality gates for data promotion +- Implement self-healing data quality processes + **Organizational Outcome**: Scalable quality management that prevents issues rather than just detecting them + +### Interactive Learning Experience + +Each chapter includes: + 
+- **Real Quality Scenarios**: Based on actual production data quality challenges +- **Hands-on Exercises**: Using DataHub's sample data with realistic quality issues +- **Best Practice Implementation**: Industry-standard approaches to data quality +- **Measurable Outcomes**: Clear metrics for quality improvement + +### Understanding Data Quality Impact + +Poor data quality costs organizations an average of **$15 million annually** through: + +- **Operational Inefficiency**: Teams spending 40% of time cleaning data +- **Poor Decision Making**: Executives losing trust in data-driven insights +- **Customer Experience**: Applications failing due to data issues +- **Compliance Risk**: Regulatory penalties for inaccurate reporting + +### DataHub Quality Features Overview + +DataHub provides comprehensive quality management through: + +
+ + + +
+ +**Key Quality Capabilities**: + +- **Automated Assertions**: Continuous validation of data quality rules +- **Quality Dashboards**: Real-time visibility into data health +- **Intelligent Alerting**: Smart notifications based on quality thresholds +- **Trend Analysis**: Historical quality metrics and improvement tracking +- **Pipeline Integration**: Quality gates in data processing workflows + +### Prerequisites + +- Completed [DataHub Quickstart](../quickstart/overview.md) +- Basic understanding of data pipelines and SQL +- Access to DataHub instance with sample data +- Familiarity with data quality concepts + +### Quality Management Maturity Levels + +**Level 1 - Reactive**: Manual quality checks, issue discovery after impact +**Level 2 - Proactive**: Automated basic checks, regular quality monitoring +**Level 3 - Predictive**: Advanced analytics, quality trend prediction +**Level 4 - Preventive**: Quality-by-design, automated remediation +**Level 5 - Optimizing**: Continuous quality improvement, ML-driven optimization + +### Ready to Begin? + +Start your data quality journey by implementing automated assertions that catch quality issues before they impact your business. + + diff --git a/docs/learn-datahub/quality/quality-automation.md b/docs/learn-datahub/quality/quality-automation.md new file mode 100644 index 00000000000000..1b53f012c4107c --- /dev/null +++ b/docs/learn-datahub/quality/quality-automation.md @@ -0,0 +1,572 @@ +# Quality Automation + + + +## Preventing Issues Through Automation + +**Time Required**: 8 minutes + +### The Automation Challenge + +Your incident management is working well, but you're still fighting fires instead of preventing them: + +- **Reactive quality management** that catches issues after they occur +- **Manual quality gates** that slow down data pipeline deployments +- **Inconsistent quality standards** across different teams and projects +- **Quality debt** accumulating as teams prioritize speed over reliability + +**Real-World Impact**: Your data engineering team spends 40% of their time on quality-related issues that could be prevented through better automation, reducing their capacity for strategic data platform improvements. 
+ +### Understanding Quality Automation + +Quality automation shifts from reactive incident response to proactive issue prevention: + +**Automation Layers**: + +- **🔄 Pipeline Integration**: Quality checks embedded in data processing workflows +- **🚪 Quality Gates**: Automated approval/rejection of data based on quality criteria +- **Self-Healing**: Automatic remediation of common quality issues +- **Continuous Improvement**: ML-driven optimization of quality processes +- **Preventive Monitoring**: Early detection of quality degradation patterns + +### Exercise 1: Implement Pipeline Quality Gates + +Embed quality validation directly into your data pipelines: + +#### Step 1: Pre-Processing Quality Gates + +**Data Ingestion Quality Gate**: + +```python +# Example: Airflow DAG with quality gates +from airflow import DAG +from airflow.operators.python_operator import PythonOperator +from datahub_quality import QualityGate + +def validate_source_data(**context): + """Validate incoming data before processing""" + quality_gate = QualityGate( + dataset="raw_customer_data", + checks=[ + "completeness_check", + "schema_validation", + "freshness_check" + ] + ) + + result = quality_gate.execute() + if not result.passed: + raise ValueError(f"Quality gate failed: {result.failures}") + + return result.quality_score + +# DAG definition +dag = DAG('customer_data_pipeline') + +# Quality gate before processing +quality_check = PythonOperator( + task_id='validate_source_quality', + python_callable=validate_source_data, + dag=dag +) + +# Data processing only runs if quality passes +process_data = PythonOperator( + task_id='process_customer_data', + python_callable=process_data_function, + dag=dag +) + +quality_check >> process_data # Quality gate blocks processing +``` + +#### Step 2: Post-Processing Quality Gates + +**Output Validation Gate**: + +```python +def validate_output_quality(**context): + """Validate processed data before publishing""" + quality_checks = [ + { + "name": "record_count_validation", + "query": """ + SELECT COUNT(*) as record_count + FROM processed_customer_data + WHERE processing_date = CURRENT_DATE + """, + "expected_min": 10000, # Expect at least 10K records + "expected_max": 1000000 # But not more than 1M + }, + { + "name": "revenue_reconciliation", + "query": """ + SELECT ABS( + (SELECT SUM(amount) FROM processed_transactions) - + (SELECT SUM(amount) FROM source_transactions) + ) as revenue_diff + """, + "expected_max": 100 # Revenue difference < $100 + } + ] + + for check in quality_checks: + result = execute_quality_check(check) + if not result.passed: + # Block publication and alert stakeholders + send_quality_alert(check, result) + raise QualityGateFailure(f"Failed: {check['name']}") +``` + +### Exercise 2: Set Up Automated Data Validation + +Create comprehensive validation that runs automatically: + +#### Step 1: Schema Evolution Validation + +**Automated Schema Change Detection**: + +```sql +-- Detect breaking schema changes +WITH schema_changes AS ( + SELECT + table_name, + column_name, + data_type, + is_nullable, + LAG(data_type) OVER (PARTITION BY table_name, column_name ORDER BY schema_version) as prev_type, + LAG(is_nullable) OVER (PARTITION BY table_name, column_name ORDER BY schema_version) as prev_nullable + FROM schema_history + WHERE schema_date >= CURRENT_DATE - INTERVAL '7 days' +) +SELECT + table_name, + column_name, + 'BREAKING_CHANGE' as change_type, + CASE + WHEN data_type != prev_type THEN 'Data type changed' + WHEN is_nullable = 'NO' AND prev_nullable = 
'YES' THEN 'Column became non-nullable' + END as change_description +FROM schema_changes +WHERE (data_type != prev_type OR (is_nullable = 'NO' AND prev_nullable = 'YES')) + AND prev_type IS NOT NULL; +``` + +**Automated Response**: + +- Block deployment if breaking changes detected +- Require explicit approval from data owners +- Generate impact analysis for downstream consumers +- Create migration tasks for affected systems + +#### Step 2: Business Rule Validation + +**Automated Business Logic Checks**: + +```python +class BusinessRuleValidator: + def __init__(self, dataset_name): + self.dataset = dataset_name + self.rules = self.load_business_rules() + + def validate_customer_lifecycle(self): + """Ensure customer data follows business logic""" + violations = [] + + # Rule: Registration date must precede first purchase + query = """ + SELECT customer_id, registration_date, first_purchase_date + FROM customer_summary + WHERE first_purchase_date < registration_date + """ + + results = execute_query(query) + if results: + violations.append({ + "rule": "customer_lifecycle_order", + "violations": len(results), + "severity": "ERROR" + }) + + return violations + + def validate_financial_consistency(self): + """Ensure financial calculations are consistent""" + # Rule: Order total must equal sum of line items + query = """ + SELECT + order_id, + order_total, + calculated_total, + ABS(order_total - calculated_total) as difference + FROM ( + SELECT + o.order_id, + o.total_amount as order_total, + SUM(li.quantity * li.unit_price) as calculated_total + FROM orders o + JOIN line_items li ON o.order_id = li.order_id + WHERE o.order_date = CURRENT_DATE + GROUP BY o.order_id, o.total_amount + ) + WHERE ABS(order_total - calculated_total) > 0.01 + """ + + return self.check_rule(query, "financial_consistency") +``` + +### Exercise 3: Implement Self-Healing Mechanisms + +Create systems that automatically fix common quality issues: + +#### Step 1: Automated Data Repair + +**Common Data Fixes**: + +```python +class DataRepairEngine: + def __init__(self): + self.repair_strategies = { + "missing_values": self.handle_missing_values, + "duplicate_records": self.handle_duplicates, + "format_inconsistencies": self.standardize_formats, + "referential_integrity": self.fix_foreign_keys + } + + def handle_missing_values(self, dataset, column, strategy="default"): + """Automatically handle missing values""" + strategies = { + "default": f"UPDATE {dataset} SET {column} = 'UNKNOWN' WHERE {column} IS NULL", + "previous": f""" + UPDATE {dataset} SET {column} = ( + SELECT {column} FROM {dataset} t2 + WHERE t2.id < {dataset}.id AND t2.{column} IS NOT NULL + ORDER BY t2.id DESC LIMIT 1 + ) WHERE {column} IS NULL + """, + "statistical": f""" + UPDATE {dataset} SET {column} = ( + SELECT AVG({column}) FROM {dataset} WHERE {column} IS NOT NULL + ) WHERE {column} IS NULL + """ + } + + return strategies.get(strategy, strategies["default"]) + + def handle_duplicates(self, dataset, key_columns): + """Remove duplicate records automatically""" + return f""" + DELETE FROM {dataset} + WHERE id NOT IN ( + SELECT MIN(id) + FROM {dataset} + GROUP BY {', '.join(key_columns)} + ) + """ +``` + +#### Step 2: Automated Pipeline Recovery + +**Pipeline Self-Healing**: + +```python +class PipelineRecoveryManager: + def __init__(self): + self.recovery_strategies = [ + self.retry_with_backoff, + self.switch_to_backup_source, + self.use_cached_data, + self.trigger_manual_intervention + ] + + def retry_with_backoff(self, pipeline_id, error): + """Retry 
failed pipeline with exponential backoff""" + max_retries = 3 + base_delay = 60 # seconds + + for attempt in range(max_retries): + delay = base_delay * (2 ** attempt) + time.sleep(delay) + + try: + result = execute_pipeline(pipeline_id) + if result.success: + log_recovery_success(pipeline_id, attempt + 1) + return result + except Exception as e: + log_retry_attempt(pipeline_id, attempt + 1, str(e)) + + return self.switch_to_backup_source(pipeline_id, error) + + def switch_to_backup_source(self, pipeline_id, error): + """Switch to backup data source during outages""" + backup_config = get_backup_configuration(pipeline_id) + if backup_config: + try: + result = execute_pipeline_with_backup(pipeline_id, backup_config) + alert_backup_usage(pipeline_id, backup_config) + return result + except Exception as e: + log_backup_failure(pipeline_id, str(e)) + + return self.use_cached_data(pipeline_id, error) +``` + +### Exercise 4: Create Continuous Quality Improvement + +Use machine learning to continuously optimize quality processes: + +#### Step 1: Quality Pattern Analysis + +**ML-Driven Quality Insights**: + +```python +class QualityMLAnalyzer: + def __init__(self): + self.model = load_quality_prediction_model() + + def predict_quality_issues(self, dataset_features): + """Predict potential quality issues before they occur""" + features = [ + dataset_features['record_count_trend'], + dataset_features['schema_change_frequency'], + dataset_features['source_system_health'], + dataset_features['processing_complexity'], + dataset_features['historical_failure_rate'] + ] + + risk_score = self.model.predict_proba([features])[0][1] + + if risk_score > 0.8: + return { + "risk_level": "HIGH", + "recommended_actions": [ + "Increase monitoring frequency", + "Add additional quality checks", + "Schedule proactive maintenance" + ] + } + elif risk_score > 0.6: + return { + "risk_level": "MEDIUM", + "recommended_actions": [ + "Review recent changes", + "Validate upstream dependencies" + ] + } + + return {"risk_level": "LOW", "recommended_actions": []} + + def optimize_assertion_thresholds(self, assertion_history): + """Automatically tune assertion thresholds to reduce false positives""" + optimal_thresholds = {} + + for assertion_id, history in assertion_history.items(): + # Analyze false positive rate vs detection effectiveness + false_positive_rate = calculate_false_positive_rate(history) + detection_effectiveness = calculate_detection_rate(history) + + # Find optimal threshold that minimizes false positives while maintaining detection + optimal_threshold = find_optimal_threshold( + false_positive_rate, + detection_effectiveness, + target_fp_rate=0.05 # 5% false positive target + ) + + optimal_thresholds[assertion_id] = optimal_threshold + + return optimal_thresholds +``` + +#### Step 2: Automated Quality Recommendations + +**Intelligent Quality Suggestions**: + +```python +class QualityRecommendationEngine: + def generate_recommendations(self, dataset_profile): + """Generate quality improvement recommendations""" + recommendations = [] + + # Analyze data patterns + if dataset_profile['null_percentage'] > 10: + recommendations.append({ + "type": "DATA_COMPLETENESS", + "priority": "HIGH", + "description": f"High null rate ({dataset_profile['null_percentage']}%) detected", + "suggested_actions": [ + "Add completeness assertions", + "Investigate upstream data source", + "Implement default value strategy" + ] + }) + + # Analyze quality trends + if dataset_profile['quality_trend'] == 'DECLINING': + 
recommendations.append({ + "type": "QUALITY_DEGRADATION", + "priority": "MEDIUM", + "description": "Quality scores declining over past 30 days", + "suggested_actions": [ + "Review recent pipeline changes", + "Increase assertion frequency", + "Schedule data source health check" + ] + }) + + return recommendations +``` + +### Exercise 5: Implement Quality-Driven Development + +Integrate quality into the development lifecycle: + +#### Step 1: Quality-First Pipeline Development + +**Quality-Driven Development Process**: + +1. **Quality Requirements**: Define quality criteria before development +2. **Quality by Design**: Build quality checks into pipeline architecture +3. **Quality Testing**: Test quality scenarios in development environments +4. **Quality Gates**: Automated quality validation in CI/CD pipelines +5. **Quality Monitoring**: Continuous quality tracking in production + +#### Step 2: Automated Quality Testing + +**Quality Test Framework**: + +```python +class QualityTestSuite: + def __init__(self, pipeline_config): + self.pipeline = pipeline_config + self.test_data = load_test_datasets() + + def test_data_completeness(self): + """Test that pipeline handles incomplete data correctly""" + # Inject missing values into test data + test_data_with_nulls = inject_nulls(self.test_data, percentage=20) + + result = run_pipeline_with_data(self.pipeline, test_data_with_nulls) + + assert result.completeness_score >= 0.95, "Pipeline should handle missing data" + assert result.error_count == 0, "No processing errors expected" + + def test_schema_evolution(self): + """Test pipeline resilience to schema changes""" + # Test with added columns + extended_schema = add_columns(self.test_data.schema, ["new_column"]) + result = run_pipeline_with_schema(self.pipeline, extended_schema) + assert result.success, "Pipeline should handle new columns gracefully" + + # Test with removed columns (should fail gracefully) + reduced_schema = remove_columns(self.test_data.schema, ["optional_column"]) + result = run_pipeline_with_schema(self.pipeline, reduced_schema) + assert result.handled_gracefully, "Pipeline should detect missing columns" +``` + +### Understanding Automation ROI + +**Quality Automation Benefits**: + +- **Reduced Manual Effort**: 60-80% reduction in manual quality management tasks +- **Faster Issue Detection**: Issues caught in minutes instead of hours/days +- **Improved Reliability**: 90%+ reduction in quality-related production incidents +- **Increased Confidence**: Teams can deploy changes with confidence in quality +- **Cost Savings**: Significant reduction in quality-related operational costs + +**Measuring Automation Success**: + +- **Automation Coverage**: Percentage of quality processes automated +- **Prevention Rate**: Issues prevented vs. issues detected after occurrence +- **Time to Resolution**: Speed improvement in quality issue resolution +- **False Positive Rate**: Accuracy of automated quality detection +- **Developer Productivity**: Time saved on manual quality tasks + +### Advanced Automation Techniques + +#### 1. Federated Quality Management + +Distribute quality management across teams while maintaining standards: + +- **Team-Specific Rules**: Allow teams to define domain-specific quality criteria +- **Central Governance**: Maintain organization-wide quality standards +- **Automated Compliance**: Ensure local rules align with global policies +- **Quality Metrics Aggregation**: Roll up team metrics to organizational dashboards + +#### 2. 
Real-Time Quality Streaming + +Implement quality validation in streaming data pipelines: + +```python +# Apache Kafka Streams quality validation +class StreamingQualityProcessor: + def process_record(self, record): + """Validate each record in real-time""" + quality_result = validate_record(record) + + if quality_result.passed: + return record # Forward to downstream + else: + # Route to dead letter queue for investigation + send_to_dlq(record, quality_result.errors) + emit_quality_metric("validation_failure", 1) + return None +``` + +### Quality Automation Best Practices + +#### 1. Start Simple, Scale Gradually + +- Begin with high-impact, low-complexity automation +- Prove value with pilot projects before organization-wide rollout +- Build automation incrementally based on lessons learned + +#### 2. Balance Automation and Human Oversight + +- Automate routine quality checks and responses +- Maintain human decision-making for complex quality issues +- Provide override mechanisms for exceptional cases + +#### 3. Design for Maintainability + +- Create modular, reusable quality components +- Document automation logic and decision criteria +- Plan for automation updates as business rules evolve + +### Congratulations! + +You've successfully implemented a comprehensive data quality management framework using DataHub. Your organization now has: + +**Automated Quality Checks**: Proactive detection of quality issues +**Real-time Monitoring**: Continuous visibility into data health +**Rapid Incident Response**: Structured processes for quality issues +**Preventive Automation**: Systems that prevent issues before they occur + +### Next Steps in Your Quality Journey + +1. **Expand Coverage**: Apply quality automation to additional data domains +2. **Advanced Analytics**: Implement ML-driven quality optimization +3. **Cross-Platform Integration**: Extend quality management across your data ecosystem +4. **Culture Development**: Build a quality-first mindset across data teams + +Your data quality foundation is now ready to support reliable, trustworthy data at scale. + +## Continue Learning + +Ready to explore more DataHub capabilities? 
Check out these related tutorials: + +- [Data Ingestion Mastery](../ingestion/overview.md) - Advanced data integration techniques +- [Privacy & Compliance](../privacy/overview.md) - Comprehensive privacy protection +- [Data Governance Fundamentals](../governance/overview.md) - Review governance best practices + + diff --git a/docs/learn-datahub/quality/quality-monitoring.md b/docs/learn-datahub/quality/quality-monitoring.md new file mode 100644 index 00000000000000..1f8a9e43a47411 --- /dev/null +++ b/docs/learn-datahub/quality/quality-monitoring.md @@ -0,0 +1,384 @@ +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; + +# Quality Monitoring + + + +## Building Comprehensive Quality Dashboards + +**Time Required**: 12 minutes + +### The Monitoring Challenge + +You've implemented data assertions, but you need visibility into quality trends across your entire data landscape: + +- **Scattered quality information** across different systems and teams +- **Reactive approach** - discovering issues only when stakeholders complain +- **No quality trends** to identify deteriorating data sources +- **Lack of accountability** for quality improvements + +**Real-World Impact**: Your CEO asked for a "data quality report" for the board meeting, but it took your team 3 days to manually gather quality metrics from various sources, and the information was already outdated by presentation time. + +### Understanding Quality Monitoring + +Quality monitoring provides continuous visibility into data health across your organization: + + + +**Monitoring Capabilities**: + +- **Real-time Dashboards**: Live quality metrics across all data assets +- **Trend Analysis**: Historical quality patterns and improvement tracking +- **Quality Scorecards**: Domain-specific quality assessments +- **Proactive Alerting**: Early warning system for quality degradation +- **Executive Reporting**: Summary views for leadership and stakeholders + +### Exercise 1: Create Quality Dashboards + +Build comprehensive dashboards for different stakeholder needs: + +#### Step 1: Executive Quality Dashboard + +Create a high-level view for leadership: + +1. **Navigate to Analytics** → **Quality Dashboards** +2. **Create New Dashboard**: "Executive Data Quality Overview" +3. **Add Key Metrics**: + - Overall quality score (percentage of passing assertions) + - Critical data assets health status + - Quality trend over last 90 days + - Top quality issues by business impact + +**Executive Dashboard Preview**: + +
+ + + +
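+
+The headline numbers on a dashboard like this are derived directly from assertion runs. As a sketch, the overall score can be computed with a query like the one below (it assumes the same `assertion_results` table used by the trend queries later in this chapter; adapt table and column names to your warehouse):
+
+```sql
+-- Overall quality score: share of assertions passing over the last day
+SELECT
+  COUNT(CASE WHEN status = 'PASS' THEN 1 END) * 100.0 / COUNT(*) AS overall_quality_score
+FROM assertion_results
+WHERE date >= CURRENT_DATE - INTERVAL '1 day';
+```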
+ +**Quality Metrics Summary**: + +- **Overall Quality Score**: 94.2% ↑ (+2.1% vs last month) +- **Critical Assets**: Customer Data (98.5%), Financial Data (89.2% - needs attention) +- **Trending Issues**: Payment processing delays, email validation failures + +#### Step 2: Operational Quality Dashboard + +Create detailed views for data teams: + +**Dashboard Configuration**: + +- **Name**: "Data Engineering Quality Operations" +- **Refresh**: Every 5 minutes +- **Scope**: All production datasets + +**Key Sections**: + +1. **Real-time Assertion Status** +2. **Pipeline Quality Health** +3. **Data Freshness Monitoring** +4. **Quality Issue Queue** + +### Exercise 2: Set Up Quality Scorecards + +Create domain-specific quality assessments: + +#### Step 1: Customer Domain Scorecard + +**Scorecard Configuration**: + +- **Domain**: Customer Data +- **Assets**: Customer profiles, transactions, interactions +- **Quality Dimensions**: + - Completeness (weight: 30%) + - Accuracy (weight: 25%) + - Consistency (weight: 20%) + - Timeliness (weight: 15%) + - Validity (weight: 10%) + +**Scoring Logic**: + +``` +Customer Domain Quality Score = + (Completeness × 0.30) + + (Accuracy × 0.25) + + (Consistency × 0.20) + + (Timeliness × 0.15) + + (Validity × 0.10) +``` + +#### Step 2: Financial Domain Scorecard + +**Enhanced Requirements for Financial Data**: + +- **Regulatory Compliance**: SOX, GAAP adherence +- **Audit Trail**: Complete lineage and change tracking +- **Precision**: Exact decimal calculations +- **Reconciliation**: Cross-system balance validation + +### Exercise 3: Implement Trend Analysis + +Monitor quality patterns over time: + +#### Step 1: Quality Trend Monitoring + +**Trend Metrics to Track**: + +- Daily assertion pass rates +- Weekly quality score changes +- Monthly quality improvement goals +- Quarterly compliance assessments + +**Trend Analysis Queries**: + +```sql +-- Daily quality trend +SELECT + date, + COUNT(CASE WHEN status = 'PASS' THEN 1 END) * 100.0 / COUNT(*) as pass_rate, + COUNT(*) as total_assertions +FROM assertion_results +WHERE date >= CURRENT_DATE - INTERVAL '30 days' +GROUP BY date +ORDER BY date; + +-- Quality improvement by domain +SELECT + domain, + AVG(CASE WHEN date >= CURRENT_DATE - INTERVAL '7 days' THEN quality_score END) as current_week, + AVG(CASE WHEN date >= CURRENT_DATE - INTERVAL '14 days' + AND date < CURRENT_DATE - INTERVAL '7 days' THEN quality_score END) as previous_week +FROM domain_quality_scores +GROUP BY domain; +``` + +#### Step 2: Seasonal Pattern Detection + +Identify recurring quality patterns: + +- **End-of-month** data processing spikes +- **Holiday periods** with reduced data volumes +- **Business cycle** impacts on data quality +- **System maintenance** windows affecting freshness + +### Exercise 4: Create Quality Alerts + +Set up intelligent alerting for quality issues: + +#### Step 1: Threshold-Based Alerts + +**Alert Configuration**: + +- **Critical Alert**: Overall quality drops below 90% +- **Warning Alert**: Domain quality drops below 95% +- **Info Alert**: New quality issues detected + +**Alert Channels**: + +- Slack integration for immediate team notification +- Email summaries for daily quality reports +- PagerDuty integration for critical production issues +- Jira ticket creation for tracking resolution + +#### Step 2: Anomaly Detection Alerts + +**Statistical Alerting**: + +```sql +-- Detect unusual assertion failure rates +WITH daily_stats AS ( + SELECT + date, + COUNT(CASE WHEN status = 'FAIL' THEN 1 END) as failures, + COUNT(*) 
as total + FROM assertion_results + WHERE date >= CURRENT_DATE - INTERVAL '30 days' + GROUP BY date +), +baseline AS ( + SELECT + AVG(failures * 100.0 / total) as avg_failure_rate, + STDDEV(failures * 100.0 / total) as stddev_failure_rate + FROM daily_stats + WHERE date < CURRENT_DATE +) +SELECT + ds.date, + ds.failures * 100.0 / ds.total as current_failure_rate, + b.avg_failure_rate + (2 * b.stddev_failure_rate) as alert_threshold +FROM daily_stats ds, baseline b +WHERE ds.date = CURRENT_DATE + AND ds.failures * 100.0 / ds.total > b.avg_failure_rate + (2 * b.stddev_failure_rate); +``` + +### Exercise 5: Build Quality Reports + +Create automated reporting for stakeholders: + +#### Step 1: Daily Quality Summary + +**Automated Daily Report**: + +- Overall quality status +- New issues discovered +- Issues resolved +- Quality trends +- Upcoming maintenance impacts + +**Report Template**: + +``` +Daily Data Quality Report - [Date] + +OVERALL STATUS +Quality Score: 94.2% (↑ 0.3% from yesterday) +Critical Issues: 2 (down from 5) +New Issues: 1 +Resolved Issues: 4 + +DOMAIN BREAKDOWN +Customer Data: 96.1% (Good) +Financial Data: 89.2% (Warning - investigating payment delays) +Product Data: 95.8% (Good) +Marketing Data: 94.5% (Good) + +🚨 ATTENTION REQUIRED +1. Payment processing latency (Financial) - ETA: 2PM +2. Customer email validation (CRM) - In progress + +TRENDS +- 7-day average: 93.8% (improving) +- Month-to-date: 94.1% (on track for 95% goal) +``` + +#### Step 2: Executive Monthly Report + +**Strategic Quality Report**: + +- Quality ROI and business impact +- Quality initiative progress +- Resource allocation recommendations +- Compliance status updates + +### Understanding Quality Metrics + +**Key Performance Indicators (KPIs)**: + +**Operational Metrics**: + +- **Assertion Pass Rate**: Percentage of quality checks passing +- **Mean Time to Detection (MTTD)**: Speed of quality issue identification +- **Mean Time to Resolution (MTTR)**: Speed of quality issue fixes +- **Data Freshness**: Timeliness of data updates + +**Business Metrics**: + +- **Quality-Related Incidents**: Business disruptions due to data issues +- **Stakeholder Satisfaction**: User confidence in data quality +- **Compliance Score**: Adherence to regulatory requirements +- **Quality ROI**: Business value of quality improvements + +### Advanced Monitoring Techniques + +#### 1. Machine Learning-Enhanced Monitoring + +Use ML to improve quality detection: + +- **Anomaly Detection**: Identify unusual data patterns +- **Predictive Quality**: Forecast potential quality issues +- **Root Cause Analysis**: Automatically identify issue sources +- **Quality Recommendations**: Suggest improvement actions + +#### 2. Real-Time Quality Streaming + +Monitor quality in real-time data streams: + +```sql +-- Real-time quality monitoring +SELECT + window_start, + COUNT(*) as records_processed, + COUNT(CASE WHEN quality_check = 'PASS' THEN 1 END) as quality_records, + COUNT(CASE WHEN quality_check = 'FAIL' THEN 1 END) as quality_failures +FROM streaming_quality_results +WHERE window_start >= CURRENT_TIMESTAMP - INTERVAL '1 hour' +GROUP BY window_start +ORDER BY window_start DESC; +``` + +#### 3. Cross-System Quality Correlation + +Monitor quality across integrated systems: + +- Correlate quality issues with system performance +- Identify upstream causes of quality problems +- Track quality impact propagation +- Coordinate quality improvements across teams + +### Quality Monitoring Best Practices + +#### 1. 
Design for Different Audiences + +- **Executives**: High-level trends and business impact +- **Data Teams**: Detailed technical metrics and alerts +- **Business Users**: Domain-specific quality insights +- **Compliance**: Regulatory adherence tracking + +#### 2. Balance Detail and Usability + +- Start with key metrics, add detail as needed +- Use visual indicators for quick status assessment +- Provide drill-down capabilities for investigation +- Include contextual information and recommendations + +#### 3. Ensure Actionability + +- Link quality metrics to specific improvement actions +- Provide clear ownership and escalation paths +- Include remediation guidance and documentation +- Track improvement progress over time + +### Next Steps + +With comprehensive quality monitoring in place, you're ready to implement incident management processes that ensure rapid response to quality issues. + + diff --git a/docs/learn-datahub/quickstart/discovery-basics.md b/docs/learn-datahub/quickstart/discovery-basics.md new file mode 100644 index 00000000000000..9538bda2762221 --- /dev/null +++ b/docs/learn-datahub/quickstart/discovery-basics.md @@ -0,0 +1,412 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; +import NextStepButton from '@site/src/components/NextStepButton'; +import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard'; +import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode'; + +# Step 3: Discovery Basics (10 minutes) + + + +**Discovery Implementation**: With enterprise metadata now available in DataHub, you need to demonstrate systematic data discovery capabilities. This step focuses on enabling analysts to efficiently locate and understand relevant datasets. + +**Business Requirement**: Locate user engagement metrics to support executive reporting and strategic decision-making. The data exists within the analytics infrastructure but requires systematic discovery. + +**Your Objective**: Implement and demonstrate DataHub's discovery features to enable self-service data access across the organization. + +## What You'll Master + +By the end of this step, you'll be able to: + +- **Find specific datasets** using strategic search techniques +- **Navigate enterprise data architecture** across multiple platforms +- **Understand data relationships** through schema exploration +- **Identify relevant data** for business requirements + +## Enterprise Data Discovery Framework + +This tutorial demonstrates systematic data discovery techniques used in professional data environments. These methods apply to any enterprise data catalog and are essential for effective data analysis. 
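+
+The walkthrough below uses the DataHub UI, but every search you run here can also be issued programmatically once you start automating discovery. A minimal sketch against DataHub's GraphQL API (this assumes the quickstart's metadata service at `http://localhost:8080`; adjust the URL, and add an access token header if your deployment requires authentication):
+
+```bash
+# Search datasets for "user" via the GraphQL API (the same search the UI performs)
+curl -s -X POST http://localhost:8080/api/graphql \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "{ searchAcrossEntities(input: {types: [DATASET], query: \"user\", start: 0, count: 10}) { total searchResults { entity { urn type } } } }"}'
+```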
+ +### Professional Data Discovery Approach + +**Requirements Analysis** → **Strategic Search** → **Asset Evaluation** → **Context Gathering** → **Access Planning** + +| Step | Focus | Key Actions | +| ------------------------- | ------------------ | ---------------------------------------------- | +| **Requirements Analysis** | Define objectives | Understand business questions and data needs | +| **Strategic Search** | Target discovery | Use business terms and domain knowledge | +| **Asset Evaluation** | Quality assessment | Review schemas, documentation, and freshness | +| **Context Gathering** | Understand usage | Check lineage, owners, and related assets | +| **Access Planning** | Prepare for use | Verify permissions and connection requirements | + +## Method 1: Strategic Search - Finding User Metrics + +**Your First Lead**: The business requirement focuses on "user" metrics. Let's start there and see what data is available. + +### Strategic Search: User-Related Datasets + +1. **Open DataHub** at [http://localhost:9002](http://localhost:9002) + +2. **Search for user data**: + + ``` + Search: "user" + ``` + +3. **Analyze your results** - you should discover these datasets: + + + + + + +**Search Results Analysis**: This search successfully identified both datasets required for user metrics analysis, demonstrating the effectiveness of targeted search strategies in enterprise data discovery. + +:::tip Real-World Search Strategy +Notice how searching for "user" found tables with "users" in the name? DataHub's search is smart - it finds variations and related terms automatically. This is exactly how you'd search in production systems. +::: + +### Advanced Search Techniques + + + + +**Try these enterprise-specific searches:** + +``` +# Find analytics tables +fct_users +created deleted + +# Find event data +logging events +kafka sample + +# Find processed data +warehouse analytics +``` + +**Why this works**: This enterprise follows standard naming conventions (`fct_` for fact tables, descriptive names for events). + + + + +**Filter by enterprise platforms:** + +- **Hive**: Click to see only warehouse tables (`fct_users_created`, `fct_users_deleted`, `logging_events`) +- **Kafka**: Real-time streaming data (`SampleKafkaDataset`) +- **HDFS**: Data lake storage (`SampleHdfsDataset`) + +**Pro Tip**: For user analytics, focus on the **Hive** platform first for processed data! + + + + +**Advanced search operators:** + +``` +# Find all fact tables +name:fct_* + +# Find user-related data +user OR users + +# Exclude test data +user NOT test NOT sample +``` + +**Learn More**: Check out the [complete search documentation](../../how/search.md) for all available operators and techniques. + + + + +## Method 2: Browse by Organization + +Sometimes browsing is more effective than searching, especially when exploring unfamiliar data. + +### Browse by Platform + +1. **Click "Browse" in the top navigation** + +2. **Explore by data platform:** + + - **Demo Data**: Sample retail datasets + - **PostgreSQL**: Operational databases + - **Snowflake**: Data warehouse tables + - **dbt**: Transformed analytics models + +3. **Drill down into a platform:** + - Click on "Demo Data" + - You'll see all datasets from that platform + - Notice the hierarchical organization + +### Browse by Domain (if configured) + +If your organization uses domains: + +1. **Look for domain groupings** like: + + - Marketing Analytics + - Customer Operations + - Financial Reporting + - Product Analytics + +2. 
**Each domain contains** related datasets regardless of platform + +## Method 3: Explore Dataset Details + +Let's dive deep into a specific dataset to understand what information DataHub provides. + +### Find the Customer Dataset + +1. **Search for "customer"** or browse to find a customer-related table + +2. **Click on a dataset** (e.g., "customers" or "user_profiles") + +3. **Explore the dataset page** - you'll see several tabs: + +### Schema Tab - Understanding Your Data + +The Schema tab shows the structure of your dataset: + +**Column Information:** + +- **Name**: The column identifier +- **Type**: Data type (string, integer, timestamp, etc.) +- **Description**: Business meaning (if available) +- **Nullable**: Whether the field can be empty + +**Key things to look for:** + +``` +Primary keys (usually ID fields) +Foreign keys (relationships to other tables) +Date fields (for time-based analysis) +Categorical fields (for grouping/segmentation) +Numeric fields (for calculations/metrics) +``` + +### Properties Tab - Metadata & Context + +**Dataset Properties:** + +- **Owner**: Who's responsible for this data +- **Created**: When the dataset was first created +- **Last Modified**: When data was last updated +- **Tags**: Classification labels +- **Custom Properties**: Business-specific metadata + +**Platform Details:** + +- **Database/Schema**: Where the data lives +- **Table Type**: Table, view, or materialized view +- **Row Count**: Approximate number of records + +### Documentation Tab - Business Context + +Look for: + +- **Dataset description**: What this data represents +- **Column descriptions**: Business meaning of each field +- **Usage notes**: How this data should be used +- **Data quality notes**: Known issues or limitations + +## Understanding Data Relationships + +### Related Datasets + +At the bottom of any dataset page, look for: + +**"Frequently Co-occurring"**: Datasets often used together +**"Similar Datasets"**: Tables with similar schemas +**"Related by Lineage"**: Connected through data pipelines + +### Column-Level Relationships + +In the Schema tab: + +- **Foreign key indicators** show relationships to other tables +- **Similar columns** across datasets are highlighted +- **Column lineage** shows data transformation history + +## Practical Exercise: Customer Analysis Scenario + +Let's complete the original task - finding customer segmentation data: + +### Step 1: Search Strategy + +``` +1. Search for "customer segment" +2. Filter results to "Datasets" only +3. Look for tables with names like: + - customer_segments + - user_cohorts + - customer_analytics +``` + +### Step 2: Evaluate Options + +For each potential dataset, check: + +- **Schema**: Does it have the fields you need? +- **Freshness**: Is the data recent enough? +- **Owner**: Can you contact them with questions? +- **Documentation**: Is the business logic clear? + +### Step 3: Understand the Data + +Click into the most promising dataset and review: + +- **Column definitions**: What does each field mean? +- **Sample data**: What do actual values look like? +- **Lineage**: Where does this data come from? + +## Discovery Best Practices + +### For Data Consumers + +1. **Start broad, then narrow**: Begin with keyword searches, then use filters +2. **Check multiple sources**: The same business concept might exist in different systems +3. **Read the documentation**: Don't assume column meanings from names alone +4. **Contact owners**: When in doubt, reach out to dataset owners +5. 
**Bookmark frequently used datasets**: Save time on repeat searches + +### For Data Producers + +1. **Add clear descriptions**: Help others understand your data +2. **Tag appropriately**: Use consistent classification schemes +3. **Document business logic**: Explain calculations and transformations +4. **Keep metadata current**: Update descriptions when data changes + +## Understanding Data Relationships + +Now that you've discovered the key datasets, let's see how they connect in the data pipeline: + +### User Metrics Data Pipeline + + + +**Data Flow Analysis**: + +- **Source**: `logging_events` captures real-time user interactions +- **Processing**: `user_analytics` job transforms raw events into structured metrics +- **Output**: `fct_users_created` and `fct_users_deleted` provide business-ready analytics + +This lineage view shows you the complete data journey - from raw user events through processing to the final analytics tables. Understanding these relationships is crucial for data quality and impact analysis. + +## Common Discovery Patterns + + + + +**Scenario**: "I need to understand what customer data we have" + +**Approach**: + +1. Search broadly: "customer" +2. Browse by platform to see all sources +3. Compare schemas across datasets +4. Identify the most comprehensive source + + + + +**Scenario**: "I need customer email addresses for a campaign" + +**Approach**: + +1. Search specifically: "email" +2. Filter to datasets only +3. Check column details for email fields +4. Verify data freshness and quality + + + + +**Scenario**: "What would break if I change this table?" + +**Approach**: + +1. Navigate to the dataset +2. Check the Lineage tab +3. Identify downstream consumers +4. Contact owners of dependent systems + + + + +## Success Checkpoint + +**You've successfully completed Step 3 when you can:** + +- Find datasets using both search and browse methods +- Understand what information is available in dataset pages +- Read and interpret schema information +- Identify dataset relationships and dependencies + +**What you've learned:** + +- Multiple ways to discover data in DataHub +- How to evaluate datasets for your analysis needs +- Where to find business context and documentation +- How to understand data relationships + + Next: Explore Data Lineage +> diff --git a/docs/learn-datahub/quickstart/first-ingestion.md b/docs/learn-datahub/quickstart/first-ingestion.md new file mode 100644 index 00000000000000..254b87a0fa658c --- /dev/null +++ b/docs/learn-datahub/quickstart/first-ingestion.md @@ -0,0 +1,449 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; +import NextStepButton from '@site/src/components/NextStepButton'; +import DataHubEntityCard, { SampleEntities } from '@site/src/components/DataHubEntityCard'; +import OSDetectionTabs from '@site/src/components/OSDetectionTabs'; + +# Step 2: First Data Ingestion (10 minutes) + + + +**The Implementation Challenge**: You have an empty DataHub instance that needs to be populated with enterprise metadata. Before analysts can discover and use data effectively, you must establish connections to the organization's data systems. + +**Your Objective**: Connect multiple data platforms to DataHub and ingest comprehensive metadata that enables self-service data discovery across the organization. 
+ +## What You'll Accomplish + +By the end of this step, you'll have: + +- **Enterprise analytics data** from multiple systems ingested into DataHub +- **Multi-platform connectivity** established (Kafka streams, Hive warehouse, HDFS storage) +- **Comprehensive metadata** including schemas, lineage, and business context +- **Self-service foundation** enabling analysts to discover and understand data independently + +## Understanding Data Ingestion + +DataHub ingestion connects to your data systems and extracts comprehensive metadata through a standardized process: + +### Metadata Ingestion Workflow + +**1. Connection** → **2. Discovery** → **3. Extraction** → **4. Transformation** → **5. Loading** + +| Phase | Description | What Happens | +| ------------------ | -------------------- | --------------------------------------------------------------------- | +| **Connection** | Secure system access | DataHub establishes authenticated connections to source systems | +| **Discovery** | Schema scanning | Identifies databases, tables, views, and data structures | +| **Extraction** | Metadata collection | Pulls schema definitions, statistics, and lineage information | +| **Transformation** | Standardization | Converts metadata into DataHub's unified format | +| **Loading** | Storage & indexing | Stores metadata in DataHub's knowledge graph for search and discovery | + +**What gets ingested:** + +- **Schema information**: Table and column definitions +- **Data statistics**: Row counts, data types, sample values +- **Lineage**: How data flows between systems +- **Usage patterns**: Query history and access patterns (when available) + +## Connecting Enterprise Data Systems + +**The Situation**: This tutorial uses a representative enterprise data architecture with data scattered across multiple systems - just like most real companies. Let's get it all connected to DataHub. + +**What You're About to Ingest**: This enterprise data architecture includes: + +
+ + + + +```cmd +# Connect sample data ecosystem to DataHub +datahub docker ingest-sample-data + +# If datahub command not found: +python -m datahub docker ingest-sample-data +``` + + + + +```bash +# Connect sample data ecosystem to DataHub +datahub docker ingest-sample-data + +# If datahub command not found: +python3 -m datahub docker ingest-sample-data +``` + + + + +```bash +# Connect sample data ecosystem to DataHub +datahub docker ingest-sample-data + +# If datahub command not found: +python3 -m datahub docker ingest-sample-data + +# If permission issues: +sudo datahub docker ingest-sample-data +``` + + + + +**Enterprise Data Landscape:** + +
+ +| System | Platform | What's Inside | Business Purpose | +| -------------------- | -------- | ------------------------------------------------- | ------------------------------------- | +| **Real-time Events** | Kafka | `SampleKafkaDataset` - Live user activity streams | Track user behavior as it happens | +| **Data Warehouse** | Hive | `fct_users_created`, `fct_users_deleted` | Monthly user metrics for analytics | +| **Event Logs** | Hive | `logging_events` - Detailed activity logs | Source data for user analytics | +| **Data Lake** | HDFS | `SampleHdfsDataset` - Raw data storage | Historical data backup and processing | + +
+ +
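+
+If the ingest command errors out or hangs, the problem is usually the local deployment rather than the ingestion itself. A quick way to verify the quickstart containers before retrying:
+
+```bash
+# Confirm the local DataHub containers are up and healthy
+datahub docker check
+```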
+ +**Your Mission**: This ingestion will give you access to the complete enterprise data ecosystem. Pay special attention to the `fct_users_created` and `fct_users_deleted` tables - these contain the user metrics data. + +:::tip Real-World Context +This mirrors what you'd find at most tech companies: streaming data (Kafka), processed analytics (Hive), and data lake storage (HDFS). You're learning with realistic, production-like data architecture! +::: + +**Watch the Magic Happen**: As the ingestion runs, you'll see DataHub discovering these key datasets: + +
+ + +
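+
+You can also spot-check from the command line that a dataset actually landed. A sketch using the DataHub CLI (the URN below follows the sample data's naming; adjust it for your own sources):
+
+```bash
+# Fetch the stored metadata for one of the sample tables
+datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)"
+```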
+ +DataHub automatically extracts: + +- **Table schemas** with column definitions and data types +- **Data lineage** showing how tables connect across platforms +- **Ownership information** (John Doe owns most of this sample data) +- **Documentation** and business context + +**What happens during ingestion:** + +``` +Starting ingestion... +Extracting metadata from demo source... +Found 12 datasets +Found 156 columns +Found 8 lineage relationships +Found 3 dashboards +Found 2 data pipelines +Ingestion completed successfully! +``` + +## Option 2: Connect a Real Database (Advanced) + +If you want to connect your own database, here's how to create an ingestion recipe: + + + + +Create a file called `postgres-recipe.yml`: + +```yaml +source: + type: postgres + config: + host_port: localhost:5432 + database: retail_db + username: postgres + password: password + # Optional: specific schemas to ingest + schema_pattern: + allow: ["public", "analytics"] + +sink: + type: datahub-rest + config: + server: http://localhost:8080 +``` + +Run the ingestion: + +```bash +datahub ingest -c postgres-recipe.yml +``` + + + + +Create a file called `mysql-recipe.yml`: + +```yaml +source: + type: mysql + config: + host_port: localhost:3306 + database: retail_db + username: root + password: password + +sink: + type: datahub-rest + config: + server: http://localhost:8080 +``` + +Run the ingestion: + +```bash +datahub ingest -c mysql-recipe.yml +``` + + + + +For CSV files in a directory: + +```yaml +source: + type: csv-enricher + config: + # Path to your CSV files + filename: "/path/to/csv/files/*.csv" + +sink: + type: datahub-rest + config: + server: http://localhost:8080 +``` + +Run the ingestion: + +```bash +datahub ingest -c csv-recipe.yml +``` + + + + +## Mission Status: Did We Connect Enterprise Data? + +**The Moment of Truth**: Let's see if you successfully connected the enterprise data systems to DataHub. + +### 1. Check Your Ingestion Results + +Look for these success indicators in your terminal: + +``` +Ingestion completed successfully +Processed 5 datasets (SampleKafkaDataset, fct_users_created, fct_users_deleted, logging_events, SampleHdfsDataset) +Processed 15+ columns across all tables +Discovered lineage relationships between tables +No errors encountered +``` + +**Success Indicator**: If you see "Ingestion completed successfully", you have successfully connected a multi-platform data architecture to DataHub. + +### 2. Explore Enterprise Data in DataHub + +1. **Refresh DataHub** at [http://localhost:9002](http://localhost:9002) + +2. **Check the home page transformation**: + + - Dataset count jumped from 0 to 5+ datasets + - Recent activity shows "SampleKafkaDataset", "fct_users_created", etc. + - You can see the enterprise data platforms: Kafka, Hive, HDFS + +3. **Quick victory lap** - click "Browse" in the top navigation: + - **Hive platform**: You should see `fct_users_created` and `fct_users_deleted` (the user metrics datasets) + - **Kafka platform**: Real-time streaming data (`SampleKafkaDataset`) + - **HDFS platform**: Data lake storage (`SampleHdfsDataset`) + +**Pro Tip**: Notice how DataHub automatically organized everything by platform? This is how you'll navigate complex data ecosystems in real companies. + +### 3. Your First Dataset Deep-Dive: Exploring User Metrics Data + +**Time to investigate!** Let's look at the user metrics data. Click on `fct_users_created` (you'll find it under the Hive platform). 
+ +**What You'll Discover**: + +**Schema Tab** - The data structure: + +- `user_id`: The key field for tracking individual users +- `created_date`: When each user was created (perfect for monthly analysis!) +- You'll see this is a proper fact table with clean, analytics-ready data + +**Properties Tab** - Business context: + +- **Owner**: John Doe (jdoe@linkedin.com) - now you know who to contact with questions! +- **Platform**: Hive (enterprise data warehouse) +- **Custom Properties**: You might see metadata like `prop1: fakeprop` - this is where business teams add context + +**Lineage Tab** - The data story: + +- **Upstream**: This table is built from `logging_events` (the raw event data) +- **Downstream**: You'll see connections to other analytics tables +- This shows you the complete data pipeline from raw events to business metrics + +**Mission Progress**: You've just found the user metrics data! The `fct_users_created` table has user creation data with timestamps - perfect for monthly analysis. + +:::tip Real-World Learning +This exploration pattern is exactly what you'd do at any company: find the table, understand its structure, identify the owner, and trace its lineage. You're learning production data analysis skills! +::: + +**Want to Learn More?** Check out the [full dataset documentation](/docs/generated/metamodel/entities/dataset.md) to understand all the metadata DataHub captures. + +## Understanding the Ingestion Process + +Let's break down what just happened: + +### 1. Connection & Discovery + +``` +DataHub Connector → Data Source +├── Authenticates using provided credentials +├── Discovers available schemas/databases +└── Lists all tables and views +``` + +### 2. Metadata Extraction + +``` +For each table/view: +├── Extract schema (columns, types, constraints) +├── Collect statistics (row counts, data distribution) +├── Identify relationships (foreign keys, joins) +└── Gather usage information (if available) +``` + +### 3. Lineage Detection + +``` +DataHub analyzes: +├── SQL queries in views and stored procedures +├── ETL pipeline definitions +├── Data transformation logic +└── Cross-system data flows +``` + +### 4. Storage & Indexing + +``` +Metadata is stored in: +├── MySQL (primary metadata storage) +├── OpenSearch (search index) +└── Kafka (real-time event stream) +``` + +## Ingestion Best Practices + +**For production environments:** + +1. **Start small**: Begin with a few important datasets +2. **Use scheduling**: Set up regular ingestion to keep metadata fresh +3. **Monitor performance**: Large databases may need configuration tuning +4. **Secure credentials**: Use environment variables or secret management +5. 
**Test first**: Always test ingestion recipes in development + +## Troubleshooting Common Issues + + + + +**Error:** `Failed to connect to database` + +**Common causes:** + +- Incorrect host/port +- Wrong credentials +- Database not accessible from Docker container +- Firewall blocking connection + +**Solutions:** + +```bash +# Test connection manually +telnet your-db-host 5432 + +# For local databases, use host.docker.internal instead of localhost +host_port: host.docker.internal:5432 +``` + + + + +**Error:** `Ingestion completed but no datasets found` + +**Common causes:** + +- Schema/database doesn't exist +- User lacks permissions +- Pattern filters too restrictive + +**Solutions:** + +```yaml +# Check permissions +GRANT SELECT ON ALL TABLES IN SCHEMA public TO datahub_user; + +# Broaden patterns +schema_pattern: + allow: [".*"] # Allow all schemas +``` + + + + +**Issue:** Ingestion taking very long + +**Solutions:** + +```yaml +# Disable profiling for large tables +profiling: + enabled: false + +# Limit table discovery +table_pattern: + allow: ["important_table_.*"] +``` + + + + +## Implementation Checkpoint: Verify Success + +**You've successfully completed the metadata ingestion when:** + +- **Enterprise data is live**: 5+ datasets visible in DataHub (Kafka, Hive, HDFS platforms) +- **Analytics tables discovered**: You can see `fct_users_created` and `fct_users_deleted` in the Hive platform +- **Data exploration complete**: You've clicked into a dataset and seen schema, properties, and lineage +- **Owner identified**: You know John Doe owns the user analytics data + +**Implementation Success**: You've successfully connected a multi-platform data architecture to DataHub, establishing comprehensive metadata visibility across the organization's data ecosystem. + +**What you've accomplished:** + +- **Enterprise integration**: Connected Kafka streams, Hive warehouse, and HDFS storage systems +- **Automated metadata discovery**: Extracted schemas, lineage, and ownership information +- **Business enablement**: Created the foundation for self-service data discovery +- **Production-ready skills**: Implemented the same processes used in enterprise environments + +**Next Phase**: With metadata ingestion complete, you can now enable systematic data discovery and analysis across the organization. + + Next: Discover and Explore Your Data +> diff --git a/docs/learn-datahub/quickstart/first-lineage.md b/docs/learn-datahub/quickstart/first-lineage.md new file mode 100644 index 00000000000000..52b3fa88a8903e --- /dev/null +++ b/docs/learn-datahub/quickstart/first-lineage.md @@ -0,0 +1,362 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; + +# Step 4: Your First Lineage (5 minutes) + + + +**The Final Piece**: You've located the user metrics data (`fct_users_created` and `fct_users_deleted`), but before delivering the analysis, you need to understand something crucial: _Where does this data come from?_ Is it reliable? What happens if something breaks upstream? + +**Your Objective**: Use DataHub's lineage features to trace the data pipeline and understand how the organization's user metrics are created. This knowledge will make you confident in your analysis and help you spot potential issues. 
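+
+Everything in this step happens in the lineage UI, but the same relationships can be pulled programmatically once you want to script impact analysis. A minimal sketch using the GraphQL API (the URN assumes the sample data ingested earlier, and the field names reflect DataHub's public GraphQL schema; verify both against your version):
+
+```bash
+# List upstream dependencies of fct_users_created via GraphQL
+curl -s -X POST http://localhost:8080/api/graphql \
+  -H 'Content-Type: application/json' \
+  -d '{"query": "{ dataset(urn: \"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)\") { upstream: lineage(input: {direction: UPSTREAM, start: 0, count: 10}) { total relationships { entity { urn } } } } }"}'
+```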
+ +## What You'll Accomplish + +By the end of this step, you'll be able to: + +- Navigate lineage graphs to understand data flow +- Distinguish between upstream and downstream dependencies +- Use lineage for impact analysis and troubleshooting +- Understand column-level lineage relationships + +## Understanding Data Lineage + +Data lineage provides a comprehensive view of data flow throughout your organization, tracking data from its origin through all transformations to final consumption points. + +**Enterprise Lineage Components:** + +- **Source Systems**: Original data repositories and databases +- **Transformation Layers**: ETL processes, data pipelines, and business logic +- **Intermediate Storage**: Staging areas, data warehouses, and data lakes +- **Consumption Points**: Reports, dashboards, and analytical applications +- **Data Dependencies**: Relationships between datasets and processes + +**Why lineage matters:** + +- **Impact Analysis**: "What breaks if I change this table?" +- **Root Cause Analysis**: "Why is this dashboard showing wrong numbers?" +- **Data Governance**: "Where does this sensitive data flow?" +- **Compliance**: "Can we trace this data back to its source?" + +## Tracing Enterprise Data Pipelines + +### Method 1: Following User Metrics Data Trail + +Let's trace the lineage of user analytics data: + +1. **Navigate to `fct_users_created`** (the table you found in discovery) + +2. **Click the "Lineage" tab** to see the data story + +3. **Analyze the enterprise data flow:** + - **Upstream (left)**: `logging_events` - This is where user creation events are captured + - **Current dataset (center)**: `fct_users_created` - The processed analytics table for user metrics + - **Downstream (right)**: Any dashboards or reports that use this data + +**What This Tells You**: The user creation data flows from raw events (`logging_events`) through processing into the analytics table (`fct_users_created`). This is a clean, reliable pipeline! + +### Method 2: Global Lineage View + +1. **From any dataset page**, click the **"View Lineage"** button + +2. **This opens the full lineage explorer** with: + - Interactive graph visualization + - Zoom and pan controls + - Filter options + - Multi-hop lineage traversal + +## Reading Lineage Graphs + +Let's understand the visual elements: + +### Node Types + + + + +**Tables/Views** (rectangular nodes): + +- **Database tables**: Raw operational data +- **Analytics views**: Transformed/aggregated data +- **Materialized views**: Pre-computed results +- **Files**: CSV, Parquet, JSON data files + + + + +**Data Jobs** (circular nodes): + +- **ETL jobs**: Extract, Transform, Load processes +- **dbt models**: Data transformation logic +- **Python scripts**: Custom data processing +- **Airflow DAGs**: Workflow orchestration + + + + +**Consuming Applications** (diamond nodes): + +- **BI Dashboards**: Looker, Tableau, PowerBI +- **ML Models**: Training and inference pipelines +- **Applications**: Customer-facing features +- **Reports**: Automated business reports + + + + +### Connection Types + +**Solid lines**: Direct data dependencies +**Dashed lines**: Indirect or inferred relationships +**Colored lines**: Different types of transformations + +## Practical Lineage Scenarios + +### Scenario 1: Impact Analysis + +**Question**: "I need to update the customer table schema. What will be affected?" + +**Steps**: + +1. Navigate to the `customers` table +2. Click the Lineage tab +3. Look at **downstream dependencies** (right side) +4. 
Identify all affected: + - Analytics tables that read from customers + - Dashboards that display customer data + - ML models that use customer features + - Reports that include customer metrics + +**What you'll see**: + +``` +customers → customer_analytics → customer_dashboard +customers → ml_features → churn_model → recommendation_api +customers → daily_report_job → executive_dashboard +``` + +### Scenario 2: Root Cause Analysis + +**Question**: "The customer dashboard shows wrong numbers. Where's the problem?" + +**Steps**: + +1. Start at the `customer_dashboard` +2. Trace **upstream dependencies** (left side) +3. Check each step in the pipeline: + - Is the source data fresh? + - Did any ETL jobs fail? + - Are transformations working correctly? + +**Debugging path**: + +``` +customer_dashboard ← customer_metrics ← etl_job ← raw_customers + ↑ + Check here first! +``` + +### Scenario 3: Data Governance + +**Question**: "This table contains PII. Where does this sensitive data flow?" + +**Steps**: + +1. Find the table with PII (e.g., `customer_profiles`) +2. Examine **all downstream paths** +3. Identify systems that receive sensitive data +4. Verify proper access controls and compliance + +## Column-Level Lineage + +For detailed analysis, DataHub can show how individual columns flow through transformations: + +### Viewing Column Lineage + +1. **In the Schema tab** of any dataset +2. **Click on a specific column** +3. **Select "View Column Lineage"** + +This shows: + +- **Source columns**: Which upstream columns contribute to this field +- **Transformation logic**: How the column is calculated or derived +- **Downstream usage**: Where this column is used in other systems + +### Example: Customer Segment Column + +```sql +-- Source: customers.customer_type + orders.total_spent +-- Transformation: +CASE + WHEN total_spent > 1000 THEN 'Premium' + WHEN total_spent > 500 THEN 'Standard' + ELSE 'Basic' +END as customer_segment + +-- Used in: marketing_campaigns, customer_dashboard, ml_features +``` + +## Lineage Best Practices + +### For Data Consumers + +1. **Always check lineage** before using unfamiliar data +2. **Trace to the source** to understand data freshness and quality +3. **Identify alternatives** by looking at similar downstream datasets +4. **Contact upstream owners** when you need data changes + +### For Data Producers + +1. **Document transformations** so lineage is meaningful +2. **Use consistent naming** to make lineage easier to follow +3. **Tag critical paths** to highlight important data flows +4. 
**Monitor downstream usage** to understand impact of changes + +## Advanced Lineage Features + +### Multi-Hop Lineage + +**View end-to-end data journeys:** + +- Set lineage depth to 3+ hops +- Trace from raw source to final application +- Understand complete data supply chains + +### Lineage Filtering + +**Focus on specific aspects:** + +- Filter by entity type (datasets only, pipelines only) +- Filter by platform (show only Snowflake → dbt flow) +- Filter by time (show recent lineage changes) + +### Lineage Search + +**Find specific relationships:** + +- "Show me all paths from customers to dashboards" +- "Find datasets that depend on this API" +- "Trace this column through all transformations" + +## Troubleshooting Lineage Issues + + + + +**Issue**: Expected lineage connections don't appear + +**Common causes**: + +- Ingestion didn't capture SQL parsing +- Complex transformations not detected +- Cross-platform connections not configured + +**Solutions**: + +- Enable SQL parsing in ingestion config +- Add manual lineage for complex cases +- Check cross-platform lineage settings + + + + +**Issue**: Lineage shows wrong relationships + +**Common causes**: + +- Temporary tables confusing lineage detection +- Dynamic SQL not parsed correctly +- Naming conflicts between systems + +**Solutions**: + +- Review and correct automatic lineage +- Add manual lineage overrides +- Use more specific naming conventions + + + + +**Issue**: Lineage graphs load slowly + +**Common causes**: + +- Very deep lineage (many hops) +- Large number of connected entities +- Complex transformation logic + +**Solutions**: + +- Limit lineage depth +- Use filters to focus on relevant paths +- Break down complex transformations + + + + +## Tutorial Objectives Achieved + +**You've successfully completed your DataHub journey when you can:** + +- **Navigate lineage confidently**: Trace enterprise data from `logging_events` to `fct_users_created` +- **Understand data reliability**: Know that user metrics come from a clean, traceable pipeline +- **Identify data owners**: You know John Doe owns the user analytics pipeline +- **Assess data quality**: The lineage shows a proper fact table structure + +**Your Achievement**: In 30 minutes, you've mastered essential DataHub skills! You can now: + +- **Deploy DataHub** and connect multi-platform data architectures +- **Find specific datasets** using strategic search techniques +- **Understand data pipelines** through lineage analysis +- **Deliver confident analysis** backed by metadata insights + +**Analysis Ready**: You now have everything needed to answer business questions about user creation vs. deletion metrics, plus the confidence that comes from understanding the complete data pipeline. + +:::tip Mark Your Progress +Check off "Your First Lineage" in the progress tracker above! You've completed the entire DataHub Quickstart. +::: + +## Tutorial Complete + +You've completed the **DataHub in 30 Minutes** tutorial! You now have hands-on experience with DataHub's core capabilities: + +**Deployed DataHub** locally and understand its architecture +**Ingested metadata** from data sources +**Discovered datasets** using search and browse features +**Traced data lineage** to understand dependencies + +## What's Next? 
+ +Now that you understand DataHub fundamentals, explore these advanced topics: + +### Immediate Next Steps + +- **[Data Discovery & Search](../discovery/overview.md)** - Master advanced search techniques and filters +- **[Data Lineage & Impact Analysis](../lineage/overview.md)** - Deep dive into lineage analysis and troubleshooting +- **Data Governance Fundamentals (coming soon)** - Learn about ownership, classification, and business glossaries + +### For Your Organization + +- **Plan your DataHub deployment** for production use +- **Identify key data sources** to ingest first +- **Establish governance processes** for metadata management +- **Train your team** on DataHub best practices + +### Get Help & Stay Connected + +- **[Join DataHub Slack](https://datahub.com/slack)** - Connect with the community +- **[Read the full documentation](../../)** - Comprehensive guides and references +- **[Watch DataHub tutorials](https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w)** - Video walkthroughs +- **[Report issues](https://github.com/datahub-project/datahub/issues)** - Help improve DataHub + +**Happy data discovering!** diff --git a/docs/learn-datahub/quickstart/overview.md b/docs/learn-datahub/quickstart/overview.md new file mode 100644 index 00000000000000..71d3f7da8e9d07 --- /dev/null +++ b/docs/learn-datahub/quickstart/overview.md @@ -0,0 +1,137 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; + +# Chapter 1: DataHub Foundation (30 minutes) + +:::tip Professional Development Journey +This tutorial follows realistic challenges that data professionals face when implementing metadata management in production environments. +::: + +## The Business Challenge + +**Your Role**: You're a data professional tasked with implementing metadata management for a growing technology organization. Data is distributed across multiple systems without centralized discovery or governance. + +**The Business Challenge**: Executive leadership requires user engagement metrics for strategic decision-making. The data exists across various systems, but there's no efficient way to locate, validate, and understand data relationships. + +**What You'll Accomplish**: + +- **Deploy DataHub** as the central metadata management platform +- **Connect enterprise data systems** across streaming, analytics, and storage platforms +- **Implement systematic data discovery** to reduce time-to-insight +- **Establish data lineage tracking** for quality assurance and impact analysis + +**Business Outcome**: Enable self-service data discovery while establishing enterprise-grade metadata governance. + +## Tutorial Structure + +This tutorial is designed to be completed in sequence. **Track your progress** as you go: + + + +**Total Time: 30 minutes** | **Your Progress: Tracked above** + +## Prerequisites + +Before starting, ensure you have: + +- **Docker Desktop** installed and running +- **Python 3.9+** installed +- **Basic familiarity** with databases and data concepts +- **15 minutes** of uninterrupted time + +## The Business Scenario + +**Organizational Context**: You're implementing DataHub for a technology company experiencing rapid growth. 
Data teams are struggling with: + +- **Discovery bottlenecks**: Analysts spend 60% of their time finding relevant data +- **Quality uncertainty**: No systematic way to validate data reliability +- **Compliance gaps**: Difficulty tracking data lineage for regulatory requirements +- **Knowledge silos**: Critical data knowledge trapped with individual team members + +**Your Implementation Goal**: Establish DataHub as the central metadata platform to solve these enterprise challenges. + +**Enterprise Data Architecture**: You'll work with a realistic multi-platform data ecosystem: + +- **Analytics Layer**: User behavior metrics and business KPIs +- **Streaming Platform**: Real-time event data from Kafka +- **Data Warehouse**: Processed analytical datasets in Hive +- **Data Lake**: Raw data storage in HDFS + +**Why This Matters**: This architecture represents common enterprise patterns where data teams need centralized metadata management to maintain productivity and compliance. + +### DataHub Integration Architecture + +DataHub acts as the central metadata hub connecting your entire data ecosystem: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ Source Data │ │ DataHub Core │ │ User Interface │ +│ │ │ │ │ │ +│ • Cloud DBs │───▶│ • Metadata API │───▶│ • Web App │ +│ • Warehouses │ │ • Graph Store │ │ • CLI Tools │ +│ • Streaming │ │ • Search Index │ │ • Dashboards │ +└─────────────────┘ └─────────────────┘ └─────────────────┘ +``` + +**Key Integration Points**: + +- **Automated Discovery**: DataHub connectors extract metadata from your existing systems +- **Unified View**: All metadata is standardized and searchable through a single interface +- **Real-time Updates**: Changes in source systems are reflected immediately in DataHub +- **API Access**: Programmatic access enables integration with your existing workflows + +## Learning Outcomes + +After completing this tutorial, you'll be able to: + +- **Deploy DataHub** in a local development environment +- **Connect data sources** and understand ingestion concepts +- **Find datasets** using DataHub's search and discovery features +- **Read data lineage** to understand data dependencies +- **Navigate the DataHub UI** confidently for daily data work + +## What's Next? + +This tutorial provides the foundation for more advanced DataHub concepts. After completion, consider exploring: + +- **[Data Discovery & Search](../discovery/overview.md)** - Master advanced search techniques +- **[Data Lineage & Impact Analysis](../lineage/overview.md)** - Deep dive into lineage and impact analysis +- **Data Governance Fundamentals (coming soon)** - Learn ownership, classification, and glossaries + +## Need Help? 
+ +If you encounter issues during this tutorial: + +- Check the [Troubleshooting Guide](../../troubleshooting/quickstart.md) +- Visit the [DataHub Slack Community](https://datahub.com/slack) +- Review the [Full Quickstart Documentation](../../quickstart.md) + +--- + +**Ready to get started?** Let's begin with [Setting up DataHub](setup.md) → diff --git a/docs/learn-datahub/quickstart/setup.md b/docs/learn-datahub/quickstart/setup.md new file mode 100644 index 00000000000000..9ad10cebbeebeb --- /dev/null +++ b/docs/learn-datahub/quickstart/setup.md @@ -0,0 +1,460 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import TutorialProgress from '@site/src/components/TutorialProgress'; +import OSDetectionTabs from '@site/src/components/OSDetectionTabs'; + +# Step 1: Setup DataHub (5 minutes) + + + +In this step, you'll deploy DataHub locally using Docker. This gives you a complete DataHub environment running on your machine for learning and experimentation. + +## What You'll Accomplish + +By the end of this step, you'll have: + +- DataHub running locally at `http://localhost:9002` +- Understanding of DataHub's core components +- Access to the DataHub web interface + +## Prerequisites Check + +Before we begin, verify you have the required software: + + + + +**Run these commands to verify your setup:** + +
+ +```bash +# Check Docker +docker --version +docker-compose --version + +# Check Python +python3 --version + +# Check Docker is running +docker ps +``` + +**Expected output:** + +- Docker version 20.10+ +- Docker Compose version 2.0+ +- Python 3.9+ +- Docker ps should run without errors + +
+ +:::tip Success Indicator +If all commands run without errors, you're ready to proceed! +::: + +
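+
+If you prefer to run these checks in one pass, here is a minimal helper sketch for macOS/Linux shells (Windows users can run the equivalent commands in PowerShell). The 3.9 floor simply mirrors the prerequisites above:
+
+```bash
+#!/usr/bin/env bash
+# Quick prerequisite check: Docker installed, daemon running, Python 3.9+
+set -euo pipefail
+
+command -v docker >/dev/null || { echo "Docker is not installed"; exit 1; }
+docker info >/dev/null 2>&1  || { echo "Docker is installed but the daemon is not running"; exit 1; }
+python3 -c 'import sys; raise SystemExit(0 if sys.version_info >= (3, 9) else "Python 3.9+ is required")'
+
+echo "All prerequisites look good."
+```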
+ + +If you're missing any prerequisites, follow the OS-specific instructions below: + + + + +**Install Docker Desktop:** + +1. Download [Docker Desktop for Windows](https://www.docker.com/products/docker-desktop/) +2. Run the installer and follow the setup wizard +3. Restart your computer when prompted +4. Launch Docker Desktop from the Start menu + +**Install Python 3.9+:** + +1. Download from [python.org](https://www.python.org/downloads/windows/) +2. **Important**: Check "Add Python to PATH" during installation +3. Verify installation: Open Command Prompt and run `python --version` + +**System Requirements:** + +- Windows 10 64-bit: Pro, Enterprise, or Education (Build 16299 or later) +- WSL 2 feature enabled (Docker Desktop will help set this up) +- 2 CPUs minimum, 8GB RAM minimum, 12GB free disk space + + + + +**Install Docker Desktop:** + +1. Download [Docker Desktop for Mac](https://www.docker.com/products/docker-desktop/) +2. Drag Docker.app to your Applications folder +3. Launch Docker Desktop from Applications +4. Follow the setup assistant + +**Install Python 3.9+:** + +```bash +# Using Homebrew (recommended) +brew install python@3.9 + +# Or download from python.org +# Visit: https://www.python.org/downloads/macos/ +``` + +**System Requirements:** + +- macOS 10.15 or newer +- Apple chip (M1/M2) or Intel chip +- 2 CPUs minimum, 8GB RAM minimum, 12GB free disk space + + + + +**Install Docker:** + +```bash +# Ubuntu/Debian +sudo apt-get update +sudo apt-get install docker.io docker-compose-plugin +sudo systemctl start docker +sudo systemctl enable docker + +# CentOS/RHEL/Fedora +sudo yum install docker docker-compose +sudo systemctl start docker +sudo systemctl enable docker + +# Add your user to docker group (logout/login required) +sudo usermod -aG docker $USER +``` + +**Install Python 3.9+:** + +```bash +# Ubuntu/Debian +sudo apt-get install python3 python3-pip + +# CentOS/RHEL/Fedora +sudo yum install python3 python3-pip + +# Verify installation +python3 --version +``` + +**System Requirements:** + +- 64-bit Linux distribution +- Kernel version 3.10 or higher +- 2 CPUs minimum, 8GB RAM minimum, 12GB free disk space + + + + +**Common Resource Requirements:** + +- 2 CPUs minimum +- 8GB RAM minimum +- 12GB free disk space + + +
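+
+Before moving on, it is worth confirming that the Docker daemon actually has the resources listed above, since Docker Desktop often defaults to a smaller allocation. The template fields below are standard `docker info` fields; adjust the disk check to whichever drive backs your Docker data:
+
+```bash
+# CPUs and memory available to the Docker daemon (MemTotal is reported in bytes)
+docker info --format 'CPUs: {{.NCPU}}, Memory: {{.MemTotal}} bytes'
+
+# Disk space currently used by images, containers, and volumes
+docker system df
+
+# Free space on local filesystems
+df -h
+```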
+ +## Install DataHub CLI + +The DataHub CLI is your primary tool for managing DataHub deployments and ingestion. + + + + +```cmd +# Install the DataHub CLI (Command Prompt or PowerShell) +python -m pip install --upgrade pip wheel setuptools +python -m pip install --upgrade acryl-datahub + +# Verify installation +datahub version +``` + +**Troubleshooting:** + +- If `python` command not found, try `py` instead +- If `datahub` command not found, use `python -m datahub version` +- Ensure Python was added to PATH during installation + + + + +```bash +# Install the DataHub CLI +python3 -m pip install --upgrade pip wheel setuptools +python3 -m pip install --upgrade acryl-datahub + +# Verify installation +datahub version +``` + +**Troubleshooting:** + +- If `datahub` command not found, use `python3 -m datahub version` +- On M1/M2 Macs, you might need to install Rosetta 2 for some dependencies + + + + +```bash +# Install the DataHub CLI +python3 -m pip install --upgrade pip wheel setuptools +python3 -m pip install --upgrade acryl-datahub + +# Verify installation +datahub version + +# Alternative: Install with user flag if permission issues +python3 -m pip install --user --upgrade acryl-datahub +``` + +**Troubleshooting:** + +- If `datahub` command not found, use `python3 -m datahub version` +- Add `~/.local/bin` to PATH if using `--user` flag +- Use `sudo` only if installing system-wide (not recommended) + + + + +**Expected output:** + +``` +DataHub CLI version: 0.13.x +Python version: 3.x.x +``` + +## Deploy DataHub + +Now let's start DataHub using the quickstart deployment: + + + + +```cmd +# Deploy DataHub locally (Command Prompt) +datahub docker quickstart + +# If datahub command not found, use: +python -m datahub docker quickstart +``` + +**Windows-specific notes:** + +- Ensure Docker Desktop is running before executing +- The process may take longer on Windows due to WSL 2 overhead +- If you encounter permission issues, run Command Prompt as Administrator + + + + +```bash +# Deploy DataHub locally +datahub docker quickstart + +# If datahub command not found, use: +python3 -m datahub docker quickstart +``` + +**macOS-specific notes:** + +- Ensure Docker Desktop is running and has sufficient resources allocated +- On M1/M2 Macs, some images may need to be built for ARM architecture +- Grant Docker Desktop access to your file system when prompted + + + + +```bash +# Deploy DataHub locally +datahub docker quickstart + +# If datahub command not found, use: +python3 -m datahub docker quickstart + +# If permission issues with Docker: +sudo datahub docker quickstart +``` + +**Linux-specific notes:** + +- Ensure Docker service is running: `sudo systemctl status docker` +- If using sudo, DataHub files will be owned by root +- Consider adding your user to the docker group to avoid sudo + + + + +This command will: + +1. **Download** the DataHub Docker Compose configuration +2. **Pull** all required Docker images (this may take a few minutes) +3. 
**Start** all DataHub services + +**What's happening behind the scenes:** + +### DataHub Deployment Process + +The `datahub docker quickstart` command orchestrates a complete DataHub deployment: + +**Phase 1: Environment Preparation** + +- Validates Docker installation and system requirements +- Checks available ports (9002 for frontend, 8080 for backend) +- Prepares configuration files and networking + +**Phase 2: Infrastructure Setup** + +- Downloads the latest docker-compose configuration +- Pulls required Docker images: + - `acryldata/datahub-gms` (Backend services) + - `acryldata/datahub-frontend-react` (Web interface) + - `mysql:8` (Metadata storage) + - `opensearchproject/opensearch` (Search index) + - `confluentinc/cp-kafka` (Message queue) + +**Phase 3: Service Orchestration** + +- Starts core infrastructure (MySQL, OpenSearch, Kafka) +- Initializes DataHub backend services (GMS) +- Launches the web frontend +- Configures DataHub Actions for automation + +**Expected Timeline**: Initial deployment takes 3-5 minutes depending on your internet connection and system performance. + +## Verify Deployment + +When deployment completes successfully, you should see: + +``` +DataHub is now running +Ingest some demo data using `datahub docker ingest-sample-data`, +or head to http://localhost:9002 (username: datahub, password: datahub) to play around with the frontend. +``` + +**Let's verify everything is working:** + +1. **Check running containers:** + + ```bash + docker ps + ``` + + You should see 6-8 containers running with names like: + + - `datahub-frontend-quickstart-1` + - `datahub-datahub-gms-quickstart-1` + - `datahub-mysql-1` + - `datahub-opensearch-1` + +2. **Access the DataHub UI:** + + - Open your browser to [http://localhost:9002](http://localhost:9002) + - You should see the DataHub login page + +3. **Sign in with default credentials:** + ``` + Username: datahub + Password: datahub + ``` + +## Understanding DataHub Architecture + +Now that DataHub is running, let's understand what you've deployed: + +| Component | Purpose | Port | +| -------------------- | --------------------------------- | ---- | +| **DataHub Frontend** | Web UI for users | 9002 | +| **DataHub GMS** | Metadata API and business logic | 8080 | +| **MySQL** | Stores metadata and configuration | 3306 | +| **OpenSearch** | Powers search and discovery | 9200 | +| **Kafka** | Handles real-time metadata events | 9092 | +| **DataHub Actions** | Automation and workflows | - | + +**Data Flow:** + +1. **Metadata ingestion** → GMS API → MySQL (storage) + OpenSearch (search) +2. **User searches** → Frontend → GMS → OpenSearch → Results +3. **Real-time updates** → Kafka → Actions → UI notifications + +## Troubleshooting + +**Common issues and solutions:** + + + + +**Error:** `Port already in use` + +**Solution:** + +```bash +# Check what's using the port +lsof -i :9002 + +# Stop conflicting services or use different ports +datahub docker quickstart --port 9003 +``` + + + + +**Error:** `Container fails to start` or `Out of memory` + +**Solution:** + +1. Increase Docker Desktop memory to 8GB+ +2. Close other applications +3. 
Restart Docker Desktop + + + + +**Issue:** Services taking a long time to start + +**This is normal for first-time setup:** + +- Image downloads: 5-10 minutes +- Service initialization: 2-3 minutes +- Total first-time setup: 10-15 minutes + + + + +## Success Checkpoint + +**You've successfully completed Step 1 when:** + +- DataHub UI loads at http://localhost:9002 +- You can sign in with datahub/datahub credentials +- You see the empty DataHub home page +- All Docker containers are running properly + +**What you've learned:** + +- How to deploy DataHub locally using Docker +- DataHub's core architecture components +- How to verify a successful deployment + +import NextStepButton from '@site/src/components/NextStepButton'; + + Next: Ingest Your First Dataset +> From 71472d0b6bb733fdb4723a7f3cb465342d10702c Mon Sep 17 00:00:00 2001 From: Jonny Dixon Date: Wed, 29 Oct 2025 22:20:57 +0000 Subject: [PATCH 02/10] second commit --- .../DataHubEntityCard/styles.module.css | 45 ++ .../components/DataHubLineageNode/index.jsx | 341 ++++++-- .../DataHubLineageNode/styles.module.css | 173 +++++ .../components/LineageLayoutGrid/index.jsx | 296 +++++++ .../LineageLayoutGrid/styles.module.css | 125 +++ .../TutorialExercise/styles.module.css | 91 ++- docs/api/tutorials/sdk/bulk-assertions-sdk.md | 26 +- .../discovery/advanced-search.md | 97 +-- .../discovery/collaborative-discovery.md | 72 +- .../discovery/dataset-profiles.md | 135 ++-- docs/learn-datahub/discovery/overview.md | 2 +- .../governance/business-glossary.md | 66 +- .../governance/data-classification.md | 6 +- .../governance/governance-policies.md | 76 +- .../governance/ownership-management.md | 8 +- docs/learn-datahub/ingestion/overview.md | 78 +- docs/learn-datahub/lineage/impact-analysis.md | 50 +- docs/learn-datahub/lineage/reading-lineage.md | 730 ++++++++++++++++-- docs/learn-datahub/lineage/troubleshooting.md | 84 +- docs/learn-datahub/privacy/overview.md | 78 +- docs/learn-datahub/quality/data-assertions.md | 2 +- .../quality/incident-management.md | 14 +- docs/learn-datahub/quality/overview.md | 68 +- .../quality/quality-automation.md | 4 +- .../quality/quality-monitoring.md | 2 +- .../quickstart/first-ingestion.md | 44 ++ .../learn-datahub/quickstart/first-lineage.md | 313 +++++++- docs/learn-datahub/quickstart/overview.md | 53 +- docs/learn-datahub/quickstart/setup.md | 3 +- 29 files changed, 2589 insertions(+), 493 deletions(-) create mode 100644 docs-website/src/components/LineageLayoutGrid/index.jsx create mode 100644 docs-website/src/components/LineageLayoutGrid/styles.module.css diff --git a/docs-website/src/components/DataHubEntityCard/styles.module.css b/docs-website/src/components/DataHubEntityCard/styles.module.css index e6863d669b456d..d87d559c25ed8f 100644 --- a/docs-website/src/components/DataHubEntityCard/styles.module.css +++ b/docs-website/src/components/DataHubEntityCard/styles.module.css @@ -186,6 +186,51 @@ --datahub-border: #374066; } +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: 
#2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + /* DataHub Entity Card - Uses actual DataHub design tokens */ /* Import DataHub color variables */ diff --git a/docs-website/src/components/DataHubLineageNode/index.jsx b/docs-website/src/components/DataHubLineageNode/index.jsx index 6fc22076f23203..b67c94fecdcda5 100644 --- a/docs-website/src/components/DataHubLineageNode/index.jsx +++ b/docs-website/src/components/DataHubLineageNode/index.jsx @@ -1,5 +1,6 @@ -import React from 'react'; +import React, { useState } from 'react'; import styles from './styles.module.css'; +import LineageLayoutGrid from '../LineageLayoutGrid'; // Simplified version of DataHub's LineageEntityNode for tutorials const DataHubLineageNode = ({ @@ -423,63 +424,309 @@ const DataHubLineageNode = ({ }; // Component for showing lineage connections with interactive expansion and column-level lineage -export const DataHubLineageFlow = ({ nodes, title, className = '', showColumnLineage = false }) => { - const [expandedNodes, setExpandedNodes] = React.useState(new Set()); - - const toggleNodeExpansion = (nodeId) => { - setExpandedNodes(prev => { - const newSet = new Set(prev); - if (newSet.has(nodeId)) { - newSet.delete(nodeId); - } else { - newSet.add(nodeId); - } - return newSet; - }); +export const DataHubLineageFlow = ({ + nodes = [], + title, + className = '', + showColumnLineage = false, + layout = 'linear', // 'linear', 'hierarchical', 'layers' + layers = null, // For hierarchical layout: [{ name: 'sources', nodes: [...] }, ...] 
+ showConnections = false, + connectionColor = 'var(--datahub-primary)', + connectionColors = {}, + defaultColors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6'] +}) => { + const [allExpanded, setAllExpanded] = React.useState(false); + + const toggleAllExpansion = () => { + setAllExpanded(!allExpanded); }; - // Column lineage mappings - shows which columns connect between nodes - const getColumnLineage = (sourceNodeIndex, targetNodeIndex) => { - // Only show column lineage when going from DataJob to Dataset (after transformation) - if (sourceNodeIndex === 1 && targetNodeIndex === 2) { - // DataJob -> fct_users_created (this represents the transformation from user_events through the ETL job) - return [ - { source: 'user_id', target: 'user_id' }, - { source: 'timestamp', target: 'created_date' }, - { source: 'event_type', target: 'signup_source' }, - ]; + // Build connection map for hierarchical layouts + const buildConnectionMap = () => { + const connections = new Map(); + + if (layers) { + // Build connections from layer structure + layers.forEach((layer, layerIndex) => { + layer.nodes.forEach(node => { + if (node.downstreamConnections) { + connections.set(node.name, node.downstreamConnections); + } + }); + }); + } else if (nodes.length > 0 && nodes[0].downstreamConnections) { + // Build connections from node structure + nodes.forEach(node => { + if (node.downstreamConnections) { + connections.set(node.name, node.downstreamConnections); + } + }); } - return []; + + return connections; }; - const allNodesExpanded = nodes.every(node => expandedNodes.has(node.id)); - const shouldShowColumnConnections = false; // Disabled for now + const connectionMap = buildConnectionMap(); - return ( -
- {title &&

{title}

} -
- {nodes.map((node, index) => ( - - toggleNodeExpansion(node.id)} - /> - {index < nodes.length - 1 && ( -
-
+ // Render linear layout (original behavior) + const renderLinearLayout = () => ( +
+ {nodes.map((node, index) => ( + + + {index < nodes.length - 1 && ( +
+
+
+ )} +
+ ))} +
+ ); + + // Render hierarchical layout with proper multi-downstream support + const renderHierarchicalLayout = () => { + if (!layers || layers.length === 0) { + return renderLinearLayout(); // Fallback to linear + } + + // Keep all layers as they are - DataJobs can be in layers alongside data assets + const dataAssetLayers = layers; + + // Find DataJobs for connection logic (but they stay in their assigned layers) + const allDataJobs = []; + layers.forEach(layer => { + layer.nodes.forEach(node => { + if (node.entityType === 'DataJob') { + allDataJobs.push(node); + } + }); + }); + + return ( +
+ {dataAssetLayers.map((layer, layerIndex) => ( + +
+ {layer.title && ( +
{layer.title}
+ )} +
+ {layer.nodes.map((node, nodeIndex) => ( + + ))} +
+
+ + {layerIndex < dataAssetLayers.length - 1 && ( +
+ { + const sourceNodes = dataAssetLayers[layerIndex].nodes; + const targetNodes = dataAssetLayers[layerIndex + 1].nodes; + const nodeHeight = 120; + const nodeSpacing = 20; + const layerPadding = 20; + const totalSourceContentHeight = sourceNodes.length * nodeHeight + (sourceNodes.length - 1) * nodeSpacing; + const totalTargetContentHeight = targetNodes.length * nodeHeight + (targetNodes.length - 1) * nodeSpacing; + return Math.max(totalSourceContentHeight + (layerPadding * 2), totalTargetContentHeight + (layerPadding * 2), 300); + })()}`} + preserveAspectRatio="none" + > + {renderLayerConnections(dataAssetLayers[layerIndex], dataAssetLayers[layerIndex + 1], layerIndex, allDataJobs)} +
)}
))}
+ ); + }; + + // Render DataJobs in intermediate positions between layers + const renderIntermediateDataJobs = (sourceLayer, targetLayer, allDataJobs) => { + // Find DataJobs that connect these layers + const relevantDataJobs = allDataJobs.filter(dataJob => { + const hasSourceConnection = sourceLayer.nodes.some(sourceNode => + sourceNode.downstreamConnections?.includes(dataJob.name) + ); + const hasTargetConnection = dataJob.downstreamConnections?.some(targetName => + targetLayer.nodes.some(targetNode => targetNode.name === targetName) + ); + return hasSourceConnection && hasTargetConnection; + }); + + if (relevantDataJobs.length === 0) return null; + + return ( +
+ {relevantDataJobs.map((dataJob, index) => ( + + ))} +
+ ); + }; + + // Render connections between layers + const renderLayerConnections = (sourceLayer, targetLayer, layerIndex, allDataJobs = []) => { + const connections = []; + const sourceNodes = sourceLayer.nodes; + const targetNodes = targetLayer.nodes; + + // Calculate actual node positions based on CSS layout + // Nodes are centered with justify-content: center and have gap: 20px + const nodeHeight = 120; // Approximate height of a collapsed node + const nodeSpacing = 20; // Gap between nodes (from CSS: gap: 20px) + const layerPadding = 20; // Padding from CSS: padding: 20px 0 + + // Calculate total content height for each layer + const totalSourceContentHeight = sourceNodes.length * nodeHeight + (sourceNodes.length - 1) * nodeSpacing; + const totalTargetContentHeight = targetNodes.length * nodeHeight + (targetNodes.length - 1) * nodeSpacing; + + // SVG height should match the layer height including padding + const svgHeight = Math.max(totalSourceContentHeight + (layerPadding * 2), totalTargetContentHeight + (layerPadding * 2), 300); + + // Calculate starting Y position - nodes are centered in the available space + const sourceStartY = layerPadding + (svgHeight - totalSourceContentHeight - (layerPadding * 2)) / 2; + const targetStartY = layerPadding + (svgHeight - totalTargetContentHeight - (layerPadding * 2)) / 2; + + sourceNodes.forEach((sourceNode, sourceIndex) => { + if (sourceNode.downstreamConnections) { + sourceNode.downstreamConnections.forEach(targetNodeName => { + // Find target node in the target layer + const targetIndex = targetNodes.findIndex(node => node.name === targetNodeName); + + if (targetIndex !== -1) { + // Calculate actual vertical center of each node + const sourceY = sourceStartY + (sourceIndex * (nodeHeight + nodeSpacing)) + (nodeHeight / 2); + const targetY = targetStartY + (targetIndex * (nodeHeight + nodeSpacing)) + (nodeHeight / 2); + + // Use different colors for different source nodes + const colors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']; + const connectionColor = colors[sourceIndex % colors.length]; + + // Connection positioning - from right edge of source to left edge of target + const startX = 200; // Right edge of source layer + const endX = 0; // Left edge of target layer + + // Create smooth curves with proper horizontal arrow positioning + const cp1X = startX + (endX - startX) * 0.3; + const cp1Y = sourceY; + const cp2X = startX + (endX - startX) * 0.7; + const cp2Y = targetY; + + const pathData = `M ${startX} ${sourceY} + C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${endX} ${targetY}`; + + connections.push( + + {/* Main connection path */} + + {/* Connection points */} + + + + ); + } + }); + } + }); + + // Create unique markers for each source node color with horizontal orientation + const markers = []; + const colors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']; + + sourceNodes.forEach((sourceNode, sourceIndex) => { + const connectionColor = colors[sourceIndex % colors.length]; + markers.push( + + + + ); + }); + + return ( + <> + + {markers} + + {connections} + + ); + }; + + // Choose layout based on props + const renderLayout = () => { + switch (layout) { + case 'hierarchical': + case 'layers': + return ( + + ); + case 'linear': + default: + return renderLinearLayout(); + } + }; + + return ( +
+ {title &&

{title}

} + {renderLayout()}
); }; diff --git a/docs-website/src/components/DataHubLineageNode/styles.module.css b/docs-website/src/components/DataHubLineageNode/styles.module.css index 645482e7218046..a9437f8d5bc57f 100644 --- a/docs-website/src/components/DataHubLineageNode/styles.module.css +++ b/docs-website/src/components/DataHubLineageNode/styles.module.css @@ -156,6 +156,51 @@ --datahub-border: #374066; } +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme='dark'] { + --datahub-primary: #7565DA; + --datahub-primary-dark: #533FD1; + --datahub-primary-light: #8C7EE0; + --datahub-primary-lightest: #2E2373; + --datahub-gray-100: #2F3657; + --datahub-gray-600: #CFD1DA; + --datahub-gray-1700: #A9ADBD; + --datahub-gray-1800: #81879F; + --datahub-gray-1500: #1E2338; + --datahub-white: #272D48; + --datahub-border: #374066; +} + /* DataHub Lineage Node - Uses actual DataHub design tokens */ /* Import DataHub color variables */ @@ -635,6 +680,134 @@ border: 1px solid var(--datahub-border); } +/* Hierarchical Layout Styles */ +.hierarchicalContainer { + display: flex; + flex-direction: row; + align-items: stretch; + gap: 24px; + max-width: 100%; + overflow-x: auto; + padding: 16px 0; +} + +.layer { + display: flex; + flex-direction: column; + align-items: center; + justify-content: flex-start; + min-width: 300px; + flex-shrink: 0; + min-height: 400px; /* Ensure consistent layer heights */ +} + +.layerTitle { + font-size: 14px; + font-weight: 600; + color: var(--datahub-gray-600); + text-align: center; + margin-bottom: 24px; + padding: 8px 16px; + background: var(--datahub-white); + border: 1px solid var(--datahub-border); + border-radius: 20px; + text-transform: uppercase; + letter-spacing: 0.5px; + flex-shrink: 0; +} + +.layerNodes { + display: flex; + flex-direction: column; + gap: 20px; + align-items: center; + width: 100%; + flex: 1; + justify-content: center; /* Center nodes vertically in available space */ + padding: 20px 0; +} + +.layerConnection { + display: flex; + align-items: stretch; + justify-content: center; + width: 200px; + flex-shrink: 0; + position: relative; + min-height: 200px; +} + +.connectionSvg { + width: 100%; + height: 100%; + min-height: 200px; + flex: 1; +} + +/* Layout-specific flow styles */ +.lineageFlow.hierarchical .flowContainer, +.lineageFlow.layers .flowContainer { + display: none; /* Hide linear container for hierarchical layouts */ +} + +.lineageFlow.linear .hierarchicalContainer { + display: none; /* Hide hierarchical container for linear layouts */ +} + +/* Expand toggle */ +.expandToggle { + display: flex; + justify-content: center; + margin-bottom: 20px; +} + +.expandAllButton { + background: var(--datahub-primary); + color: white; + border: none; + padding: 8px 16px; + border-radius: 6px; + 
font-size: 14px; + font-weight: 500; + cursor: pointer; + transition: all 0.2s ease; +} + +.expandAllButton:hover { + background: var(--datahub-primary-dark); + transform: translateY(-1px); +} + +/* Intermediate DataJobs */ +.intermediateDataJobs { + position: absolute; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); + display: flex; + flex-direction: column; + gap: 12px; + z-index: 10; +} + +.intermediateDataJob { + background: var(--datahub-white); + border: 2px solid var(--datahub-primary); + box-shadow: 0 4px 12px rgba(83, 63, 209, 0.2); +} + +/* Connection path animations and styling */ +.connectionPath { + transition: all 0.3s ease; + filter: drop-shadow(0 1px 2px rgba(0, 0, 0, 0.1)); +} + +.connectionPath:hover { + stroke-width: 3; + opacity: 1; + filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.2)); +} + .flowTitle { margin: 0 0 16px 0; font-size: 16px; diff --git a/docs-website/src/components/LineageLayoutGrid/index.jsx b/docs-website/src/components/LineageLayoutGrid/index.jsx new file mode 100644 index 00000000000000..0511ddeb7bc949 --- /dev/null +++ b/docs-website/src/components/LineageLayoutGrid/index.jsx @@ -0,0 +1,296 @@ +import React, { useRef, useEffect, useState } from 'react'; +import styles from './styles.module.css'; +import DataHubLineageNode from '../DataHubLineageNode'; + +const LineageLayoutGrid = ({ + title, + layers = [], + showConnections = true, + allExpanded = false, + onToggleExpand = () => {}, + connectionColors = {}, + defaultColors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6'] +}) => { + const containerRef = useRef(null); + const [connections, setConnections] = useState([]); + + // Build a map of all nodes with their positions + const buildNodeMap = () => { + const nodeMap = new Map(); + + layers.forEach((layer, layerIndex) => { + layer.nodes.forEach((node, nodeIndex) => { + const element = containerRef.current.querySelector(`[data-node-id="${node.name}"]`); + if (element) { + const containerRect = containerRef.current.getBoundingClientRect(); + const nodeRect = element.getBoundingClientRect(); + const containerScrollLeft = containerRef.current.scrollLeft; + const containerScrollTop = containerRef.current.scrollTop; + + nodeMap.set(node.name, { + node, + layerIndex, + nodeIndex, + x: nodeRect.left - containerRect.left + containerScrollLeft, + y: nodeRect.top - containerRect.top + containerScrollTop, + width: nodeRect.width, + height: nodeRect.height, + centerX: nodeRect.left - containerRect.left + nodeRect.width / 2 + containerScrollLeft, + centerY: nodeRect.top - containerRect.top + nodeRect.height / 2 + containerScrollTop, + rightEdge: nodeRect.right - containerRect.left + containerScrollLeft, + leftEdge: nodeRect.left - containerRect.left + containerScrollLeft + }); + } + }); + }); + + return nodeMap; + }; + + // Calculate connections with proper routing around nodes + const calculateConnections = () => { + if (!containerRef.current || !showConnections) return; + + const nodeMap = buildNodeMap(); + const newConnections = []; + + // Find all connections across all layers + layers.forEach((layer, layerIndex) => { + layer.nodes.forEach((sourceNode, sourceIndex) => { + if (!sourceNode.downstreamConnections) return; + + sourceNode.downstreamConnections.forEach(targetNodeName => { + const sourceNodeData = nodeMap.get(sourceNode.name); + const targetNodeData = nodeMap.get(targetNodeName); + + if (sourceNodeData && targetNodeData) { + // Get connection color from props or use default + const connectionColor = 
connectionColors[sourceNode.name] || defaultColors[sourceIndex % defaultColors.length]; + + // Calculate routing path that avoids intermediate nodes + const path = calculateRoutingPath(sourceNodeData, targetNodeData, nodeMap); + + // Debug: Log DataJob connections specifically + if (sourceNode.entityType === 'DataJob' || targetNodeData.node.entityType === 'DataJob') { + console.log(`DataJob connection: ${sourceNode.name} (${sourceNode.entityType}) → ${targetNodeName} (${targetNodeData.node.entityType})`); + } + + newConnections.push({ + id: `${sourceNode.name}-${targetNodeName}`, + sourceX: sourceNodeData.rightEdge, + sourceY: sourceNodeData.centerY, + targetX: targetNodeData.leftEdge - 16, // Account for arrowhead + targetY: targetNodeData.centerY, + color: connectionColor, + path: path, + layerIndex: sourceNodeData.layerIndex, + sourceIndex + }); + } + }); + }); + }); + + setConnections(newConnections); + }; + + // Calculate routing path that avoids nodes + const calculateRoutingPath = (sourceData, targetData, nodeMap) => { + const sourceX = sourceData.rightEdge; + const sourceY = sourceData.centerY; + const targetX = targetData.leftEdge - 16; + const targetY = targetData.centerY; + + // Check if there are nodes between source and target that we need to route around + const intermediateNodes = Array.from(nodeMap.values()).filter(nodeData => { + // Only consider nodes that are between source and target horizontally + return nodeData.centerX > sourceX && nodeData.centerX < targetX && + nodeData.node.name !== sourceData.node.name && + nodeData.node.name !== targetData.node.name; + }); + + if (intermediateNodes.length === 0) { + // Direct path if no obstacles + return null; // Will use default curve + } + + // Find routing level that avoids all intermediate nodes + const allNodeYs = intermediateNodes.map(n => [n.y, n.y + n.height]).flat(); + allNodeYs.push(sourceY, targetY); + + const minY = Math.min(...allNodeYs); + const maxY = Math.max(...allNodeYs); + + // Route above or below based on which has more space and is more natural + const routingOffset = 40; + const routeAbove = minY - routingOffset; + const routeBelow = maxY + routingOffset; + + // Choose the route that's closer to the average of source and target + const avgY = (sourceY + targetY) / 2; + const routingY = Math.abs(routeAbove - avgY) < Math.abs(routeBelow - avgY) ? 
routeAbove : routeBelow; + + return { + type: 'routed', + routingY + }; + }; + + // Recalculate connections when layout changes + useEffect(() => { + const timer = setTimeout(calculateConnections, 100); // Small delay for DOM updates + return () => clearTimeout(timer); + }, [layers, allExpanded]); + + // Recalculate on window resize and scroll + useEffect(() => { + const handleResize = () => calculateConnections(); + const handleScroll = () => calculateConnections(); + + window.addEventListener('resize', handleResize); + + // Add scroll listener to the container + if (containerRef.current) { + containerRef.current.addEventListener('scroll', handleScroll); + } + + return () => { + window.removeEventListener('resize', handleResize); + if (containerRef.current) { + containerRef.current.removeEventListener('scroll', handleScroll); + } + }; + }, []); + + // Generate path that routes around nodes when needed + const generatePath = (connection) => { + const { sourceX, sourceY, targetX, targetY, path } = connection; + + if (!path || path.type !== 'routed') { + // Simple Bezier curve for direct connections + const horizontalDistance = targetX - sourceX; + const cp1X = sourceX + horizontalDistance * 0.5; + const cp1Y = sourceY; + const cp2X = sourceX + horizontalDistance * 0.5; + const cp2Y = targetY; + + return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${targetX} ${targetY}`; + } + + // Routed path that curves above or below obstacles + const { routingY } = path; + const horizontalDistance = targetX - sourceX; + + // Create a smooth S-curve that goes through the routing level + const midX = sourceX + horizontalDistance * 0.5; + + // Control points for smooth routing curve + const cp1X = sourceX + horizontalDistance * 0.25; + const cp1Y = sourceY + (routingY - sourceY) * 0.3; + + const cp2X = sourceX + horizontalDistance * 0.75; + const cp2Y = targetY + (routingY - targetY) * 0.3; + + return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${targetX} ${targetY}`; + }; + + return ( +
+ {title &&

{title}

} + +
+ {layers.map((layer, layerIndex) => ( +
+ {layer.title && ( +
{layer.title}
+ )} +
+ {layer.nodes.map((node, nodeIndex) => ( +
+ +
+ ))} +
+
+ ))} +
+ + {/* SVG overlay for connections */} + {showConnections && connections.length > 0 && ( + + + {connections.map((connection, index) => ( + + + + ))} + + + {connections.map((connection) => ( + + + + + + ))} + + )} +
+ ); +}; + +export default LineageLayoutGrid; +export { LineageLayoutGrid }; diff --git a/docs-website/src/components/LineageLayoutGrid/styles.module.css b/docs-website/src/components/LineageLayoutGrid/styles.module.css new file mode 100644 index 00000000000000..875b1751e3effb --- /dev/null +++ b/docs-website/src/components/LineageLayoutGrid/styles.module.css @@ -0,0 +1,125 @@ +/* LineageLayoutGrid Component Styles */ +.lineageContainer { + position: relative; + width: 100%; + max-width: 100%; + overflow-x: auto; + padding: 20px 40px; /* More horizontal padding to prevent cutoff */ + background: var(--ifm-background-color); + border-radius: 8px; + border: 1px solid var(--ifm-color-emphasis-300); +} + +.title { + text-align: center; + margin: 0 0 24px 0; + font-size: 18px; + font-weight: 600; + color: var(--ifm-color-content); +} + +/* CSS Grid Layout for Layers */ +.layersGrid { + display: grid; + grid-auto-flow: column; + grid-auto-columns: minmax(320px, 1fr); + gap: 100px; /* More space for connections */ + align-items: start; + justify-content: start; /* Align to start to prevent cutoff */ + min-height: 400px; + padding: 20px 0; + min-width: fit-content; /* Ensure grid doesn't shrink below content */ +} + +/* Individual Layer Styling */ +.layer { + display: flex; + flex-direction: column; + align-items: center; + justify-content: flex-start; + min-width: 320px; + width: 100%; /* Take full grid column width */ + height: 100%; +} + +.layerTitle { + font-size: 14px; + font-weight: 600; + color: var(--ifm-color-emphasis-700); + text-align: center; + margin-bottom: 24px; + padding: 8px 16px; + background: var(--ifm-background-color); + border: 1px solid var(--ifm-color-emphasis-300); + border-radius: 20px; + text-transform: uppercase; + letter-spacing: 0.5px; + white-space: nowrap; +} + +/* Nodes within each layer */ +.layerNodes { + display: flex; + flex-direction: column; + gap: 20px; + align-items: center; + width: 100%; + flex: 1; + justify-content: center; +} + +.nodeWrapper { + width: 100%; + max-width: 300px; + position: relative; + display: flex; + justify-content: center; /* Center nodes within their wrapper */ +} + +/* SVG Overlay for Connections */ +.connectionsOverlay { + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + pointer-events: none; + z-index: 1; + overflow: visible; +} + +/* Connection Path Styling */ +.connectionPath { + transition: all 0.3s ease; + filter: drop-shadow(0 1px 2px rgba(0, 0, 0, 0.1)); +} + +.connectionPath:hover { + stroke-width: 3; + opacity: 1; + filter: drop-shadow(0 2px 4px rgba(0, 0, 0, 0.2)); +} + +/* Responsive Design */ +@media (max-width: 768px) { + .layersGrid { + grid-auto-columns: minmax(280px, 1fr); + gap: 60px; + } + + .nodeWrapper { + max-width: 260px; + } +} + +/* Dark mode support */ +[data-theme='dark'] .lineageContainer { + background: var(--ifm-background-color); + border-color: var(--ifm-color-emphasis-300); +} + +[data-theme='dark'] .layerTitle { + background: var(--ifm-background-color); + border-color: var(--ifm-color-emphasis-300); + color: var(--ifm-color-emphasis-800); +} diff --git a/docs-website/src/components/TutorialExercise/styles.module.css b/docs-website/src/components/TutorialExercise/styles.module.css index 2190c4226ee8b7..35f58e06af89b9 100644 --- a/docs-website/src/components/TutorialExercise/styles.module.css +++ b/docs-website/src/components/TutorialExercise/styles.module.css @@ -1,24 +1,21 @@ -/* Import DataHub design tokens */ -@import url('../../css/datahub-tokens.css'); - 
.exerciseContainer { - background: var(--datahub-white); - border: 1px solid var(--datahub-border); - border-radius: 12px; - box-shadow: var(--datahub-shadow); + background: var(--ifm-background-color); + border: 1px solid var(--ifm-color-emphasis-300); + border-radius: var(--ifm-border-radius); + box-shadow: var(--ifm-shadow-md); margin: 24px 0; overflow: hidden; transition: all 0.2s ease; } .exerciseContainer:hover { - box-shadow: var(--datahub-shadow-hover); - border-color: var(--datahub-primary-light); + box-shadow: var(--ifm-shadow-lg); + border-color: var(--ifm-color-primary-light); } .exerciseHeader { - background: linear-gradient(135deg, var(--datahub-gray-50) 0%, var(--datahub-white) 100%); - border-bottom: 1px solid var(--datahub-border); + background: linear-gradient(135deg, var(--ifm-color-emphasis-100) 0%, var(--ifm-background-color) 100%); + border-bottom: 1px solid var(--ifm-color-emphasis-300); padding: 16px 20px; display: flex; align-items: center; @@ -34,8 +31,8 @@ .typeIcon { width: 40px; height: 40px; - background: var(--datahub-primary); - color: var(--datahub-white); + background: var(--ifm-color-primary); + color: white; border-radius: 8px; display: flex; align-items: center; @@ -54,7 +51,7 @@ margin: 0; font-size: 16px; font-weight: 600; - color: var(--datahub-text-primary); + color: var(--ifm-font-color-base); line-height: 1.3; } @@ -70,23 +67,23 @@ border-radius: 12px; font-size: 11px; font-weight: 500; - color: var(--datahub-white); + color: white; text-transform: uppercase; letter-spacing: 0.5px; } .timeEstimate { font-size: 12px; - color: var(--datahub-text-secondary); - background: var(--datahub-gray-100); + color: var(--ifm-color-emphasis-700); + background: var(--ifm-color-emphasis-200); padding: 2px 6px; border-radius: 4px; } .platform { font-size: 12px; - color: var(--datahub-primary); - background: var(--datahub-primary-light); + color: var(--ifm-color-primary); + background: var(--ifm-color-primary-lightest); padding: 2px 6px; border-radius: 4px; font-weight: 500; @@ -104,16 +101,16 @@ } .searchItem { - background: var(--datahub-gray-50); - border: 1px solid var(--datahub-border-light); + background: var(--ifm-color-emphasis-100); + border: 1px solid var(--ifm-color-emphasis-200); border-radius: 8px; padding: 16px; transition: all 0.2s ease; } .searchItem:hover { - background: var(--datahub-white); - border-color: var(--datahub-primary-light); + background: var(--ifm-background-color); + border-color: var(--ifm-color-primary-light); } .searchQuery { @@ -121,11 +118,11 @@ } .searchQuery code { - background: var(--datahub-primary-dark); - color: var(--datahub-white); + background: var(--ifm-color-primary-dark); + color: white; padding: 8px 12px; border-radius: 6px; - font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; + font-family: var(--ifm-font-family-monospace); font-size: 14px; font-weight: 500; display: inline-block; @@ -133,19 +130,19 @@ } .searchDescription { - color: var(--datahub-text-secondary); + color: var(--ifm-color-emphasis-700); font-size: 14px; margin-bottom: 6px; line-height: 1.4; } .searchExpected { - color: var(--datahub-text-primary); + color: var(--ifm-font-color-base); font-size: 13px; - background: var(--datahub-success-light); + background: var(--ifm-color-success-lightest); padding: 6px 10px; border-radius: 4px; - border-left: 3px solid var(--datahub-success); + border-left: 3px solid var(--ifm-color-success); } /* Hands-On Exercise Styles */ @@ -164,8 +161,8 @@ .stepNumber { width: 28px; height: 28px; - background: 
var(--datahub-primary); - color: var(--datahub-white); + background: var(--ifm-color-primary); + color: white; border-radius: 50%; display: flex; align-items: center; @@ -185,31 +182,31 @@ .stepTitle { font-weight: 600; - color: var(--datahub-text-primary); + color: var(--ifm-font-color-base); font-size: 15px; line-height: 1.4; } .stepDescription { - color: var(--datahub-text-secondary); + color: var(--ifm-color-emphasis-700); font-size: 14px; line-height: 1.5; } .stepCode { - background: var(--datahub-gray-900); - color: var(--datahub-white); + background: var(--ifm-color-emphasis-900); + color: white; padding: 10px 12px; border-radius: 6px; - font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; + font-family: var(--ifm-font-family-monospace); font-size: 13px; margin-top: 4px; } /* Interactive Demo Styles */ .interactiveContent { - background: linear-gradient(135deg, var(--datahub-primary-light) 0%, var(--datahub-white) 100%); - border: 1px solid var(--datahub-primary-light); + background: linear-gradient(135deg, var(--ifm-color-primary-lightest) 0%, var(--ifm-background-color) 100%); + border: 1px solid var(--ifm-color-primary-light); border-radius: 8px; padding: 20px; } @@ -261,24 +258,24 @@ /* Dark mode support */ [data-theme='dark'] .exerciseContainer { - background: var(--datahub-gray-800); - border-color: var(--datahub-gray-700); + background: var(--ifm-background-surface-color); + border-color: var(--ifm-color-emphasis-300); } [data-theme='dark'] .exerciseHeader { - background: linear-gradient(135deg, var(--datahub-gray-700) 0%, var(--datahub-gray-800) 100%); - border-bottom-color: var(--datahub-gray-700); + background: linear-gradient(135deg, var(--ifm-color-emphasis-200) 0%, var(--ifm-background-surface-color) 100%); + border-bottom-color: var(--ifm-color-emphasis-300); } [data-theme='dark'] .searchItem { - background: var(--datahub-gray-700); - border-color: var(--datahub-gray-600); + background: var(--ifm-color-emphasis-200); + border-color: var(--ifm-color-emphasis-300); } [data-theme='dark'] .searchItem:hover { - background: var(--datahub-gray-600); + background: var(--ifm-color-emphasis-300); } [data-theme='dark'] .interactiveContent { - background: linear-gradient(135deg, var(--datahub-primary-dark) 0%, var(--datahub-gray-800) 100%); + background: linear-gradient(135deg, var(--ifm-color-primary-dark) 0%, var(--ifm-background-surface-color) 100%); } diff --git a/docs/api/tutorials/sdk/bulk-assertions-sdk.md b/docs/api/tutorials/sdk/bulk-assertions-sdk.md index a23e6311215130..4a25183f9ac82e 100644 --- a/docs/api/tutorials/sdk/bulk-assertions-sdk.md +++ b/docs/api/tutorials/sdk/bulk-assertions-sdk.md @@ -176,7 +176,7 @@ def create_freshness_assertions(datasets, client, registry): # Store the assertion URN for future reference registry["freshness"][str(dataset_urn)] = str(freshness_assertion.urn) - print(f"✅ Created freshness assertion for {dataset_urn.name}: {freshness_assertion.urn}") + print(f"Created freshness assertion for {dataset_urn.name}: {freshness_assertion.urn}") except Exception as e: print(f"❌ Failed to create freshness assertion for {dataset_urn.name}: {e}") @@ -212,7 +212,7 @@ def create_volume_assertions(datasets, client, registry): # Store the assertion URN registry["volume"][str(dataset_urn)] = str(volume_assertion.urn) - print(f"✅ Created volume assertion for {dataset_urn.name}: {volume_assertion.urn}") + print(f"Created volume assertion for {dataset_urn.name}: {volume_assertion.urn}") except Exception as e: print(f"❌ Failed to create volume assertion 
for {dataset_urn.name}: {e}") @@ -248,7 +248,7 @@ dataset_columns = {} for dataset_urn in datasets: columns = get_dataset_columns(client, dataset_urn) dataset_columns[str(dataset_urn)] = columns - print(f"📊 Found {len(columns)} columns in {dataset_urn.name}") + print(f"Found {len(columns)} columns in {dataset_urn.name}") ``` ## Step 4: Create Column-Level Assertions @@ -321,7 +321,7 @@ def create_column_assertions(datasets, columns_dict, client, registry): registry["column_metrics"][dataset_key][column_name] = {} registry["column_metrics"][dataset_key][column_name][rule_name] = str(assertion.urn) - print(f"✅ Created {rule_name} assertion for {dataset_urn.name}.{column_name}") + print(f"Created {rule_name} assertion for {dataset_urn.name}.{column_name}") except Exception as e: print(f"❌ Failed to create {rule_name} assertion for {dataset_urn.name}.{column_name}: {e}") @@ -383,7 +383,7 @@ def save_assertion_registry(registry, filename=None): with open(filename, 'w') as f: json.dump(registry_with_metadata, f, indent=2) - print(f"💾 Saved assertion registry to {filename}") + print(f"Saved assertion registry to {filename}") return filename # Save the registry @@ -602,7 +602,7 @@ def main(): print("\n📋 Creating freshness assertions...") create_freshness_assertions(datasets, client, assertion_registry) - print("\n📊 Creating volume assertions...") + print("\nCreating volume assertions...") create_volume_assertions(datasets, client, assertion_registry) # Step 2: Get column information and create column assertions @@ -615,7 +615,7 @@ def main(): create_column_assertions(datasets, dataset_columns, client, assertion_registry) # Step 3: Save results - print("\n💾 Saving assertion registry...") + print("\nSaving assertion registry...") registry_file = save_assertion_registry(assertion_registry) # Summary @@ -625,12 +625,12 @@ def main(): sum(len(cols) for cols in assertion_registry["column_metrics"].values()) ) - print(f"\n✅ Bulk assertion creation complete!") - print(f" 📈 Total assertions created: {total_assertions}") - print(f" 🕐 Freshness assertions: {len(assertion_registry['freshness'])}") - print(f" 📊 Volume assertions: {len(assertion_registry['volume'])}") - print(f" 🎯 Column assertions: {sum(len(cols) for cols in assertion_registry['column_metrics'].values())}") - print(f" 💾 Registry saved to: {registry_file}") + print(f"\nBulk assertion creation complete!") + print(f" Total assertions created: {total_assertions}") + print(f" Freshness assertions: {len(assertion_registry['freshness'])}") + print(f" Volume assertions: {len(assertion_registry['volume'])}") + print(f" Column assertions: {sum(len(cols) for cols in assertion_registry['column_metrics'].values())}") + print(f" Registry saved to: {registry_file}") if __name__ == "__main__": main() diff --git a/docs/learn-datahub/discovery/advanced-search.md b/docs/learn-datahub/discovery/advanced-search.md index 1fa96d01c32dff..621a1b85bc52ab 100644 --- a/docs/learn-datahub/discovery/advanced-search.md +++ b/docs/learn-datahub/discovery/advanced-search.md @@ -141,7 +141,7 @@ customer_ltv customer_score customer_tier -### 🎮 Interactive Exercise: Your First Search +### Interactive Exercise: Your First Search
@@ -167,7 +167,7 @@ Raw search results can be overwhelming. Use filters to focus on what matters: ### Platform Filtering - + **Follow along in DataHub:** @@ -181,7 +181,7 @@ Raw search results can be overwhelming. Use filters to focus on what matters: **Notice how results change!** - + **Choose filters based on your use case:** @@ -189,13 +189,13 @@ Raw search results can be overwhelming. Use filters to focus on what matters: - Snowflake, BigQuery (analytics platforms) - dbt (transformed data) -- ❌ MySQL, PostgreSQL (raw operational data) +- MySQL, PostgreSQL (raw operational data) - not recommended **For Operational Insights:** - PostgreSQL, MySQL (live operational data) - Kafka (real-time streams) -- ❌ S3 (archived data) +- S3 (archived data) - not recommended **For Data Engineering:** @@ -221,7 +221,7 @@ Raw search results can be overwhelming. Use filters to focus on what matters:
-### 🎮 Interactive Exercise: Smart Filtering +### Interactive Exercise: Smart Filtering
@@ -400,39 +400,48 @@ _Exact phrase in description field_ -### 🎮 Interactive Exercise: Operator Mastery +### Interactive Exercise: Operator Mastery -
- -**Progressive Challenge**: Build increasingly sophisticated searches - -**Level 1 - Basic Operators:** - -``` -customer AND segment -``` - -**Level 2 - Add Exclusions:** - -``` -customer AND segment NOT test -``` - -**Level 3 - Field Targeting:** - -``` -name:customer* AND description:segment* -``` + -``` -(name:customer* OR name:user*) AND (description:segment* OR description:cohort*) AND platform:snowflake -``` +**Your Mission**: Try each level in DataHub and observe how results change. Notice how each level gives you more control and precision. Which approach gives you the most relevant results for marketing analysis? -**Your Mission**: Try each level and observe how results change. Which gives you the most relevant results for marketing analysis? +**Pro Tip**: Copy each query into DataHub's search bar and compare the result quality. Level 4 should give you the most targeted, actionable datasets. -
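+
+**Scripting a search (optional)**: The same filters you build in the UI can be scripted when you need to sweep the catalog in bulk. This is only a minimal sketch, assuming a DataHub instance at `localhost:8080` and the `get_urns_by_filter` helper available on `DataHubGraph` in recent Python SDK releases; adjust the server, token, and filters for your environment.
+
+```python
+from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
+
+# Hypothetical connection details - replace with your own GMS URL and token.
+graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))
+
+# Roughly equivalent to searching "customer" with a Snowflake platform filter in the UI.
+customer_datasets = graph.get_urns_by_filter(
+    entity_types=["dataset"],
+    platform="snowflake",
+    query="customer",
+)
+
+for urn in customer_datasets:
+    print(urn)
+```
+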
+ ## Level 4: Saved Searches & Efficiency @@ -463,7 +472,7 @@ Don't repeat work - save your successful searches: ### Search Templates for Common Scenarios - + ``` # High-quality customer data for campaigns @@ -473,7 +482,7 @@ NOT (test OR temp OR backup) ``` - + ``` # Live operational customer data @@ -482,7 +491,7 @@ AND hasOwners:true AND updatedInLastWeek:true ``` - + ``` # Processed analytical datasets @@ -516,19 +525,19 @@ Try to solve this in 90 seconds: _"Find production-ready customer analytics data
-**🚀 Speed Techniques:** +**Speed Techniques:** - Use browser bookmarks for common DataHub searches - Set up browser shortcuts: `dh customer` → DataHub customer search - Learn keyboard shortcuts: `Ctrl+K` for quick search -**🎯 Accuracy Boosters:** +**Accuracy Boosters:** - Always check the "Updated" date - stale data wastes time - Look for owner information - contactable owners = reliable data - Check description quality - well-documented data is usually better maintained -**🤝 Team Efficiency:** +**Team Efficiency:** - Share successful search patterns with teammates - Create team-wide saved searches for common use cases @@ -539,7 +548,7 @@ Try to solve this in 90 seconds: _"Find production-ready customer analytics data ## Troubleshooting Common Issues - + **Problem**: Search returns hundreds of results @@ -551,7 +560,7 @@ Try to solve this in 90 seconds: _"Find production-ready customer analytics data 4. **Exclude noise**: `NOT (test OR temp OR backup OR old)` - + **Problem**: Search returns nothing @@ -563,7 +572,7 @@ Try to solve this in 90 seconds: _"Find production-ready customer analytics data 4. **Try different fields**: Maybe it's in descriptions, not names - + **Problem**: Results aren't relevant to your use case @@ -579,7 +588,7 @@ Try to solve this in 90 seconds: _"Find production-ready customer analytics data ## What You've Learned -🎉 **Congratulations!** You've transformed from basic search to advanced discovery: +**Congratulations!** You've transformed from basic search to advanced discovery: - **Strategic Approach**: Business-first thinking with technical backup - **Smart Filtering**: Platform and entity type filtering for relevance diff --git a/docs/learn-datahub/discovery/collaborative-discovery.md b/docs/learn-datahub/discovery/collaborative-discovery.md index 6ccf494a825e61..a52f41f107bab0 100644 --- a/docs/learn-datahub/discovery/collaborative-discovery.md +++ b/docs/learn-datahub/discovery/collaborative-discovery.md @@ -4,12 +4,12 @@ import TabItem from '@theme/TabItem'; # Collaborative Discovery (10 minutes) :::info Tutorial Progress -**Step 3 of 3** | ⏱️ **10 minutes** | [Overview](overview.md) → [Advanced Search](advanced-search.md) → [Dataset Profiles](dataset-profiles.md) → **Collaborative Discovery** +**Step 3 of 3** | **10 minutes** | [Overview](overview.md) → [Advanced Search](advanced-search.md) → [Dataset Profiles](dataset-profiles.md) → **Collaborative Discovery** ::: Transform DataHub from a solo tool into a team knowledge platform. Learn to document insights, ask questions, and build collective data intelligence that benefits everyone. -## 🤝 Discovery Challenge #3: The Collaboration Champion +## Discovery Challenge #3: The Collaboration Champion **Your Mission**: You've discovered valuable insights about customer segmentation data and want to ensure future analysts can benefit from your knowledge. Make this dataset more discoverable and useful for your team. @@ -49,7 +49,7 @@ Transform cryptic datasets into self-explanatory resources: -**❌ Typical (Unhelpful) Documentation:** +**Typical (Unhelpful) Documentation:** ``` Table: customer_seg_v3 @@ -79,13 +79,13 @@ Usage Notes: ``` - + **Use these templates for consistency:**
-**📊 Analytics Dataset Template:** +**Analytics Dataset Template:** ``` Business Purpose: [What business problem does this solve?] @@ -96,7 +96,7 @@ Common Use Cases: [How do teams typically use this?] Related Datasets: [What other data works well with this?] ``` -**🔄 Operational Dataset Template:** +**Operational Dataset Template:** ``` System Source: [What application generates this data?] @@ -111,7 +111,7 @@ Access Patterns: [Who typically needs this data and why?] -### 🎮 Interactive Exercise: Documentation Makeover +### Interactive Exercise: Documentation Makeover
@@ -150,7 +150,7 @@ Tags are the navigation system for your data catalog. Use them strategically:
-**🏷️ Tag Categories & Examples:** +**Tag Categories & Examples:** | Category | Purpose | Examples | | -------------------- | -------------------- | ----------------------------------------------------------- | @@ -178,7 +178,7 @@ Tags are the navigation system for your data catalog. Use them strategically: - Use standard terms: `pii` not `personal-info` - Include context: `marketing-ready` not just `ready` -**❌ Avoid These Patterns:** +**Avoid These Patterns:** - Inconsistent casing: `Customer-Analytics` vs `customer_analytics` - Vague terms: `good`, `important`, `useful` @@ -237,7 +237,7 @@ Use Case Level: Specific analytical or operational purpose -### 🎮 Interactive Exercise: Tag Like a Pro +### Interactive Exercise: Tag Like a Pro
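+
+As you work through this exercise, keep in mind that an agreed tag set can also be applied programmatically rather than one dataset at a time in the UI. The snippet below is an illustrative sketch using the DataHub Python emitter: the server URL, platform, and dataset name are placeholders, and emitting `GlobalTags` this way replaces the existing tag list, so merge with current tags before running it against real assets.
+
+```python
+from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.rest_emitter import DatahubRestEmitter
+from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
+
+# Hypothetical connection and dataset - replace with your own values.
+emitter = DatahubRestEmitter(gms_server="http://localhost:8080")
+dataset_urn = make_dataset_urn(platform="snowflake", name="analytics.customer_segments", env="PROD")
+
+# Caution: this overwrites the dataset's tag list rather than appending to it.
+tags = GlobalTagsClass(
+    tags=[
+        TagAssociationClass(tag=make_tag_urn("pii")),
+        TagAssociationClass(tag=make_tag_urn("verified")),
+    ]
+)
+
+emitter.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=tags))
+```
+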
@@ -281,21 +281,21 @@ Turn your discoveries into team assets:
-**🤔 Ask Good Questions:** +**Ask Good Questions:** - "What's the difference between customer_id and user_id in this table?" - "How often is this data refreshed? I see conflicting information." - "Are there known data quality issues with the email column?" - "What's the business logic behind the customer_score calculation?" -**💡 Provide Helpful Answers:** +**Provide Helpful Answers:** - Be specific and actionable - Include examples when possible - Reference related datasets or documentation - Update your answer if information changes -**📈 Question Patterns That Help Teams:** +**Question Patterns That Help Teams:** - Data quality clarifications - Business logic explanations @@ -305,13 +305,13 @@ Turn your discoveries into team assets:
- + **Guide future users with recommendations:**
-**💡 Recommendation Types:** +**Recommendation Types:** **Alternative Datasets:** "For real-time customer data, consider `customer_events_stream` instead of this daily batch table." @@ -386,7 +386,7 @@ Turn your discoveries into team assets: ## Success Stories: Collaboration in Action - + **Before Collaboration:** @@ -430,19 +430,19 @@ Turn your discoveries into team assets:
-**🔄 Automated Collaboration:** +**Automated Collaboration:** - Set up alerts for dataset changes - Use DataHub Actions to notify teams of quality issues - Integrate with Slack for team notifications -**📊 Collaboration Analytics:** +**Collaboration Analytics:** - Track which datasets are most bookmarked - Identify documentation gaps - Measure team engagement with data catalog -**🎯 Targeted Sharing:** +**Targeted Sharing:** - Use domain-specific tags for relevant teams - Create role-based saved searches @@ -470,34 +470,34 @@ Take a dataset you've worked with and make it 50% more valuable to your team thr
-**📊 Team Metrics to Track:** +**Team Metrics to Track:** -| Metric | Good Trend | What It Means | -| -------------------------- | ------------- | --------------------------------------- | -| **Documentation Coverage** | ↗️ Increasing | More datasets have helpful descriptions | -| **Tag Consistency** | ↗️ Increasing | Team uses standardized tagging | -| **Q&A Activity** | ↗️ Increasing | Active knowledge sharing | -| **Discovery Time** | ↘️ Decreasing | Faster data finding | -| **Repeat Questions** | ↘️ Decreasing | Better documentation quality | +| Metric | Good Trend | What It Means | +| -------------------------- | ---------- | --------------------------------------- | +| **Documentation Coverage** | Increasing | More datasets have helpful descriptions | +| **Tag Consistency** | Increasing | Team uses standardized tagging | +| **Q&A Activity** | Increasing | Active knowledge sharing | +| **Discovery Time** | Decreasing | Faster data finding | +| **Repeat Questions** | Decreasing | Better documentation quality |
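+
+A lightweight way to start tracking these numbers is to compute them from a simple export of your catalog. The sketch below uses made-up records purely to show the arithmetic behind "Documentation Coverage" and tag coverage; in practice you would pull the real dataset list from DataHub.
+
+```python
+# Hypothetical catalog export - in practice, pull this from DataHub.
+datasets = [
+    {"name": "customer_segments", "has_description": True, "tags": ["pii", "verified"]},
+    {"name": "orders_raw", "has_description": False, "tags": []},
+    {"name": "daily_revenue", "has_description": True, "tags": ["tier-1"]},
+]
+
+documented = sum(d["has_description"] for d in datasets)
+tagged = sum(bool(d["tags"]) for d in datasets)
+
+print(f"Documentation coverage: {documented / len(datasets):.0%}")  # 67%
+print(f"Tag coverage: {tagged / len(datasets):.0%}")                # 67%
+```
+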
## What You've Accomplished -🎉 **Outstanding work!** You've completed the Data Discovery & Search mastery series: +**Outstanding work!** You've completed the Data Discovery & Search mastery series: ### Skills Mastered: -- **🔍 Advanced Search**: Strategic search approaches with operators and filters -- **📊 Dataset Evaluation**: Rapid quality assessment and decision-making -- **🤝 Collaborative Discovery**: Documentation, tagging, and knowledge sharing +- **Advanced Search**: Strategic search approaches with operators and filters +- **Dataset Evaluation**: Rapid quality assessment and decision-making +- **Collaborative Discovery**: Documentation, tagging, and knowledge sharing ### Business Impact: -- **⚡ Speed**: Find relevant data in minutes, not hours -- **🎯 Accuracy**: Make informed decisions about data quality and fit -- **🤝 Team Efficiency**: Share knowledge that benefits everyone -- **📈 Scalability**: Build practices that improve over time +- **Speed**: Find relevant data in minutes, not hours +- **Accuracy**: Make informed decisions about data quality and fit +- **Team Efficiency**: Share knowledge that benefits everyone +- **Scalability**: Build practices that improve over time ## What's Next? @@ -538,6 +538,6 @@ Choose your next learning adventure based on your role and interests:
-**Congratulations on becoming a DataHub Discovery Expert!** 🚀 +**Congratulations on becoming a DataHub Discovery Expert!** Your investment in learning these skills will pay dividends every time you or your teammates need to find and understand data. Keep practicing, keep collaborating, and keep discovering! diff --git a/docs/learn-datahub/discovery/dataset-profiles.md b/docs/learn-datahub/discovery/dataset-profiles.md index 899904227a43ff..2124170ad52ec9 100644 --- a/docs/learn-datahub/discovery/dataset-profiles.md +++ b/docs/learn-datahub/discovery/dataset-profiles.md @@ -1,15 +1,16 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +import DataHubEntityCard from '@site/src/components/DataHubEntityCard'; # Understanding Dataset Profiles (20 minutes) :::info Tutorial Progress -**Step 2 of 3** | ⏱️ **20 minutes** | [Overview](overview.md) → [Advanced Search](advanced-search.md) → **Dataset Profiles** → [Collaborative Discovery](collaborative-discovery.md) +**Step 2 of 3** | **20 minutes** | [Overview](overview.md) → [Advanced Search](advanced-search.md) → **Dataset Profiles** → [Collaborative Discovery](collaborative-discovery.md) ::: Learn to quickly assess data quality, understand schemas, and make informed decisions about whether a dataset meets your analysis needs. Transform from guessing to knowing. -## 🕵️ Discovery Challenge #2: The Data Detective +## Discovery Challenge #2: The Data Detective **Your Mission**: The customer dashboard shows suspicious numbers - customer count dropped 50% overnight. You need to evaluate potential data sources to find the root cause. @@ -47,9 +48,9 @@ Before diving deep, get a rapid overview of dataset health:
-**🚦 Traffic Light System:** +**Traffic Light System:** -| 🟢 Green Light | 🟡 Yellow Light | 🔴 Red Light | +| Green Light | Yellow Light | Red Light | | ------------------ | -------------------- | -------------------- | | Updated < 24h ago | Updated 1-7 days ago | Updated > 7 days ago | | Has owner assigned | Owner unclear | No owner | @@ -58,7 +59,47 @@ Before diving deep, get a rapid overview of dataset health:
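+
+If you want to apply the same thresholds outside the UI (for example in a quick spot-check script), the freshness column of the table maps directly onto a small helper like this sketch; the timestamps are illustrative.
+
+```python
+from datetime import datetime, timedelta, timezone
+
+def freshness_light(last_updated: datetime) -> str:
+    """Map a last-updated timestamp onto the traffic-light thresholds above."""
+    age = datetime.now(timezone.utc) - last_updated
+    if age <= timedelta(hours=24):
+        return "green"
+    if age <= timedelta(days=7):
+        return "yellow"
+    return "red"
+
+now = datetime.now(timezone.utc)
+print(freshness_light(now - timedelta(hours=3)))   # green
+print(freshness_light(now - timedelta(days=3)))    # yellow
+print(freshness_light(now - timedelta(days=30)))   # red
+```
+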
-### 🎮 Interactive Exercise: Health Check Practice +**Visual Health Assessment Examples:** + +
+ + + + + +
+ +### Interactive Exercise: Health Check Practice
@@ -73,7 +114,7 @@ Dataset Name: ________________ Last Updated: ________________ Owner: ______________________ Row Count: ___________________ -Health Score: 🟢 🟡 🔴 (circle one) +Health Score: Good / Warning / Critical (circle one) ``` **Repeat for 2 more datasets and compare results** @@ -87,13 +128,13 @@ The schema tells you what data is actually available and how it's structured: ### Reading the Schema Tab - + **What to look for in each column:**
-**🔍 Column Name Patterns:** +**Column Name Patterns:** - `id`, `uuid`, `key` → Identifiers (good for joins) - `created_at`, `updated_at` → Timestamps (good for time analysis) @@ -101,7 +142,7 @@ The schema tells you what data is actually available and how it's structured: - `status`, `type`, `category` → Categorical data (good for grouping) - `amount`, `count`, `score` → Numeric data (good for calculations) -**📈 Data Type Insights:** +**Data Type Insights:** - `VARCHAR(255)` → Text fields, check for standardization - `TIMESTAMP` → Time-based analysis possible @@ -112,7 +153,7 @@ The schema tells you what data is actually available and how it's structured:
- + **Understanding table relationships:** @@ -124,13 +165,13 @@ The schema tells you what data is actually available and how it's structured: - Unique identifier for each row - Essential for joins and deduplication -**🔗 Foreign Keys:** +**Foreign Keys:** - References to other tables - Shows data relationships - Enables cross-table analysis -**📊 Composite Keys:** +**Composite Keys:** - Multiple columns forming unique identifier - Common in fact tables and junction tables @@ -138,7 +179,7 @@ The schema tells you what data is actually available and how it's structured:
-**🎮 Try This:** Look at a customer table schema and identify: +**Try This:** Look at a customer table schema and identify: - Primary key column - Foreign key relationships @@ -152,7 +193,7 @@ The schema tells you what data is actually available and how it's structured:
-**🟢 High Quality Indicators:** +**High Quality Indicators:** - Consistent naming conventions - Comprehensive column descriptions @@ -160,7 +201,7 @@ The schema tells you what data is actually available and how it's structured: - Clear primary/foreign key relationships - Reasonable column count (not too sparse/dense) -**🔴 Quality Concerns:** +**Quality Concerns:** - Inconsistent naming (camelCase + snake_case) - Missing column descriptions @@ -173,7 +214,7 @@ The schema tells you what data is actually available and how it's structured: -### 🎮 Interactive Exercise: Schema Detective Work +### Interactive Exercise: Schema Detective Work
@@ -218,7 +259,7 @@ DataHub's automated profiling reveals data patterns and quality issues: | **Distinct Count** | Data variety | Too few = poor granularity | | **Standard Deviation** | Data spread | Very high = inconsistent data | -**🎮 Practice Interpretation:** +**Practice Interpretation:** ``` customer_age: Min=18, Max=150, Mean=45, Median=42, Nulls=5% @@ -229,19 +270,19 @@ customer_age: Min=18, Max=150, Mean=45, Median=42, Nulls=5%
- + **Understanding categorical data:**
-**📊 Value Distribution:** +**Value Distribution:** - **Top Values**: Most common categories - **Unique Count**: How many distinct values - **Null Percentage**: Missing data rate -**🚨 Quality Signals:** +**Quality Signals:** - **Good**: Clear categories, low null rate - **Concerning**: Too many unique values, high null rate @@ -266,14 +307,14 @@ customer_status:
-**📈 Temporal Patterns:** +**Temporal Patterns:** - **Date Range**: How far back does data go? - **Update Frequency**: Daily, hourly, real-time? - **Gaps**: Missing time periods? - **Seasonality**: Regular patterns? -**🎯 Business Relevance:** +**Business Relevance:** - **Recent Data**: Good for current analysis - **Historical Depth**: Enables trend analysis @@ -285,7 +326,7 @@ customer_status: -### 🎮 Interactive Exercise: Data Quality Detective +### Interactive Exercise: Data Quality Detective
@@ -325,7 +366,7 @@ Understand how others use this data to validate your choice:
-**📊 Usage Indicators:** +**Usage Indicators:** | Pattern | Interpretation | Decision Impact | | --------------------- | ------------------------ | ------------------------------- | @@ -350,7 +391,7 @@ Understand how others use this data to validate your choice: - **Related Datasets**: Part of a larger, maintained ecosystem? - + **User-generated quality signals:** @@ -370,7 +411,7 @@ Synthesize all information into a clear decision:
-**🎯 Use This Dataset If:** +**Use This Dataset If:** - Health check shows green/yellow lights - Schema matches your analysis needs @@ -378,23 +419,23 @@ Synthesize all information into a clear decision: - Usage patterns indicate active maintenance - You can contact the owner if needed -**⚠️ Investigate Further If:** +**Investigate Further If:** -- 🟡 Some quality concerns but dataset is unique -- 🟡 Usage is low but data looks comprehensive -- 🟡 Owner is unclear but data seems current +- Some quality concerns but dataset is unique +- Usage is low but data looks comprehensive +- Owner is unclear but data seems current -**❌ Skip This Dataset If:** +**Skip This Dataset If:** -- 🔴 Multiple red flags in health check -- 🔴 Schema doesn't support your use case -- 🔴 Serious data quality issues -- 🔴 No recent usage and no owner contact -- 🔴 Better alternatives are available +- Multiple red flags in health check +- Schema doesn't support your use case +- Serious data quality issues +- No recent usage and no owner contact +- Better alternatives are available
-### 🎮 Final Exercise: Complete Dataset Evaluation +### Final Exercise: Complete Dataset Evaluation
@@ -405,10 +446,10 @@ Synthesize all information into a clear decision: ``` Dataset A: ________________ Dataset B: ________________ -Health Check: ⭐⭐⭐⭐⭐ Health Check: ⭐⭐⭐⭐⭐ -Schema Quality: ⭐⭐⭐⭐⭐ Schema Quality: ⭐⭐⭐⭐⭐ -Data Quality: ⭐⭐⭐⭐⭐ Data Quality: ⭐⭐⭐⭐⭐ -Usage Patterns: ⭐⭐⭐⭐⭐ Usage Patterns: ⭐⭐⭐⭐⭐ +Health Check: Excellent Health Check: Excellent +Schema Quality: Excellent Schema Quality: Excellent +Data Quality: Excellent Data Quality: Excellent +Usage Patterns: Excellent Usage Patterns: Excellent Total Score: ___/20 Total Score: ___/20 Winner: Dataset ___ @@ -423,19 +464,19 @@ Reason: ________________________
-**⚡ Speed Techniques:** +**Speed Techniques:** - Develop a mental checklist for rapid assessment - Use browser tabs to compare multiple datasets - Focus on deal-breakers first (freshness, schema fit) -**🎯 Accuracy Boosters:** +**Accuracy Boosters:** - Always check sample data when available - Cross-reference with lineage to understand data flow - Contact owners for clarification on edge cases -**🤝 Team Efficiency:** +**Team Efficiency:** - Document your evaluation criteria for consistency - Share findings with teammates to avoid duplicate work @@ -462,7 +503,7 @@ Choose the best customer dataset for a marketing campaign analysis. Justify your ## Common Evaluation Pitfalls - + **Problem**: Waiting for perfect data that doesn't exist @@ -484,7 +525,7 @@ Choose the best customer dataset for a marketing campaign analysis. Justify your - Verify assumptions with data owners - + **Problem**: Evaluating datasets in isolation without considering alternatives @@ -499,7 +540,7 @@ Choose the best customer dataset for a marketing campaign analysis. Justify your ## What You've Learned -🎉 **Excellent work!** You can now rapidly assess dataset quality and make informed decisions: +**Excellent work!** You can now rapidly assess dataset quality and make informed decisions: - **Health Assessment**: Quick evaluation of dataset reliability - **Schema Intelligence**: Understanding structure and relationships diff --git a/docs/learn-datahub/discovery/overview.md b/docs/learn-datahub/discovery/overview.md index 8fb837d8f4fb17..8f761a5223919f 100644 --- a/docs/learn-datahub/discovery/overview.md +++ b/docs/learn-datahub/discovery/overview.md @@ -155,7 +155,7 @@ Throughout this tutorial, you'll solve these common data challenges: This tutorial leverages Docusaurus's interactive capabilities: - + **Live Search Practice**: Try real searches in your DataHub instance **Interactive Filters**: Step-by-step filter application diff --git a/docs/learn-datahub/governance/business-glossary.md b/docs/learn-datahub/governance/business-glossary.md index 9cb155adab60f6..b1487d28888b70 100644 --- a/docs/learn-datahub/governance/business-glossary.md +++ b/docs/learn-datahub/governance/business-glossary.md @@ -126,50 +126,28 @@ Organize terms into logical hierarchies for better navigation: Set up high-level categories using DataHub's glossary hierarchy: - +**Business Glossary Term Hierarchy:** + +``` +Customer Terms +├── 📂 Customer Identification +│ ├── Customer ID +│ └── Customer Segment +├── 📂 Customer Behavior +│ ├── Active Customer +│ └── Customer Engagement +└── 📂 Customer Value + ├── Customer Lifetime Value (CLV) + └── Customer Acquisition Cost (CAC) + +Financial Terms +├── 📂 Revenue Metrics +│ ├── Revenue Recognition +│ └── Monthly Recurring Revenue (MRR) +└── 📂 Cost Metrics + ├── Cost of Goods Sold (COGS) + └── Operating Expenses (OPEX) +``` #### Step 2: Implement Hierarchies diff --git a/docs/learn-datahub/governance/data-classification.md b/docs/learn-datahub/governance/data-classification.md index f0cfd49e4ffffb..a25c1d515fd65c 100644 --- a/docs/learn-datahub/governance/data-classification.md +++ b/docs/learn-datahub/governance/data-classification.md @@ -63,10 +63,10 @@ DataHub supports industry-standard classification levels: **Classification Levels**: -- **🔴 Restricted**: PII, financial data, trade secrets (highest protection) -- **🟡 Confidential**: Internal business data, customer insights +- **Restricted**: PII, financial data, trade secrets (highest protection) +- **Confidential**: Internal business data, customer 
insights - **🔵 Internal**: General business information, operational data -- **🟢 Public**: Marketing materials, published reports +- **Public**: Marketing materials, published reports ### Exercise 1: Implement PII Detection diff --git a/docs/learn-datahub/governance/governance-policies.md b/docs/learn-datahub/governance/governance-policies.md index 4fa50cf417d055..e507262d41aba3 100644 --- a/docs/learn-datahub/governance/governance-policies.md +++ b/docs/learn-datahub/governance/governance-policies.md @@ -33,11 +33,11 @@ DataHub policies automate governance enforcement through configurable rules that **Policy Types**: -- **🔒 Access Policies**: Control who can view or modify data -- **📋 Metadata Policies**: Enforce required metadata standards -- **🚨 Quality Policies**: Monitor data quality and trigger alerts +- **Access Policies**: Control who can view or modify data +- **Metadata Policies**: Enforce required metadata standards +- **Quality Policies**: Monitor data quality and trigger alerts - **Approval Policies**: Require reviews for sensitive operations -- **📊 Compliance Policies**: Ensure regulatory requirement adherence +- **Compliance Policies**: Ensure regulatory requirement adherence ### Exercise 1: Create Metadata Compliance Policies @@ -299,11 +299,69 @@ Track these key metrics: Evaluate your organization's governance maturity: -**Level 1 - Basic**: Manual processes, reactive governance -**Level 2 - Managed**: Some automation, defined processes -**Level 3 - Defined**: Comprehensive policies, proactive monitoring -**Level 4 - Quantitatively Managed**: Metrics-driven optimization -**Level 5 - Optimizing**: Continuous improvement, predictive governance + ### Congratulations! diff --git a/docs/learn-datahub/governance/ownership-management.md b/docs/learn-datahub/governance/ownership-management.md index 877da8da538e49..1f694b6e0aa168 100644 --- a/docs/learn-datahub/governance/ownership-management.md +++ b/docs/learn-datahub/governance/ownership-management.md @@ -46,10 +46,10 @@ DataHub supports multiple ownership types to reflect real organizational structu **Ownership Types Explained**: -- **👨‍💻 Technical Owner**: Responsible for data pipeline maintenance, schema changes, and technical issues -- **👔 Business Owner**: Accountable for data accuracy, business rules, and stakeholder communication -- **🛡️ Data Steward**: Ensures data quality, compliance, and governance standards -- **📊 Data Owner**: Ultimate accountability for data asset (often a senior business leader) +- **Technical Owner**: Responsible for data pipeline maintenance, schema changes, and technical issues +- **Business Owner**: Accountable for data accuracy, business rules, and stakeholder communication +- **Data Steward**: Ensures data quality, compliance, and governance standards +- **Data Owner**: Ultimate accountability for data asset (often a senior business leader) ### Exercise 1: Assign Dataset Owners diff --git a/docs/learn-datahub/ingestion/overview.md b/docs/learn-datahub/ingestion/overview.md index 5bb1e14d0df0b5..2a1cdffbab2add 100644 --- a/docs/learn-datahub/ingestion/overview.md +++ b/docs/learn-datahub/ingestion/overview.md @@ -116,11 +116,11 @@ DataHub's ingestion framework provides enterprise-grade capabilities: **Key Ingestion Capabilities**: -- **🔌 Universal Connectors**: 50+ pre-built connectors for popular data systems -- **⚡ High Performance**: Optimized for large-scale enterprise environments -- **🔄 Incremental Updates**: Stateful ingestion for efficient metadata synchronization -- **📊 Automated 
Profiling**: Deep data content analysis and quality metrics -- **🎯 Flexible Configuration**: Customizable extraction, transformation, and loading +- **Universal Connectors**: 50+ pre-built connectors for popular data systems +- **High Performance**: Optimized for large-scale enterprise environments +- **Incremental Updates**: Stateful ingestion for efficient metadata synchronization +- **Automated Profiling**: Deep data content analysis and quality metrics +- **Flexible Configuration**: Customizable extraction, transformation, and loading ### Ingestion Framework Components @@ -150,11 +150,69 @@ DataHub's ingestion framework provides enterprise-grade capabilities: ### Ingestion Maturity Levels -**Level 1 - Basic**: Manual metadata entry, ad-hoc documentation -**Level 2 - Automated**: Scheduled ingestion, basic source coverage -**Level 3 - Optimized**: Stateful ingestion, profiling, performance tuning -**Level 4 - Advanced**: Custom transformers, complex lineage, multi-environment -**Level 5 - Intelligent**: ML-driven optimization, predictive metadata management + ### Common Ingestion Challenges diff --git a/docs/learn-datahub/lineage/impact-analysis.md b/docs/learn-datahub/lineage/impact-analysis.md index 0687922a2a4c2a..7353923849f7b1 100644 --- a/docs/learn-datahub/lineage/impact-analysis.md +++ b/docs/learn-datahub/lineage/impact-analysis.md @@ -131,7 +131,7 @@ glossaryTerms: ['Recommendation Model'] Let's apply this to our scenario: - + **System/Dataset**: `customer_analytics_pipeline` (Hive tables) **Change Type**: Platform migration (Hive → Snowflake) @@ -149,7 +149,7 @@ Let's apply this to our scenario: - Enable real-time customer insights - + **High-Risk Elements**: @@ -254,7 +254,7 @@ Transform your downstream map into quantified risk scores: - **Operational Criticality**: Is this needed for daily operations? - + **Complexity Scale (1-5)**: @@ -272,7 +272,7 @@ Transform your downstream map into quantified risk scores: - **Rollback Difficulty**: How easy is it to undo changes? 
- + **Risk Score Formula**: @@ -288,9 +288,9 @@ Risk Score = Business Impact × Technical Complexity × Urgency Factor **Risk Categories**: -- **20-25**: 🔴 **Critical Risk** - Executive approval required -- **15-19**: 🟡 **High Risk** - Detailed mitigation plan needed -- **10-14**: 🟢 **Medium Risk** - Standard change process +- **20-25**: **Critical Risk** - Executive approval required +- **15-19**: **High Risk** - Detailed mitigation plan needed +- **10-14**: **Medium Risk** - Standard change process - **5-9**: 🔵 **Low Risk** - Routine change management - **1-4**: ⚪ **Minimal Risk** - Proceed with standard testing @@ -303,13 +303,13 @@ Risk Score = Business Impact × Technical Complexity × Urgency Factor **TechFlow Customer Analytics Migration Risk Assessment**: -| Downstream System | Business Impact | Technical Complexity | Risk Score | Category | -| ----------------- | --------------- | -------------------- | ---------- | ----------- | -| Sales Dashboard | 5 (Critical) | 3 (Moderate) | 22.5 | 🔴 Critical | -| CEO Reports | 4 (High) | 2 (Simple) | 12 | 🟢 Medium | -| ML Pipeline | 3 (Medium) | 4 (Complex) | 18 | 🟡 High | -| Mobile API | 5 (Critical) | 3 (Moderate) | 22.5 | 🔴 Critical | -| Archive System | 1 (Minimal) | 1 (Trivial) | 1.5 | ⚪ Minimal | +| Downstream System | Business Impact | Technical Complexity | Risk Score | Category | +| ----------------- | --------------- | -------------------- | ---------- | ---------- | +| Sales Dashboard | 5 (Critical) | 3 (Moderate) | 22.5 | Critical | +| CEO Reports | 4 (High) | 2 (Simple) | 12 | Medium | +| ML Pipeline | 3 (Medium) | 4 (Complex) | 18 | High | +| Mobile API | 5 (Critical) | 3 (Moderate) | 22.5 | Critical | +| Archive System | 1 (Minimal) | 1 (Trivial) | 1.5 | ⚪ Minimal | **Analysis**: 2 Critical Risk systems require executive approval and detailed rollback plans. 
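+
+To sanity-check the numbers in the table, here is a minimal Python sketch of the scoring formula. The business impact and complexity values come from the table above; the 1.5 urgency factor is inferred from the published scores (5 × 3 × 1.5 = 22.5) rather than stated explicitly, so treat it as an assumption.
+
+```python
+def risk_score(business_impact: int, technical_complexity: int, urgency_factor: float = 1.5) -> float:
+    return business_impact * technical_complexity * urgency_factor
+
+def risk_category(score: float) -> str:
+    if score >= 20:
+        return "Critical"
+    if score >= 15:
+        return "High"
+    if score >= 10:
+        return "Medium"
+    if score >= 5:
+        return "Low"
+    return "Minimal"
+
+systems = {
+    "Sales Dashboard": (5, 3),
+    "CEO Reports": (4, 2),
+    "ML Pipeline": (3, 4),
+    "Mobile API": (5, 3),
+    "Archive System": (1, 1),
+}
+
+for name, (impact, complexity) in systems.items():
+    score = risk_score(impact, complexity)
+    print(f"{name}: {score} -> {risk_category(score)}")
+```
+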
@@ -325,13 +325,13 @@ Identify who needs to be involved in the change: **Stakeholder Categories**: -**🎯 Primary Stakeholders** (Directly affected): +**Primary Stakeholders** (Directly affected): - **Data Consumers**: Teams using the affected data - **System Owners**: Technical teams responsible for downstream systems - **Business Users**: People whose work depends on the data -**🤝 Secondary Stakeholders** (Coordination required): +**Secondary Stakeholders** (Coordination required): - **Infrastructure Teams**: Platform and DevOps support - **Security Teams**: Access control and compliance @@ -348,7 +348,7 @@ Identify who needs to be involved in the change: ### Communication Strategy - + **Technical Impact Report Template**: @@ -375,7 +375,7 @@ Identify who needs to be involved in the change: ``` - + **Business Impact Summary Template**: @@ -404,7 +404,7 @@ Contact: @data-team or @project-manager ``` - + **Executive Impact Brief Template**: @@ -448,21 +448,21 @@ Develop comprehensive plans to minimize risk: **Risk Mitigation Categories**: -**🛡️ Preventive Measures** (Avoid problems): +**Preventive Measures** (Avoid problems): - **Comprehensive testing**: Validate all connections before go-live - **Staged rollout**: Migrate non-critical systems first - **Communication plan**: Ensure all stakeholders are prepared - **Documentation updates**: Keep all procedures current -**🚨 Detective Measures** (Catch problems early): +**Detective Measures** (Catch problems early): - **Monitoring alerts**: Set up notifications for system failures - **Health checks**: Automated validation of data flow - **User feedback channels**: Quick reporting of issues - **Performance monitoring**: Track system response times -**🔧 Corrective Measures** (Fix problems quickly): +**Corrective Measures** (Fix problems quickly): - **Rollback procedures**: Detailed steps to revert changes - **Emergency contacts**: 24/7 support team availability @@ -476,7 +476,7 @@ Develop comprehensive plans to minimize risk: **Critical Success Factor**: Every change needs a tested rollback plan. 
- + **Rollback Decision Matrix**: @@ -528,7 +528,7 @@ Develop comprehensive plans to minimize risk: - [ ] Mobile app functions normally with new backend - + **Migration Success Criteria**: @@ -626,7 +626,7 @@ Choose a real system change in your organization and perform a complete impact a ## What You've Accomplished -🎉 **Outstanding work!** You've transformed from basic lineage viewing to expert-level impact analysis: +**Outstanding work!** You've transformed from basic lineage viewing to expert-level impact analysis: - **Systematic approach**: You can now analyze any system change methodically - **Risk quantification**: You understand how to score and prioritize risks diff --git a/docs/learn-datahub/lineage/reading-lineage.md b/docs/learn-datahub/lineage/reading-lineage.md index 045720ab688f6b..9188059fd4b135 100644 --- a/docs/learn-datahub/lineage/reading-lineage.md +++ b/docs/learn-datahub/lineage/reading-lineage.md @@ -55,8 +55,8 @@ Every element in a lineage graph tells a specific story: - **Raw Tables**: Source system data (often rectangular nodes) - **Analytical Views**: Processed, business-ready data -- **🔄 Materialized Views**: Pre-computed results for performance -- **📁 File Assets**: CSV, Parquet, JSON files in data lakes +- **Materialized Views**: Pre-computed results for performance +- **File Assets**: CSV, Parquet, JSON files in data lakes **Visual Cues in DataHub**: @@ -77,10 +77,10 @@ Every element in a lineage graph tells a specific story: **Data Processing Elements**: -- **🔄 ETL Jobs**: Extract, Transform, Load processes -- **🐍 Python Scripts**: Custom data processing logic +- **ETL Jobs**: Extract, Transform, Load processes +- **Python Scripts**: Custom data processing logic - **dbt Models**: Data transformation workflows -- **⚡ Spark Jobs**: Large-scale data processing +- **Spark Jobs**: Large-scale data processing **Connection Patterns**: @@ -91,14 +91,14 @@ Every element in a lineage graph tells a specific story: **Analysis Technique**: Jobs between datasets show _how_ data is transformed, not just _that_ it flows. - + **Business Applications**: - **BI Dashboards**: Looker, Tableau, PowerBI reports -- **🤖 ML Models**: Training and inference pipelines -- **📱 Applications**: Customer-facing features -- **📧 Automated Reports**: Scheduled business reports +- **ML Models**: Training and inference pipelines +- **Applications**: Customer-facing features +- **Automated Reports**: Scheduled business reports **Business Impact Indicators**: @@ -262,49 +262,365 @@ The connections between nodes reveal how data is processed: ### Reading Connection Types - + -**One-to-One Relationships**: +#### One-to-One Relationships -``` -Raw Customer Data → Customer Analytics Table -``` + -**What this means**: Direct processing, usually filtering, aggregation, or enrichment +**What this means**: Direct processing with filtering, aggregation, or enrichment. The transformation is straightforward and predictable. -**Many-to-One Relationships**: +#### Many-to-One Relationships -``` -Orders + Customers + Products → Sales Analytics -``` + -**What this means**: Data joining and consolidation +**What this means**: Data joining and consolidation from multiple sources. Complex business logic combines different data domains. -**Analysis Approach**: Look for SQL logic, dbt models, or ETL job definitions to understand the exact transformation. +**Analysis Approach**: Look for SQL logic, dbt models, or ETL job definitions to understand the exact transformation rules and join conditions. 
- + -**Fan-Out Patterns**: +#### Fan-Out Patterns -``` -Raw Events → [Processing Job] → Multiple Analytics Tables -``` + -**Business Meaning**: One source feeding multiple business use cases +**Business Meaning**: One source feeding multiple business use cases. Each downstream system serves different teams and purposes. -**Fan-In Patterns**: +#### Fan-In Patterns -``` -Multiple Sources → [ETL Job] → Single Data Warehouse Table -``` + -**Business Meaning**: Data consolidation from various systems +**Business Meaning**: Data consolidation from various systems into a single, comprehensive view. -**🚨 Risk Assessment**: Fan-out = high impact if source breaks; Fan-in = complex debugging if output is wrong +**Risk Assessment**: + +- **Fan-out** = High impact if source breaks (affects multiple downstream systems) +- **Fan-in** = Complex debugging if output is wrong (multiple potential failure points) - + + +#### Batch vs Real-Time Processing Patterns + + **Batch Processing Indicators**: @@ -315,10 +631,10 @@ Multiple Sources → [ETL Job] → Single Data Warehouse Table **Real-Time Processing Indicators**: - **Streaming connections**: Kafka topics, event streams -- **Near real-time**: Minimal processing delay +- **Near real-time**: Minimal processing delay (seconds to minutes) - **Continuous updates**: Always-fresh data -**⚡ Performance Insight**: Understand processing schedules to set proper expectations for data freshness. +**Performance Insight**: Understanding processing schedules helps set proper expectations for data freshness and availability. @@ -344,25 +660,341 @@ Identify the most important connections in your data ecosystem: ### Interactive Exercise: Critical Path Identification -
+ + +**Scenario**: You're the Data Reliability Engineer at TechFlow Analytics. The CEO wants to know which data assets are most critical to business operations. + + + +**Dependency Count Analysis** (visible in the diagram above): + +- **customer_transactions** → feeds **3 systems** (see purple connection lines): revenue_pipeline, executive_datasource, sales_dashboard +- **customer_metrics** → feeds **4 systems** (see blue connection lines): executive_datasource, churn_model, customer_api, compliance_report +- **revenue_pipeline** → feeds **3 systems** (see orange connection lines): customer_metrics, executive_datasource, churn_model +- **user_events** → feeds **2 systems** (see green connection lines): customer_metrics, churn_model + +**Your Analysis Task**: + +Using the lineage diagram above, calculate the **Critical Score** for each asset using this formula: + +**Critical Score = (Business Impact × Downstream Dependencies) + Failure Risk** -**Scenario**: You're responsible for data reliability at TechFlow Analytics +Where: -**Your Task**: Using lineage, identify the top 3 most critical data assets +- **Business Impact**: 1-10 (10 = affects revenue/customers directly) +- **Downstream Dependencies**: Count of systems that depend on this asset +- **Failure Risk**: 1-10 (10 = high probability of failure) **Analysis Framework**: -``` -Asset Name: ________________________ -Downstream Dependencies: ____________ -Business Impact (1-10): _____________ -Failure Risk (1-10): _______________ -Critical Score: ____________________ -``` + + -**Success Criteria**: You can explain why these 3 assets deserve the most monitoring and protection. +**Step 1**: Count the downstream dependencies for each asset by examining the lineage diagram: -
+| Asset | Business Impact (1-10) | Downstream Count | Failure Risk (1-10) | Critical Score | +| --------------------- | ---------------------- | ---------------- | ------------------- | -------------- | +| customer_transactions | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ | +| revenue_pipeline | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ | +| customer_metrics | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ | +| user_events | \_\_\_ | \_\_\_ | \_\_\_ | \_\_\_ | + +**Step 2**: Rank your top 3 most critical assets: + +1. **Most Critical**: ******\_\_\_\_****** +2. **Second Critical**: ******\_\_\_\_****** +3. **Third Critical**: ******\_\_\_\_****** + +**Step 3**: Justify your choices: + +- **Why is #1 most critical?** ******\_\_\_\_****** +- **What monitoring would you implement?** ******\_\_\_\_****** + +
+ + +**Correct Analysis** (Data Reliability Engineer perspective): + +| Asset | Business Impact | Downstream Count | Failure Risk | Critical Score | Reasoning | +| ------------------------- | --------------- | ---------------- | ------------ | -------------- | -------------------------------- | +| **customer_transactions** | **10** | **4** | **6** | **46** | Revenue source feeding 4 systems | +| **revenue_pipeline** | **9** | **3** | **8** | **35** | Critical ETL with Warning status | +| **customer_metrics** | **8** | **4** | **5** | **37** | KPIs feeding multiple dashboards | +| **user_events** | **7** | **2** | **4** | **18** | Important but fewer dependencies | + +**Top 3 Critical Assets** (in priority order): + +### 1. **customer_transactions** (Score: 46) - HIGHEST PRIORITY + +**Why Critical**: + +- Direct revenue impact (Business Impact: 10/10) +- Feeds 4 downstream systems (revenue_pipeline, customer_metrics, executive_dashboard, sales_dashboard) +- Single point of failure for all revenue reporting + +**Monitoring Strategy**: + +- Real-time transaction volume monitoring +- Data freshness alerts (< 5 minute SLA) +- Schema change detection +- Database connection health checks +- Automated failover to backup systems + +### 2. **customer_metrics** (Score: 37) - HIGH PRIORITY + +**Why Critical**: + +- Core business KPIs (Business Impact: 8/10) +- Feeds executive dashboard, churn model, customer API, compliance reports +- ML model dependency creates cascading failures + +**Monitoring Strategy**: + +- Data quality assertions on key metrics +- Anomaly detection on metric values +- Lineage validation checks +- Model performance monitoring + +### 3. **revenue_pipeline** (Score: 35) - HIGH PRIORITY + +**Why Critical**: + +- Already showing Warning status (Failure Risk: 8/10) +- Critical ETL processing revenue data +- Scheduled dependency (failure affects daily reporting) + +**Monitoring Strategy**: + +- Job execution monitoring with alerts +- Data pipeline SLA tracking +- Resource utilization monitoring +- Automated retry mechanisms +- Escalation procedures for failures + +**Key Insight**: `customer_transactions` is the highest priority because it's both the revenue source AND feeds the most downstream systems. If it fails, everything breaks. + + + + +**Mistake #1: Focusing Only on Business Impact** +❌ **Wrong**: "Executive dashboard is most critical because the CEO uses it" +✅ **Correct**: "customer_transactions is most critical because it feeds the executive dashboard AND 3 other systems" + +**Why**: Single points of failure with many dependencies are more critical than high-visibility endpoints. + +**Mistake #2: Ignoring Current Health Status** +❌ **Wrong**: "All systems look healthy, so failure risk is low" +✅ **Correct**: "revenue_pipeline shows Warning status, indicating higher failure risk" + +**Why**: Current system health is a leading indicator of future failures. + +**Mistake #3: Not Considering Cascading Failures** +❌ **Wrong**: "Each system failure affects only its direct outputs" +✅ **Correct**: "customer_transactions failure cascades through revenue_pipeline to all dashboards" + +**Why**: Data lineage shows how failures propagate through the entire ecosystem. + +**Mistake #4: Overlooking Processing Dependencies** +❌ **Wrong**: "Dashboards are most critical because users see them" +✅ **Correct**: "The ETL jobs feeding dashboards are more critical because dashboard failures often start there" + +**Why**: Processing bottlenecks are common failure points that affect multiple outputs. 
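+
+If you want to double-check the arithmetic behind the expert scores, here is a short sketch of the Critical Score formula using the impact, dependency, and risk values from the table above (illustrative only).
+
+```python
+def critical_score(business_impact: int, downstream_deps: int, failure_risk: int) -> int:
+    return business_impact * downstream_deps + failure_risk
+
+assets = {
+    "customer_transactions": (10, 4, 6),
+    "revenue_pipeline": (9, 3, 8),
+    "customer_metrics": (8, 4, 5),
+    "user_events": (7, 2, 4),
+}
+
+ranked = sorted(assets.items(), key=lambda item: critical_score(*item[1]), reverse=True)
+for name, values in ranked:
+    print(f"{name}: {critical_score(*values)}")
+# customer_transactions: 46, customer_metrics: 37, revenue_pipeline: 35, user_events: 18
+```
+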
+ +**Learning Checkpoint**: Did your analysis match the expert ranking? If not, review the lineage diagram to understand the dependency patterns you missed. + + +
+ +**Success Validation**: +✅ **Beginner**: Identified customer_transactions as high priority +✅ **Intermediate**: Correctly calculated critical scores using the formula +✅ **Advanced**: Recognized revenue_pipeline's Warning status as a risk factor +✅ **Expert**: Proposed specific monitoring strategies for each critical asset + + ## Pro Tips for Lineage Reading @@ -380,7 +1012,7 @@ Critical Score: ____________________ - **Check recency**: When was lineage last updated? Stale lineage can mislead - **Cross-reference documentation**: Combine lineage with technical docs and business context -**🤝 Team Efficiency**: +**Team Efficiency**: - **Document your findings**: Share critical path analysis with your team - **Create lineage maps**: Visual summaries for non-technical stakeholders diff --git a/docs/learn-datahub/lineage/troubleshooting.md b/docs/learn-datahub/lineage/troubleshooting.md index bfdad38be3e90e..fc4c8fab8ee279 100644 --- a/docs/learn-datahub/lineage/troubleshooting.md +++ b/docs/learn-datahub/lineage/troubleshooting.md @@ -53,26 +53,26 @@ className: 'process-node', }, { id: '3', -data: { label: '⚙️ 3. Validate Ingestion\nReview ingestion logs' }, +data: { label: '3. Validate Ingestion\nReview ingestion logs' }, position: { x: 400, y: 100 }, className: 'process-node', }, { id: '4', -data: { label: '🔧 4. Handle Edge Cases\nAddress complex scenarios' }, +data: { label: '4. Handle Edge Cases\nAddress complex scenarios' }, position: { x: 600, y: 100 }, className: 'process-node', }, { id: '5', type: 'output', -data: { label: '📈 5. Implement Monitoring\nSet up alerts & tracking' }, +data: { label: '5. Implement Monitoring\nSet up alerts & tracking' }, position: { x: 800, y: 100 }, className: 'success-node', }, { id: 'tip1', -data: { label: '💡 Pro Tip: Start with the\nmost critical missing link' }, +data: { label: 'Pro Tip: Start with the\nmost critical missing link' }, position: { x: 0, y: 250 }, className: 'user-node', }, @@ -91,7 +91,7 @@ edges={[ { id: 'e1-tip1', source: '1', target: 'tip1', style: { strokeDasharray: '5,5' }, label: 'tip' }, { id: 'e5-tip2', source: '5', target: 'tip2', style: { strokeDasharray: '5,5' }, label: 'outcome' }, ]} -title="🔧 Lineage Troubleshooting Framework" +title="Lineage Troubleshooting Framework" height="350px" /> @@ -101,28 +101,28 @@ Understanding the most frequent problems helps you troubleshoot faster:
-**🔍 Missing Connections** (60% of issues): +**Missing Connections** (60% of issues): - New systems not yet configured for metadata ingestion - Changes in connection strings or authentication - Processing jobs that don't emit lineage metadata - Manual data movement processes -**📊 Incomplete Metadata** (25% of issues): +**Incomplete Metadata** (25% of issues): - Partial schema information from source systems - Missing column-level lineage in transformations - Outdated metadata from infrequent ingestion runs - Custom applications without metadata instrumentation -**⚡ Performance Problems** (10% of issues): +**Performance Problems** (10% of issues): - Lineage graphs too complex to render efficiently - Ingestion jobs timing out on large metadata volumes - UI responsiveness issues with deep lineage paths - Memory constraints during lineage computation -**🔄 Stale Information** (5% of issues): +**Stale Information** (5% of issues): - Metadata not refreshed after system changes - Cached lineage information showing old connections @@ -138,7 +138,7 @@ Systematic gap identification prevents wasted troubleshooting effort: ### The Gap Analysis Method - + **Gap Documentation Template**: @@ -163,7 +163,7 @@ Last Known Working: Never appeared in DataHub lineage ``` - + **Missing Lineage Impact**: @@ -182,19 +182,19 @@ Last Known Working: Never appeared in DataHub lineage - **Team coordination issues**: ML team not notified of customer data changes - + **Troubleshooting Priority Matrix**: -| Business Impact | Technical Complexity | Priority | Action Timeline | -| --------------- | -------------------- | ------------ | ------------------- | -| High | Low | 🔴 Critical | Fix within 24 hours | -| High | High | 🟡 Important | Fix within 1 week | -| Medium | Low | 🟢 Standard | Fix within 2 weeks | -| Medium | High | 🔵 Planned | Fix within 1 month | -| Low | Any | ⚪ Backlog | Fix when convenient | +| Business Impact | Technical Complexity | Priority | Action Timeline | +| --------------- | -------------------- | ---------- | ------------------- | +| High | Low | Critical | Fix within 24 hours | +| High | High | Important | Fix within 1 week | +| Medium | Low | Standard | Fix within 2 weeks | +| Medium | High | 🔵 Planned | Fix within 1 month | +| Low | Any | ⚪ Backlog | Fix when convenient | -**TechFlow ML Pipeline**: High business impact (compliance risk) + Medium complexity = 🟡 Important (1 week timeline) +**TechFlow ML Pipeline**: High business impact (compliance risk) + Medium complexity = Important (1 week timeline) @@ -207,21 +207,21 @@ Most lineage issues stem from ingestion configuration problems:
-**🔍 Source System Verification**: +**Source System Verification**: - [ ] **System connectivity**: Can DataHub reach the source system? - [ ] **Authentication**: Are credentials valid and permissions sufficient? - [ ] **Metadata availability**: Does the source system expose lineage information? - [ ] **Recent changes**: Have there been system updates or migrations? -**📊 Ingestion Configuration**: +**Ingestion Configuration**: - [ ] **Recipe accuracy**: Is the ingestion recipe configured correctly? - [ ] **Scheduling**: Is the ingestion running on the expected schedule? - [ ] **Scope coverage**: Are all relevant databases/schemas included? - [ ] **Lineage extraction**: Is lineage extraction enabled in the recipe? -**⚡ Execution Status**: +**Execution Status**: - [ ] **Recent runs**: Has ingestion executed successfully recently? - [ ] **Error logs**: Are there any ingestion failures or warnings? @@ -271,7 +271,7 @@ Deep-dive into ingestion mechanics to find the root cause: ### Ingestion Debugging Techniques - + **Log Investigation Strategy**: @@ -304,7 +304,7 @@ Root Cause: Python ML scripts don't emit DataHub-compatible lineage ``` - + **Metadata Completeness Check**: @@ -336,7 +336,7 @@ AND urn LIKE '%customer_segments%'; ``` - + **Recipe Optimization**: @@ -392,25 +392,25 @@ Real-world data pipelines often include scenarios that standard ingestion can't
-**🔧 Manual Data Processes**: +**Manual Data Processes**: - **Problem**: Excel files, manual data entry, ad-hoc scripts - **Solution**: Custom metadata emission or documentation-based lineage - **Implementation**: Create "virtual" datasets representing manual processes -**🔄 External System Dependencies**: +**External System Dependencies**: - **Problem**: Third-party APIs, vendor data feeds, external databases - **Solution**: Proxy datasets or external system connectors - **Implementation**: Document external dependencies as DataHub entities -**⚡ Real-time Processing**: +**Real-time Processing**: - **Problem**: Streaming pipelines, event-driven architectures, microservices - **Solution**: Event-based lineage capture or instrumentation - **Implementation**: Custom lineage emission from application code -**🎯 Complex Transformations**: +**Complex Transformations**: - **Problem**: Multi-step ETL, custom business logic, conditional processing - **Solution**: Job-level lineage with detailed transformation documentation @@ -493,7 +493,7 @@ lineage_mce = make_lineage_mce( - Contact information for external data issues - + **Code-Level Lineage Emission**: @@ -549,21 +549,21 @@ Proactive lineage quality management prevents future troubleshooting:
-**📊 Quality Metrics**: +**Quality Metrics**: - **Coverage**: Percentage of data assets with complete lineage - **Freshness**: How recently lineage information was updated - **Accuracy**: Validation of lineage against known data flows - **Completeness**: Presence of both upstream and downstream connections -**🚨 Alert Conditions**: +**Alert Conditions**: - **Missing lineage**: New datasets without any lineage connections - **Stale metadata**: Lineage not updated within expected timeframe - **Broken connections**: Previously connected systems showing gaps - **Ingestion failures**: Metadata extraction jobs failing repeatedly -**🔄 Maintenance Tasks**: +**Maintenance Tasks**: - **Regular validation**: Quarterly review of critical data lineage - **Configuration updates**: Adjust ingestion recipes as systems evolve @@ -575,7 +575,7 @@ Proactive lineage quality management prevents future troubleshooting: ### Monitoring Implementation - + **Lineage Quality Dashboard**: @@ -622,7 +622,7 @@ def check_lineage_quality(): ``` - + **Quarterly Lineage Review Process**: @@ -710,11 +710,11 @@ Identify a lineage gap in your organization and resolve it using the systematic ## Mission Accomplished: Lineage Mastery Complete! -🎉 **Congratulations!** You've completed the entire Data Lineage & Impact Analysis series and achieved expert-level proficiency: +**Congratulations!** You've completed the entire Data Lineage & Impact Analysis series and achieved expert-level proficiency: -**🔍 Reading Lineage Graphs**: Navigate any complexity with confidence -**📊 Performing Impact Analysis**: Systematically assess and communicate change risks -**🔧 Lineage Troubleshooting**: Diagnose and resolve any lineage quality issue +**Reading Lineage Graphs**: Navigate any complexity with confidence +**Performing Impact Analysis**: Systematically assess and communicate change risks +**Lineage Troubleshooting**: Diagnose and resolve any lineage quality issue **Your New Capabilities**: @@ -726,12 +726,12 @@ Identify a lineage gap in your organization and resolve it using the systematic **Real-World Impact**: You're now equipped to handle the most complex data lineage challenges in production environments, from multi-system migrations to compliance audits to incident response. :::tip Mark Your Progress -Check off "Lineage Troubleshooting" in the progress tracker above! You've completed the entire lineage mastery series! 🎉 +Check off "Lineage Troubleshooting" in the progress tracker above! You've completed the entire lineage mastery series! 
::: --- -**🎯 Ready for More?** Continue your DataHub expertise journey with: +**Ready for More?** Continue your DataHub expertise journey with: - **Data Governance Fundamentals (coming soon)** - Master ownership, classification, and business glossary - **Data Quality & Monitoring (coming soon)** - Learn assertions, health dashboards, and incident management diff --git a/docs/learn-datahub/privacy/overview.md b/docs/learn-datahub/privacy/overview.md index 15679db6925148..2d5d47ea07fa8e 100644 --- a/docs/learn-datahub/privacy/overview.md +++ b/docs/learn-datahub/privacy/overview.md @@ -121,11 +121,11 @@ DataHub provides comprehensive privacy management through: **Key Privacy Capabilities**: -- **🔍 Automated PII Discovery**: ML-powered detection of personal data across all systems -- **🛡️ Privacy Controls**: Automated enforcement of data minimization and purpose limitation -- **📋 Compliance Automation**: Streamlined data subject request fulfillment -- **📊 Privacy Analytics**: Comprehensive reporting and audit trail generation -- **🌍 Cross-Border Compliance**: Monitoring and controls for international data transfers +- **Automated PII Discovery**: ML-powered detection of personal data across all systems +- **Privacy Controls**: Automated enforcement of data minimization and purpose limitation +- **Compliance Automation**: Streamlined data subject request fulfillment +- **Privacy Analytics**: Comprehensive reporting and audit trail generation +- **Cross-Border Compliance**: Monitoring and controls for international data transfers ### Privacy Regulatory Landscape @@ -155,11 +155,69 @@ DataHub provides comprehensive privacy management through: ### Privacy Maturity Assessment -**Level 1 - Reactive**: Manual privacy processes, compliance gaps -**Level 2 - Managed**: Basic privacy controls, some automation -**Level 3 - Proactive**: Comprehensive privacy program, systematic controls -**Level 4 - Optimized**: Advanced privacy engineering, predictive compliance -**Level 5 - Privacy-by-Design**: Privacy embedded in all data processes + ### Success Metrics diff --git a/docs/learn-datahub/quality/data-assertions.md b/docs/learn-datahub/quality/data-assertions.md index b517709300e10f..fb23612b69d339 100644 --- a/docs/learn-datahub/quality/data-assertions.md +++ b/docs/learn-datahub/quality/data-assertions.md @@ -262,7 +262,7 @@ DataHub provides comprehensive assertion monitoring: - **Warning**: Minor issues detected, investigate soon - **Failing**: Critical issues found, immediate attention required - **Paused**: Assertion temporarily disabled -- 🔄 **Running**: Currently executing validation +- **Running**: Currently executing validation **Assertion History**: diff --git a/docs/learn-datahub/quality/incident-management.md b/docs/learn-datahub/quality/incident-management.md index 54b74d9b1fe4a1..880b7340ecf852 100644 --- a/docs/learn-datahub/quality/incident-management.md +++ b/docs/learn-datahub/quality/incident-management.md @@ -50,11 +50,11 @@ Systematic incident management transforms chaotic fire-fighting into structured, **Incident Management Components**: -- **🚨 Automated Detection**: Intelligent alerting based on quality thresholds -- **📋 Structured Response**: Standardized workflows for different incident types +- **Automated Detection**: Intelligent alerting based on quality thresholds +- **Structured Response**: Standardized workflows for different incident types - **SLA Management**: Time-bound response and resolution commitments - **Impact Assessment**: Business impact evaluation and 
prioritization -- **🔄 Post-Incident Review**: Learning and improvement processes +- **Post-Incident Review**: Learning and improvement processes ### Exercise 1: Set Up Incident Detection @@ -64,10 +64,10 @@ Configure intelligent alerting that triggers appropriate response levels: **Severity Classification**: -- **🔴 Critical (P0)**: Complete data unavailability or major accuracy issues affecting revenue/customers -- **🟡 High (P1)**: Significant quality degradation affecting business operations +- **Critical (P0)**: Complete data unavailability or major accuracy issues affecting revenue/customers +- **High (P1)**: Significant quality degradation affecting business operations - **🟠 Medium (P2)**: Quality issues affecting specific use cases or reports -- **🟢 Low (P3)**: Minor quality issues with workarounds available +- **Low (P3)**: Minor quality issues with workarounds available #### Step 2: Configure Automated Detection Rules @@ -208,7 +208,7 @@ Ensure stakeholders receive appropriate information at the right time: **Incident Status Update Template**: ``` -🚨 INCIDENT UPDATE - [Incident ID] - [Time] +INCIDENT UPDATE - [Incident ID] - [Time] STATUS: [Investigating/Mitigating/Resolved] IMPACT: [Brief business impact description] diff --git a/docs/learn-datahub/quality/overview.md b/docs/learn-datahub/quality/overview.md index 7ac0cb1662040a..c92ea76100a40b 100644 --- a/docs/learn-datahub/quality/overview.md +++ b/docs/learn-datahub/quality/overview.md @@ -142,11 +142,69 @@ DataHub provides comprehensive quality management through: ### Quality Management Maturity Levels -**Level 1 - Reactive**: Manual quality checks, issue discovery after impact -**Level 2 - Proactive**: Automated basic checks, regular quality monitoring -**Level 3 - Predictive**: Advanced analytics, quality trend prediction -**Level 4 - Preventive**: Quality-by-design, automated remediation -**Level 5 - Optimizing**: Continuous quality improvement, ML-driven optimization + ### Ready to Begin? diff --git a/docs/learn-datahub/quality/quality-automation.md b/docs/learn-datahub/quality/quality-automation.md index 1b53f012c4107c..94b7b06e94ab69 100644 --- a/docs/learn-datahub/quality/quality-automation.md +++ b/docs/learn-datahub/quality/quality-automation.md @@ -33,8 +33,8 @@ Quality automation shifts from reactive incident response to proactive issue pre **Automation Layers**: -- **🔄 Pipeline Integration**: Quality checks embedded in data processing workflows -- **🚪 Quality Gates**: Automated approval/rejection of data based on quality criteria +- **Pipeline Integration**: Quality checks embedded in data processing workflows +- **Quality Gates**: Automated approval/rejection of data based on quality criteria - **Self-Healing**: Automatic remediation of common quality issues - **Continuous Improvement**: ML-driven optimization of quality processes - **Preventive Monitoring**: Early detection of quality degradation patterns diff --git a/docs/learn-datahub/quality/quality-monitoring.md b/docs/learn-datahub/quality/quality-monitoring.md index 1f8a9e43a47411..d0db2aaccd82b5 100644 --- a/docs/learn-datahub/quality/quality-monitoring.md +++ b/docs/learn-datahub/quality/quality-monitoring.md @@ -281,7 +281,7 @@ Financial Data: 89.2% (Warning - investigating payment delays) Product Data: 95.8% (Good) Marketing Data: 94.5% (Good) -🚨 ATTENTION REQUIRED +ATTENTION REQUIRED 1. Payment processing latency (Financial) - ETA: 2PM 2. 
Customer email validation (CRM) - In progress diff --git a/docs/learn-datahub/quickstart/first-ingestion.md b/docs/learn-datahub/quickstart/first-ingestion.md index 254b87a0fa658c..95380e163e4912 100644 --- a/docs/learn-datahub/quickstart/first-ingestion.md +++ b/docs/learn-datahub/quickstart/first-ingestion.md @@ -270,6 +270,50 @@ No errors encountered **Pro Tip**: Notice how DataHub automatically organized everything by platform? This is how you'll navigate complex data ecosystems in real companies. +**Your Ingested Enterprise Data Assets:** + +
+ + + + + +
+ ### 3. Your First Dataset Deep-Dive: Exploring User Metrics Data **Time to investigate!** Let's look at the user metrics data. Click on `fct_users_created` (you'll find it under the Hive platform). diff --git a/docs/learn-datahub/quickstart/first-lineage.md b/docs/learn-datahub/quickstart/first-lineage.md index 52b3fa88a8903e..72f878b738edce 100644 --- a/docs/learn-datahub/quickstart/first-lineage.md +++ b/docs/learn-datahub/quickstart/first-lineage.md @@ -125,56 +125,299 @@ Let's understand the visual elements: **Question**: "I need to update the customer table schema. What will be affected?" -**Steps**: - -1. Navigate to the `customers` table -2. Click the Lineage tab -3. Look at **downstream dependencies** (right side) -4. Identify all affected: + + +**Steps to Analyze Impact**: + +1. **Navigate to the `customers` table** in DataHub +2. **Click the Lineage tab** to see the full dependency graph +3. **Look at downstream dependencies** (right side of the lineage view) +4. **Identify all affected systems**: - Analytics tables that read from customers - Dashboards that display customer data - ML models that use customer features - Reports that include customer metrics -**What you'll see**: - -``` -customers → customer_analytics → customer_dashboard -customers → ml_features → churn_model → recommendation_api -customers → daily_report_job → executive_dashboard -``` +**Impact Assessment**: Any schema change to the `customers` table will potentially affect 8 downstream systems, requiring coordinated updates and testing. ### Scenario 2: Root Cause Analysis **Question**: "The customer dashboard shows wrong numbers. Where's the problem?" -**Steps**: - -1. Start at the `customer_dashboard` -2. Trace **upstream dependencies** (left side) -3. Check each step in the pipeline: - - Is the source data fresh? - - Did any ETL jobs fail? - - Are transformations working correctly? - -**Debugging path**: - -``` -customer_dashboard ← customer_metrics ← etl_job ← raw_customers - ↑ - Check here first! -``` + + +**Debugging Steps**: + +1. **Start at the `customer_dashboard`** (the problem location) +2. **Trace upstream dependencies** (left side of lineage view) +3. **Check each step systematically**: + - **ETL Job**: Did it run successfully? Check logs for failures + - **Customer Metrics**: Is the data fresh? Look at last update timestamp + - **Raw Customers**: Is source data being updated correctly? + +**Root Cause Investigation Priority**: + +1. **Check ETL Job first** - Most common failure point +2. **Verify data freshness** - Look for stale or missing data +3. **Validate transformations** - Ensure business logic is correct +4. **Confirm source data quality** - Check for upstream issues + +**Common Issues Found**: + +- ETL job failed silently due to schema changes +- Data pipeline running but processing stale data +- Transformation logic changed without proper testing +- Source system connectivity problems ### Scenario 3: Data Governance **Question**: "This table contains PII. Where does this sensitive data flow?" -**Steps**: - -1. Find the table with PII (e.g., `customer_profiles`) -2. Examine **all downstream paths** -3. Identify systems that receive sensitive data -4. Verify proper access controls and compliance + + +**Governance Investigation Steps**: + +1. **Find the PII source** (e.g., `customer_profiles` table) +2. **Examine all downstream paths** using DataHub lineage +3. 
**Identify systems receiving sensitive data**: + + - CRM systems (legitimate business use) + - Marketing platforms (verify consent) + - Analytics systems (should be anonymized) + - Third-party integrations (compliance risk) + +4. **Verify proper controls**: + - Access permissions and role-based security + - Data anonymization where required + - Consent management for marketing use + - Audit trails for compliance reporting + +**Compliance Checklist**: + +- ✅ **Anonymized Analytics**: PII removed, GDPR compliant +- ✅ **CRM System**: Legitimate business purpose, access controlled +- ⚠️ **Marketing Campaigns**: Verify consent and opt-in status +- ✅ **Compliance Audit**: Full access tracking enabled + +**Action Items**: Review marketing system access to ensure proper consent management and consider additional anonymization. ## Column-Level Lineage diff --git a/docs/learn-datahub/quickstart/overview.md b/docs/learn-datahub/quickstart/overview.md index 71d3f7da8e9d07..ea2fe189bac6f8 100644 --- a/docs/learn-datahub/quickstart/overview.md +++ b/docs/learn-datahub/quickstart/overview.md @@ -1,6 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import TutorialProgress from '@site/src/components/TutorialProgress'; +import DataHubLineageNode, { DataHubLineageFlow } from '@site/src/components/DataHubLineageNode'; # Chapter 1: DataHub Foundation (30 minutes) @@ -89,15 +90,49 @@ Before starting, ensure you have: DataHub acts as the central metadata hub connecting your entire data ecosystem: -``` -┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ -│ Source Data │ │ DataHub Core │ │ User Interface │ -│ │ │ │ │ │ -│ • Cloud DBs │───▶│ • Metadata API │───▶│ • Web App │ -│ • Warehouses │ │ • Graph Store │ │ • CLI Tools │ -│ • Streaming │ │ • Search Index │ │ • Dashboards │ -└─────────────────┘ └─────────────────┘ └─────────────────┘ -``` + **Key Integration Points**: diff --git a/docs/learn-datahub/quickstart/setup.md b/docs/learn-datahub/quickstart/setup.md index 9ad10cebbeebeb..56ec3380aa0d87 100644 --- a/docs/learn-datahub/quickstart/setup.md +++ b/docs/learn-datahub/quickstart/setup.md @@ -1,6 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import TutorialProgress from '@site/src/components/TutorialProgress'; +import NextStepButton from '@site/src/components/NextStepButton'; import OSDetectionTabs from '@site/src/components/OSDetectionTabs'; # Step 1: Setup DataHub (5 minutes) @@ -449,8 +450,6 @@ datahub docker quickstart --port 9003 - DataHub's core architecture components - How to verify a successful deployment -import NextStepButton from '@site/src/components/NextStepButton'; - Date: Thu, 30 Oct 2025 18:27:54 +0000 Subject: [PATCH 03/10] third commit --- docs-website/README.md | 2 +- docs-website/docusaurus.config.js | 63 +- docs-website/generateDocsDir.ts | 60 +- docs-website/scripts/sync-datahub-styles.js | 231 +++-- docs-website/sidebars.js | 83 +- .../components/ArchitectureDiagram/index.jsx | 14 +- .../ArchitectureDiagram/styles.module.css | 25 +- .../components/CardDropdown/CardDropdown.tsx | 69 +- .../CardDropdown/styles.module.scss | 51 +- .../components/DataHubEntityCard/index.jsx | 237 ++--- .../DataHubEntityCard/styles.module.css | 418 +++++---- .../components/DataHubLineageNode/index.jsx | 883 ++++++++++-------- .../DataHubLineageNode/styles.module.css | 435 +++++---- .../components/FeatureAvailability/index.js | 17 +- docs-website/src/components/Feedback/index.js | 22 +- .../components/InteractiveDiagram/index.jsx | 2 +- 
.../InteractiveDiagram/styles.module.css | 12 +- .../components/LineageLayoutGrid/index.jsx | 412 +++++--- .../LineageLayoutGrid/styles.module.css | 85 +- .../src/components/NextStepButton/index.jsx | 30 +- .../NextStepButton/styles.module.css | 4 +- .../src/components/OSDetectionTabs/index.jsx | 91 +- .../src/components/Pills/GlossaryTermPill.jsx | 35 + docs-website/src/components/Pills/TagPill.jsx | 25 + .../src/components/Pills/styles.module.css | 63 ++ .../src/components/ProcessFlow/index.jsx | 121 ++- .../components/ProcessFlow/styles.module.css | 41 +- docs-website/src/components/SlackUtm/index.js | 15 +- .../components/SlackUtm/styles.module.scss | 4 +- .../SolutionsDropdown/SolutionsDropdown.tsx | 69 +- .../SolutionsDropdownContent/index.js | 29 +- .../solutionsDropdownContent.js | 72 +- .../styles.module.scss | 232 ++--- .../SolutionsDropdown/styles.module.css | 6 +- .../src/components/StepCompletion/index.jsx | 26 +- .../StepCompletion/styles.module.css | 10 +- .../src/components/TutorialExercise/index.jsx | 72 +- .../TutorialExercise/styles.module.css | 50 +- .../src/components/TutorialProgress/index.jsx | 99 +- .../TutorialProgress/styles.module.css | 36 +- docs-website/src/css/custom.css | 2 +- docs-website/src/css/mermaid-custom.css | 8 +- .../learn/_components/LearnItemCard/index.jsx | 9 +- .../LearnItemCard/styles.module.scss | 2 +- .../learn/_components/LearnListPage/index.jsx | 43 +- docs-website/src/learn/business-glossary.md | 42 +- docs-website/src/learn/business-metric.md | 7 +- docs-website/src/learn/data-freshness.md | 26 +- docs-website/src/learn/data-mesh.md | 36 +- docs-website/src/learn/data-pipeline.md | 7 +- docs-website/src/pages/champions.js | 2 +- .../src/pages/datahub-components-demo.md | 5 + .../FeatureCard/featurecard.module.scss | 1 - .../docs/_components/FeatureCard/index.jsx | 4 +- .../featurecardsection.module.scss | 2 - .../_components/FeatureCardSection/index.jsx | 67 +- .../docs/_components/FilterBar/index.jsx | 4 +- .../docs/_components/FilterPage/index.jsx | 4 +- .../docs/_components/QuickstartCTA/index.jsx | 18 +- .../QuickstartCTA/quickstartcta.module.scss | 7 +- .../docs/_components/SearchBar/index.jsx | 303 +++--- docs-website/src/pages/docs/index.js | 6 +- docs-website/src/pages/integrations.jsx | 2 +- docs-website/src/styles/config-table.scss | 88 +- docs-website/src/styles/global.scss | 87 +- docs-website/src/styles/sphinx.scss | 50 +- .../src/theme/DocItem/Footer/index.js | 35 +- .../DocsVersionDropdownNavbarItem.js | 29 +- docs-website/src/theme/Root.js | 4 +- .../discovery/advanced-search.md | 24 +- .../discovery/collaborative-discovery.md | 25 +- .../discovery/dataset-profiles.md | 25 +- docs/learn-datahub/discovery/overview.md | 44 +- .../governance/business-glossary.md | 15 +- .../governance/data-classification.md | 17 +- .../governance/governance-policies.md | 17 +- docs/learn-datahub/governance/overview.md | 13 +- .../governance/ownership-management.md | 15 +- docs/learn-datahub/ingestion/overview.md | 16 +- docs/learn-datahub/lineage/impact-analysis.md | 4 - docs/learn-datahub/lineage/overview.md | 21 +- docs/learn-datahub/lineage/reading-lineage.md | 237 +++-- docs/learn-datahub/lineage/troubleshooting.md | 78 +- .../privacy/compliance-workflows.md | 55 ++ docs/learn-datahub/privacy/overview.md | 84 +- docs/learn-datahub/privacy/pii-detection.md | 89 ++ .../learn-datahub/privacy/privacy-controls.md | 83 ++ docs/learn-datahub/quality/data-assertions.md | 2 +- .../quality/incident-management.md | 2 +- 
docs/learn-datahub/quality/overview.md | 13 +- .../quality/quality-monitoring.md | 2 +- .../quickstart/discovery-basics.md | 53 +- .../quickstart/first-ingestion.md | 10 +- .../learn-datahub/quickstart/first-lineage.md | 10 +- docs/learn-datahub/quickstart/overview.md | 45 +- docs/learn-datahub/quickstart/setup.md | 10 +- 96 files changed, 3752 insertions(+), 2408 deletions(-) create mode 100644 docs-website/src/components/Pills/GlossaryTermPill.jsx create mode 100644 docs-website/src/components/Pills/TagPill.jsx create mode 100644 docs-website/src/components/Pills/styles.module.css create mode 100644 docs/learn-datahub/privacy/compliance-workflows.md create mode 100644 docs/learn-datahub/privacy/pii-detection.md create mode 100644 docs/learn-datahub/privacy/privacy-controls.md diff --git a/docs-website/README.md b/docs-website/README.md index d0129131be26de..6b5966637633a3 100644 --- a/docs-website/README.md +++ b/docs-website/README.md @@ -177,7 +177,7 @@ You can format all files by running: or by allowing pre-commit hooks to run when you commit your changes. -Warning: When using [admonitions](https://docusaurus.io/docs/markdown-features/admonitions#usage-with-prettier) (e.g. `:::note`), +Warning: When using [admonitions](https://docusaurus.io/docs/markdown-features/admonitions#usage-with-prettier) (e.g. `:::note`), you may need to add newlines around the inner text to avoid formatting issues. See the link for details. ## Docs site generation process diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 0fd79f39701e29..d69fe1788e0a77 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -12,14 +12,17 @@ module.exports = { organizationName: "datahub-project", // Usually your GitHub org/user name. projectName: "datahub", // Usually your repo name. staticDirectories: ["static"], - stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap"], + stylesheets: [ + "https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap", + ], headTags: [ { - tagName: 'meta', + tagName: "meta", attributes: { - httpEquiv: 'Content-Security-Policy', - content: "frame-ancestors 'self' https://*.acryl.io https://acryldata.io http://localhost:*" - } + httpEquiv: "Content-Security-Policy", + content: + "frame-ancestors 'self' https://*.acryl.io https://acryldata.io http://localhost:*", + }, }, ], scripts: [ @@ -34,11 +37,11 @@ module.exports = { defer: true, }, { - src: "https://app.revenuehero.io/scheduler.min.js" + src: "https://app.revenuehero.io/scheduler.min.js", }, { src: "https://tag.clearbitscripts.com/v1/pk_2e321cabe30432a5c44c0424781aa35f/tags.js", - referrerPolicy: "strict-origin-when-cross-origin" + referrerPolicy: "strict-origin-when-cross-origin", }, { src: "/scripts/reo.js", @@ -54,7 +57,8 @@ module.exports = { "runllm-keyboard-shortcut": "Mod+j", "runllm-preset": "docusaurus", "runllm-theme-color": "#1890FF", - "runllm-brand-logo": "https://docs.datahub.com/img/datahub-logo-color-mark.svg", + "runllm-brand-logo": + "https://docs.datahub.com/img/datahub-logo-color-mark.svg", "runllm-community-url": "https://datahub.com/slack", "runllm-community-type": "slack", "runllm-disable-ask-a-person": "true", @@ -107,7 +111,7 @@ module.exports = { }, colorMode: { // Only support light mode. 
- defaultMode: 'light', + defaultMode: "light", disableSwitch: true, respectPrefersColorScheme: false, }, @@ -138,12 +142,13 @@ module.exports = { dropdownActiveClassDisabled: true, dropdownItemsAfter: [ { - type: 'html', + type: "html", value: '', }, { - type: 'html', - value: '', + type: "html", + value: + '', }, { value: ` @@ -339,7 +344,7 @@ module.exports = { versions: { current: { label: "Next", - banner: 'none', + banner: "none", }, }, path: "genDocs", @@ -355,7 +360,8 @@ module.exports = { blog: { blogTitle: "DataHub Learn", blogSidebarTitle: "DataHub Learn", - blogDescription: "Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.", + blogDescription: + "Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.", path: "src/learn", routeBasePath: "learn", postsPerPage: "ALL", @@ -363,7 +369,9 @@ module.exports = { }, theme: { customCss: [ - isSaas ? require.resolve("./src/styles/acryl.scss") : require.resolve("./src/styles/datahub.scss"), + isSaas + ? require.resolve("./src/styles/acryl.scss") + : require.resolve("./src/styles/datahub.scss"), require.resolve("./src/styles/global.scss"), require.resolve("./src/styles/sphinx.scss"), require.resolve("./src/styles/config-table.scss"), @@ -374,7 +382,7 @@ module.exports = { mdxPageComponent: "@theme/MDXPage", }, googleTagManager: { - containerId: 'GTM-5M8T9HNN', + containerId: "GTM-5M8T9HNN", }, gtag: { trackingID: "G-PKGVLETT4C", @@ -384,29 +392,32 @@ module.exports = { ], plugins: [ [ - '@docusaurus/plugin-client-redirects', + "@docusaurus/plugin-client-redirects", { createRedirects(existingPath) { - if (existingPath.includes('/docs')) { + if (existingPath.includes("/docs")) { return [ - existingPath.replace('/docs', '/docs/next'), - existingPath.replace('/docs', '/docs/0.13.0'), - existingPath.replace('/docs', '/docs/0.12.1'), - existingPath.replace('/docs', '/docs/0.11.0'), - existingPath.replace('/docs', '/docs/0.10.5'), + existingPath.replace("/docs", "/docs/next"), + existingPath.replace("/docs", "/docs/0.13.0"), + existingPath.replace("/docs", "/docs/0.12.1"), + existingPath.replace("/docs", "/docs/0.11.0"), + existingPath.replace("/docs", "/docs/0.10.5"), ]; } return undefined; // Return a falsy value: no redirect created }, redirects: [ { - from: '/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor', - to: '/docs/managed-datahub/remote-executor/about', + from: "/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor", + to: "/docs/managed-datahub/remote-executor/about", }, ], }, ], - ["@docusaurus/plugin-ideal-image", { quality: 100, sizes: [320, 640, 1280, 1440, 1600] }], + [ + "@docusaurus/plugin-ideal-image", + { quality: 100, sizes: [320, 640, 1280, 1440, 1600] }, + ], "docusaurus-plugin-sass", [ "docusaurus-graphql-plugin", diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index b695ff60f66501..a08bdc0715eee6 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -65,7 +65,7 @@ function list_markdown_files(): string[] { .trim() .split("\n"); let all_generated_markdown_files = execSync( - "cd .. && ls docs/generated/**/**/*.md && ls docs/generated/**/*.md" + "cd .. && ls docs/generated/**/**/*.md && ls docs/generated/**/*.md", ) .toString() .trim() @@ -76,7 +76,7 @@ function list_markdown_files(): string[] { if (!process.env.CI) { // If not in CI, we also include "untracked" files. 
const untracked_files = execSync( - "(git ls-files --full-name --others --exclude-standard .. | grep '.md$') || true" + "(git ls-files --full-name --others --exclude-standard .. | grep '.md$') || true", ) .toString() .trim() @@ -85,14 +85,14 @@ function list_markdown_files(): string[] { if (untracked_files.length > 0) { console.log( - `Including untracked files in docs list: [${untracked_files}]` + `Including untracked files in docs list: [${untracked_files}]`, ); all_markdown_files = [...all_markdown_files, ...untracked_files]; } // But we should also exclude any files that have been deleted. const deleted_files = execSync( - "(git ls-files --full-name --deleted --exclude-standard .. | grep '.md$') || true" + "(git ls-files --full-name --deleted --exclude-standard .. | grep '.md$') || true", ) .toString() .trim() @@ -101,7 +101,7 @@ function list_markdown_files(): string[] { if (deleted_files.length > 0) { console.log(`Removing deleted files from docs list: [${deleted_files}]`); all_markdown_files = all_markdown_files.filter( - (filepath) => !deleted_files.includes(filepath) + (filepath) => !deleted_files.includes(filepath), ); } } @@ -204,7 +204,7 @@ const allowed_broken_links = [ function markdown_guess_title( contents: matter.GrayMatterFile, - filepath: string + filepath: string, ): void { if (sidebarsjs_hardcoded_titles.includes(filepath)) { return; @@ -231,7 +231,7 @@ function markdown_guess_title( if (!headers) { throw new Error( - `${filepath} must have at least one h1 header for setting the title` + `${filepath} must have at least one h1 header for setting the title`, ); } @@ -257,7 +257,7 @@ function markdown_guess_title( function markdown_add_edit_url( contents: matter.GrayMatterFile, - filepath: string + filepath: string, ): void { const editUrl = `${GITHUB_EDIT_URL}/${filepath}`; contents.data.custom_edit_url = editUrl; @@ -265,7 +265,7 @@ function markdown_add_edit_url( function markdown_add_slug( contents: matter.GrayMatterFile, - filepath: string + filepath: string, ): void { if (contents.data.slug) { return; @@ -352,7 +352,7 @@ function new_url(original: string, filepath: string): string { // Detects when the path is a dangling reference, according to the locally // checked out repo. throw new Error( - `broken github repo link to ${updated_path} in ${filepath}` + `broken github repo link to ${updated_path} in ${filepath}`, ); } const updated_url = `${GITHUB_BROWSE_URL}/${updated_path}`; @@ -366,7 +366,7 @@ function new_url(original: string, filepath: string): string { const up_levels = (filepath.match(/\//g) ?? []).length; const relation = path.dirname(filepath); const updated = path.normalize( - `${"../".repeat(up_levels + 2)}/${relation}/${original}` + `${"../".repeat(up_levels + 2)}/${relation}/${original}`, ); //console.log(`Rewriting ${original} ${filepath} as ${updated}`); return updated; @@ -377,7 +377,7 @@ function new_url(original: string, filepath: string): string { function markdown_rewrite_urls( contents: matter.GrayMatterFile, - filepath: string + filepath: string, ): void { const new_content = contents.content .replace( @@ -389,7 +389,7 @@ function markdown_rewrite_urls( (_, text, url) => { const updated = new_url(preprocess_url(url), filepath); return `[${text}](${updated})`; - } + }, ) .replace( // Also look for the [text]: url syntax. 
@@ -397,14 +397,14 @@ function markdown_rewrite_urls( (_, text, url) => { const updated = new_url(preprocess_url(url), filepath); return `[${text}]: ${updated}`; - } + }, ); contents.content = new_content; } function markdown_enable_specials( contents: matter.GrayMatterFile, - filepath: string + filepath: string, ): void { const new_content = contents.content .replace(/^`; } - } + }, ); contents.content = new_content; } @@ -501,13 +501,13 @@ function markdown_sanitize_and_linkify(content: string): string { // Link to issues/pull requests. content = content.replace( /#(\d+)\b/g, - "[#$1](https://github.com/datahub-project/datahub/pull/$1)" + "[#$1](https://github.com/datahub-project/datahub/pull/$1)", ); // Prettify bare links to PRs. content = content.replace( /(\s+)(https:\/\/github\.com\/linkedin\/datahub\/pull\/(\d+))(\s+|$)/g, - "$1[#$3]($2)$4" + "$1[#$3]($2)$4", ); return content; @@ -541,12 +541,12 @@ custom_edit_url: https://github.com/datahub-project/datahub/blob/master/docs-web repo: "datahub", }); const releases_list = releases_list_full.data.filter( - (release) => !release.prerelease && !release.draft + (release) => !release.prerelease && !release.draft, ); // We only embed release notes for releases in the last 3 months. const release_notes_date_cutoff = new Date( - Date.now() - 1000 * 60 * 60 * 24 * 30 * 3 + Date.now() - 1000 * 60 * 60 * 24 * 30 * 3, ); // Construct a summary table. @@ -557,7 +557,7 @@ custom_edit_url: https://github.com/datahub-project/datahub/blob/master/docs-web const release_date = new Date(Date.parse(release.created_at)); let row = `| **${release.tag_name}** | ${pretty_format_date( - release.created_at + release.created_at, )} |`; if (release_date > release_notes_date_cutoff) { row += `[Release Notes](#${make_link_anchor(release.tag_name)}), `; @@ -583,11 +583,11 @@ custom_edit_url: https://github.com/datahub-project/datahub/blob/master/docs-web const heading_regex = /^(#+)\s/gm; const max_heading_level = Math.min( 3, - ...[...body.matchAll(heading_regex)].map((v) => v[1].length) + ...[...body.matchAll(heading_regex)].map((v) => v[1].length), ); body = body.replace( heading_regex, - `${"#".repeat(3 - max_heading_level)}$1 ` + `${"#".repeat(3 - max_heading_level)}$1 `, ); } else { // Link to GitHub. @@ -612,7 +612,7 @@ ${body}\n\n`; function write_markdown_file( contents: matter.GrayMatterFile, - output_filepath: string + output_filepath: string, ): void { const pathname = path.dirname(output_filepath); fs.mkdirSync(pathname, { recursive: true }); @@ -675,7 +675,7 @@ function write_markdown_file( } if (!accounted_for_in_sidebar(filepath)) { console.warn( - `File not accounted for in sidebar: ${filepath} - consider adding it to docs-website/sidebars.js or explicitly ignoring it` + `File not accounted for in sidebar: ${filepath} - consider adding it to docs-website/sidebars.js or explicitly ignoring it`, ); } } diff --git a/docs-website/scripts/sync-datahub-styles.js b/docs-website/scripts/sync-datahub-styles.js index dd5291a6bf0070..87982cb6de52d4 100644 --- a/docs-website/scripts/sync-datahub-styles.js +++ b/docs-website/scripts/sync-datahub-styles.js @@ -2,57 +2,62 @@ /** * Sync DataHub Styles Script - * + * * This script automatically extracts design tokens from the DataHub web-react * codebase and updates the tutorial component styles to match the actual UI. 
- * + * * Usage: node scripts/sync-datahub-styles.js */ -const fs = require('fs'); -const path = require('path'); +const fs = require("fs"); +const path = require("path"); // Paths -const DATAHUB_COLORS_PATH = '../../datahub-web-react/src/alchemy-components/theme/foundations/colors.ts'; -const DATAHUB_SEMANTIC_TOKENS_PATH = '../../datahub-web-react/src/alchemy-components/theme/semantic-tokens.ts'; -const DOCS_COMPONENTS_DIR = './src/components'; +const DATAHUB_COLORS_PATH = + "../../datahub-web-react/src/alchemy-components/theme/foundations/colors.ts"; +const DATAHUB_SEMANTIC_TOKENS_PATH = + "../../datahub-web-react/src/alchemy-components/theme/semantic-tokens.ts"; +const DOCS_COMPONENTS_DIR = "./src/components"; /** * Extract color values from DataHub's colors.ts file */ function extractDataHubColors() { try { - const colorsFile = fs.readFileSync(path.resolve(__dirname, DATAHUB_COLORS_PATH), 'utf8'); - + const colorsFile = fs.readFileSync( + path.resolve(__dirname, DATAHUB_COLORS_PATH), + "utf8", + ); + // Extract color definitions using regex const colorMatches = colorsFile.match(/(\w+):\s*{([^}]+)}/g) || []; const singleColorMatches = colorsFile.match(/(\w+):\s*'([^']+)'/g) || []; - + const colors = {}; - + // Parse nested color objects (e.g., gray: { 100: '#EBECF0', ... }) - colorMatches.forEach(match => { + colorMatches.forEach((match) => { const [, colorName, colorValues] = match.match(/(\w+):\s*{([^}]+)}/); const values = {}; - + const valueMatches = colorValues.match(/(\d+):\s*'([^']+)'/g) || []; - valueMatches.forEach(valueMatch => { + valueMatches.forEach((valueMatch) => { const [, key, value] = valueMatch.match(/(\d+):\s*'([^']+)'/); values[key] = value; }); - + colors[colorName] = values; }); - + // Parse single color values (e.g., white: '#FFFFFF') - singleColorMatches.forEach(match => { + singleColorMatches.forEach((match) => { const [, colorName, colorValue] = match.match(/(\w+):\s*'([^']+)'/); colors[colorName] = colorValue; }); - + return colors; } catch (error) { - console.warn('Could not read DataHub colors file:', error.message); + console.warn("Could not read DataHub colors file:", error.message); return null; } } @@ -62,20 +67,26 @@ function extractDataHubColors() { */ function extractSemanticTokens() { try { - const semanticFile = fs.readFileSync(path.resolve(__dirname, DATAHUB_SEMANTIC_TOKENS_PATH), 'utf8'); - + const semanticFile = fs.readFileSync( + path.resolve(__dirname, DATAHUB_SEMANTIC_TOKENS_PATH), + "utf8", + ); + // Extract semantic token mappings - const tokenMatches = semanticFile.match(/'([^']+)':\s*colors\.([^,\s]+)/g) || []; + const tokenMatches = + semanticFile.match(/'([^']+)':\s*colors\.([^,\s]+)/g) || []; const tokens = {}; - - tokenMatches.forEach(match => { - const [, tokenName, colorPath] = match.match(/'([^']+)':\s*colors\.([^,\s]+)/); + + tokenMatches.forEach((match) => { + const [, tokenName, colorPath] = match.match( + /'([^']+)':\s*colors\.([^,\s]+)/, + ); tokens[tokenName] = colorPath; }); - + return tokens; } catch (error) { - console.warn('Could not read DataHub semantic tokens file:', error.message); + console.warn("Could not read DataHub semantic tokens file:", error.message); return null; } } @@ -84,33 +95,37 @@ function extractSemanticTokens() { * Generate CSS variables from DataHub colors */ function generateCSSVariables(colors, semanticTokens) { - if (!colors) return ''; - + if (!colors) return ""; + let cssVars = `/* Auto-generated DataHub Design Tokens */\n:root {\n`; - + // Core color mappings based on DataHub's actual usage 
const colorMappings = { - 'datahub-primary': colors.primary?.[500] || colors.violet?.[500] || '#533FD1', - 'datahub-primary-dark': colors.primary?.[600] || colors.violet?.[600] || '#4C39BE', - 'datahub-primary-light': colors.primary?.[400] || colors.violet?.[400] || '#7565DA', - 'datahub-primary-lightest': colors.primary?.[0] || colors.violet?.[0] || '#F1F3FD', - 'datahub-gray-100': colors.gray?.[100] || '#EBECF0', - 'datahub-gray-600': colors.gray?.[600] || '#374066', - 'datahub-gray-1700': colors.gray?.[1700] || '#5F6685', - 'datahub-gray-1800': colors.gray?.[1800] || '#8088A3', - 'datahub-gray-1500': colors.gray?.[1500] || '#F9FAFC', - 'datahub-white': colors.white || '#FFFFFF', - 'datahub-success': colors.green?.[500] || '#77B750', - 'datahub-warning': colors.yellow?.[500] || '#EEAE09', - 'datahub-error': colors.red?.[500] || '#CD0D24', - 'datahub-border': colors.gray?.[1400] || '#E9EAEE', + "datahub-primary": + colors.primary?.[500] || colors.violet?.[500] || "#533FD1", + "datahub-primary-dark": + colors.primary?.[600] || colors.violet?.[600] || "#4C39BE", + "datahub-primary-light": + colors.primary?.[400] || colors.violet?.[400] || "#7565DA", + "datahub-primary-lightest": + colors.primary?.[0] || colors.violet?.[0] || "#F1F3FD", + "datahub-gray-100": colors.gray?.[100] || "#EBECF0", + "datahub-gray-600": colors.gray?.[600] || "#374066", + "datahub-gray-1700": colors.gray?.[1700] || "#5F6685", + "datahub-gray-1800": colors.gray?.[1800] || "#8088A3", + "datahub-gray-1500": colors.gray?.[1500] || "#F9FAFC", + "datahub-white": colors.white || "#FFFFFF", + "datahub-success": colors.green?.[500] || "#77B750", + "datahub-warning": colors.yellow?.[500] || "#EEAE09", + "datahub-error": colors.red?.[500] || "#CD0D24", + "datahub-border": colors.gray?.[1400] || "#E9EAEE", }; - + // Add CSS variables Object.entries(colorMappings).forEach(([varName, value]) => { cssVars += ` --${varName}: ${value};\n`; }); - + // Add shadows and other design tokens cssVars += ` --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07);\n`; cssVars += ` --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15);\n`; @@ -118,29 +133,33 @@ function generateCSSVariables(colors, semanticTokens) { cssVars += ` --datahub-node-height: 90px;\n`; cssVars += ` --datahub-transformation-size: 40px;\n`; cssVars += `}\n\n`; - + // Dark mode variables cssVars += `/* Dark mode colors */\n[data-theme='dark'] {\n`; const darkMappings = { - 'datahub-primary': colors.primary?.[400] || colors.violet?.[400] || '#7565DA', - 'datahub-primary-dark': colors.primary?.[500] || colors.violet?.[500] || '#533FD1', - 'datahub-primary-light': colors.primary?.[300] || colors.violet?.[300] || '#8C7EE0', - 'datahub-primary-lightest': colors.primary?.[800] || colors.violet?.[800] || '#2E2373', - 'datahub-gray-100': colors.gray?.[700] || '#2F3657', - 'datahub-gray-600': colors.gray?.[200] || '#CFD1DA', - 'datahub-gray-1700': colors.gray?.[300] || '#A9ADBD', - 'datahub-gray-1800': colors.gray?.[400] || '#81879F', - 'datahub-gray-1500': colors.gray?.[2000] || '#1E2338', - 'datahub-white': colors.gray?.[800] || '#272D48', - 'datahub-border': colors.gray?.[600] || '#374066', + "datahub-primary": + colors.primary?.[400] || colors.violet?.[400] || "#7565DA", + "datahub-primary-dark": + colors.primary?.[500] || colors.violet?.[500] || "#533FD1", + "datahub-primary-light": + colors.primary?.[300] || colors.violet?.[300] || "#8C7EE0", + "datahub-primary-lightest": + colors.primary?.[800] || colors.violet?.[800] || "#2E2373", + "datahub-gray-100": 
colors.gray?.[700] || "#2F3657", + "datahub-gray-600": colors.gray?.[200] || "#CFD1DA", + "datahub-gray-1700": colors.gray?.[300] || "#A9ADBD", + "datahub-gray-1800": colors.gray?.[400] || "#81879F", + "datahub-gray-1500": colors.gray?.[2000] || "#1E2338", + "datahub-white": colors.gray?.[800] || "#272D48", + "datahub-border": colors.gray?.[600] || "#374066", }; - + Object.entries(darkMappings).forEach(([varName, value]) => { cssVars += ` --${varName}: ${value};\n`; }); - + cssVars += `}\n\n`; - + return cssVars; } @@ -148,27 +167,31 @@ function generateCSSVariables(colors, semanticTokens) { * Update component CSS files with new design tokens */ function updateComponentStyles(cssVariables) { - const componentDirs = ['DataHubEntityCard', 'DataHubLineageNode']; - - componentDirs.forEach(componentDir => { - const styleFile = path.join(DOCS_COMPONENTS_DIR, componentDir, 'styles.module.css'); - + const componentDirs = ["DataHubEntityCard", "DataHubLineageNode"]; + + componentDirs.forEach((componentDir) => { + const styleFile = path.join( + DOCS_COMPONENTS_DIR, + componentDir, + "styles.module.css", + ); + try { - let content = fs.readFileSync(styleFile, 'utf8'); - + let content = fs.readFileSync(styleFile, "utf8"); + // Replace the CSS variables section - const variableRegex = /\/\* Auto-generated DataHub Design Tokens \*\/[\s\S]*?}\s*\n\s*\n/; - + const variableRegex = + /\/\* Auto-generated DataHub Design Tokens \*\/[\s\S]*?}\s*\n\s*\n/; + if (variableRegex.test(content)) { content = content.replace(variableRegex, cssVariables); } else { // If no existing variables section, add at the top content = cssVariables + content; } - + fs.writeFileSync(styleFile, content); console.log(`✅ Updated ${componentDir} styles`); - } catch (error) { console.error(`❌ Failed to update ${componentDir}:`, error.message); } @@ -179,41 +202,55 @@ function updateComponentStyles(cssVariables) { * Main execution */ function main() { - console.log('🔄 Syncing DataHub styles...\n'); - + console.log("🔄 Syncing DataHub styles...\n"); + const colors = extractDataHubColors(); const semanticTokens = extractSemanticTokens(); - + if (!colors) { - console.warn('⚠️ Could not extract DataHub colors from source files.'); - console.log(' Using fallback design tokens to ensure build continues...\n'); - + console.warn("⚠️ Could not extract DataHub colors from source files."); + console.log( + " Using fallback design tokens to ensure build continues...\n", + ); + // Use fallback colors to ensure build doesn't fail const fallbackColors = { - primary: { 500: '#533FD1', 600: '#4C39BE', 400: '#7565DA', 0: '#F1F3FD' }, - gray: { 100: '#EBECF0', 600: '#374066', 1700: '#5F6685', 1800: '#8088A3', 1500: '#F9FAFC' }, - white: '#FFFFFF', - green: { 500: '#77B750' }, - yellow: { 500: '#EEAE09' }, - red: { 500: '#CD0D24' } + primary: { 500: "#533FD1", 600: "#4C39BE", 400: "#7565DA", 0: "#F1F3FD" }, + gray: { + 100: "#EBECF0", + 600: "#374066", + 1700: "#5F6685", + 1800: "#8088A3", + 1500: "#F9FAFC", + }, + white: "#FFFFFF", + green: { 500: "#77B750" }, + yellow: { 500: "#EEAE09" }, + red: { 500: "#CD0D24" }, }; - + const cssVariables = generateCSSVariables(fallbackColors, null); updateComponentStyles(cssVariables); - - console.log('✅ Applied fallback styling - components will use default DataHub colors'); + + console.log( + "✅ Applied fallback styling - components will use default DataHub colors", + ); return; } - - console.log('📊 Extracted DataHub design tokens'); + + console.log("📊 Extracted DataHub design tokens"); console.log(` - Colors: 
${Object.keys(colors).length} palettes`); - console.log(` - Semantic tokens: ${semanticTokens ? Object.keys(semanticTokens).length : 0} mappings\n`); - + console.log( + ` - Semantic tokens: ${semanticTokens ? Object.keys(semanticTokens).length : 0} mappings\n`, + ); + const cssVariables = generateCSSVariables(colors, semanticTokens); updateComponentStyles(cssVariables); - - console.log('\n🎉 DataHub styles sync completed!'); - console.log(' Tutorial components now match the latest DataHub UI styling.'); + + console.log("\n🎉 DataHub styles sync completed!"); + console.log( + " Tutorial components now match the latest DataHub UI styling.", + ); } // Run the script @@ -221,4 +258,8 @@ if (require.main === module) { main(); } -module.exports = { extractDataHubColors, generateCSSVariables, updateComponentStyles }; +module.exports = { + extractDataHubColors, + generateCSSVariables, + updateComponentStyles, +}; diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 07f1e37aa9b40a..82d146451624a3 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -170,57 +170,38 @@ module.exports = { }, ], }, - // { - // type: "category", - // label: "Data Ingestion Mastery (60 min)", - // collapsed: true, - // link: { type: "doc", id: "docs/learn-datahub/ingestion/overview" }, - // items: [ - // { - // type: "doc", - // label: "Recipe Fundamentals (15 min)", - // id: "docs/learn-datahub/ingestion/recipe-fundamentals", - // }, - // { - // type: "doc", - // label: "Stateful Ingestion (15 min)", - // id: "docs/learn-datahub/ingestion/stateful-ingestion", - // }, - // { - // type: "doc", - // label: "Data Profiling (15 min)", - // id: "docs/learn-datahub/ingestion/data-profiling", - // }, - // { - // type: "doc", - // label: "Advanced Patterns (15 min)", - // id: "docs/learn-datahub/ingestion/advanced-patterns", - // }, - // ], - // }, - // { - // type: "category", - // label: "Privacy & Compliance (35 min)", - // collapsed: true, - // link: { type: "doc", id: "docs/learn-datahub/privacy/overview" }, - // items: [ - // { - // type: "doc", - // label: "PII Detection (12 min)", - // id: "docs/learn-datahub/privacy/pii-detection", - // }, - // { - // type: "doc", - // label: "Privacy Controls (12 min)", - // id: "docs/learn-datahub/privacy/privacy-controls", - // }, - // { - // type: "doc", - // label: "Compliance Workflows (11 min)", - // id: "docs/learn-datahub/privacy/compliance-workflows", - // }, - // ], - // }, + { + type: "category", + label: "Data Ingestion Mastery (60 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/ingestion/overview" }, + items: [ + // Stubs or to-be-created pages can be added later + ], + }, + { + type: "category", + label: "Privacy & Compliance (35 min)", + collapsed: true, + link: { type: "doc", id: "docs/learn-datahub/privacy/overview" }, + items: [ + { + type: "doc", + label: "PII Detection (12 min)", + id: "docs/learn-datahub/privacy/pii-detection", + }, + { + type: "doc", + label: "Privacy Controls (12 min)", + id: "docs/learn-datahub/privacy/privacy-controls", + }, + { + type: "doc", + label: "Compliance Workflows (11 min)", + id: "docs/learn-datahub/privacy/compliance-workflows", + }, + ], + }, ], }, { diff --git a/docs-website/src/components/ArchitectureDiagram/index.jsx b/docs-website/src/components/ArchitectureDiagram/index.jsx index 8a42048167064e..1e76aaf79279c4 100644 --- a/docs-website/src/components/ArchitectureDiagram/index.jsx +++ b/docs-website/src/components/ArchitectureDiagram/index.jsx @@ -1,12 +1,14 @@ -import 
React from 'react'; -import styles from './styles.module.css'; +import React from "react"; +import styles from "./styles.module.css"; -const ArchitectureDiagram = ({ type = 'integration' }) => { - if (type === 'integration') { +const ArchitectureDiagram = ({ type = "integration" }) => { + if (type === "integration") { return (
-
DataHub Integration Architecture
- +
+ DataHub Integration Architecture +
+
{/* Source Systems Layer */}
diff --git a/docs-website/src/components/ArchitectureDiagram/styles.module.css b/docs-website/src/components/ArchitectureDiagram/styles.module.css index 64ae703bffd60c..27bee346df9831 100644 --- a/docs-website/src/components/ArchitectureDiagram/styles.module.css +++ b/docs-website/src/components/ArchitectureDiagram/styles.module.css @@ -130,8 +130,13 @@ } @keyframes pulse { - 0%, 100% { opacity: 0.7; } - 50% { opacity: 1; } + 0%, + 100% { + opacity: 0.7; + } + 50% { + opacity: 1; + } } .diagramFooter { @@ -173,16 +178,16 @@ .diagramContainer { flex-direction: column; } - + .arrowLayer { flex-direction: row; transform: rotate(90deg); } - + .arrow { transform: rotate(90deg); } - + .dataFlow { flex-direction: column; gap: 8px; @@ -190,27 +195,27 @@ } /* Dark mode support */ -[data-theme='dark'] .architectureDiagram { +[data-theme="dark"] .architectureDiagram { background: linear-gradient(135deg, #1e293b 0%, #334155 100%); border-color: var(--ifm-color-primary-dark); } -[data-theme='dark'] .node { +[data-theme="dark"] .node { background: var(--ifm-color-emphasis-100); color: var(--ifm-color-emphasis-800); } -[data-theme='dark'] .sourceNode { +[data-theme="dark"] .sourceNode { background: linear-gradient(135deg, #064e3b 0%, #065f46 100%); color: #ecfdf5; } -[data-theme='dark'] .coreNode { +[data-theme="dark"] .coreNode { background: linear-gradient(135deg, #1e3a8a 0%, #1e40af 100%); color: #eff6ff; } -[data-theme='dark'] .uiNode { +[data-theme="dark"] .uiNode { background: linear-gradient(135deg, #581c87 0%, #6b21a8 100%); color: #f3f4f6; } diff --git a/docs-website/src/components/CardDropdown/CardDropdown.tsx b/docs-website/src/components/CardDropdown/CardDropdown.tsx index 5f896e1ecd8c04..1ac26329e9a31d 100644 --- a/docs-website/src/components/CardDropdown/CardDropdown.tsx +++ b/docs-website/src/components/CardDropdown/CardDropdown.tsx @@ -1,19 +1,22 @@ -import React, {useState, useRef, useEffect} from 'react'; -import clsx from 'clsx'; +import React, { useState, useRef, useEffect } from "react"; +import clsx from "clsx"; import { isRegexpStringMatch, useCollapsible, Collapsible, -} from '@docusaurus/theme-common'; -import {isSamePath, useLocalPathname} from '@docusaurus/theme-common/internal'; -import NavbarNavLink from '@theme/NavbarItem/NavbarNavLink'; -import NavbarItem, {type LinkLikeNavbarItemProps} from '@theme/NavbarItem'; +} from "@docusaurus/theme-common"; +import { + isSamePath, + useLocalPathname, +} from "@docusaurus/theme-common/internal"; +import NavbarNavLink from "@theme/NavbarItem/NavbarNavLink"; +import NavbarItem, { type LinkLikeNavbarItemProps } from "@theme/NavbarItem"; import type { DesktopOrMobileNavBarItemProps, Props, -} from '@theme/NavbarItem/DropdownNavbarItem'; -import styles from './styles.module.scss'; -import Link from '@docusaurus/Link'; +} from "@theme/NavbarItem/DropdownNavbarItem"; +import styles from "./styles.module.scss"; +import Link from "@docusaurus/Link"; function isItemActive( item: LinkLikeNavbarItemProps, @@ -47,7 +50,7 @@ function DropdownNavbarItemDesktop({ ...props }: DesktopOrMobileNavBarItemProps) { const dropdownRef = useRef(null); - const [showDropdown, setShowDropdown] = useState(false); + const [showDropdown, setShowDropdown] = useState(false); useEffect(() => { const handleClickOutside = ( @@ -62,24 +65,25 @@ function DropdownNavbarItemDesktop({ setShowDropdown(false); }; - document.addEventListener('mousedown', handleClickOutside); - document.addEventListener('touchstart', handleClickOutside); - document.addEventListener('focusin', 
handleClickOutside); + document.addEventListener("mousedown", handleClickOutside); + document.addEventListener("touchstart", handleClickOutside); + document.addEventListener("focusin", handleClickOutside); return () => { - document.removeEventListener('mousedown', handleClickOutside); - document.removeEventListener('touchstart', handleClickOutside); - document.removeEventListener('focusin', handleClickOutside); + document.removeEventListener("mousedown", handleClickOutside); + document.removeEventListener("touchstart", handleClickOutside); + document.removeEventListener("focusin", handleClickOutside); }; }, [dropdownRef]); return (
+ className={clsx("navbar__item", "dropdown", "dropdown--hoverable", { + "dropdown--right": position === "right", + "dropdown--show": showDropdown, + })} + > tag focusable in case no link target // See https://github.com/facebook/docusaurus/pull/6003 // There's probably a better solution though... - href={props.to ? undefined : '#'} - className={clsx('navbar__link', className)} + href={props.to ? undefined : "#"} + className={clsx("navbar__link", className)} {...props} onClick={props.to ? undefined : (e) => e.preventDefault()} onKeyDown={(e) => { - if (e.key === 'Enter') { + if (e.key === "Enter") { e.preventDefault(); setShowDropdown(!showDropdown); } - }}> + }} + > {props.children ?? props.label}
    @@ -137,7 +142,7 @@ function DropdownNavbarItemMobile({ const localPathname = useLocalPathname(); const containsActive = containsActiveItems(items, localPathname); - const {collapsed, toggleCollapsed, setCollapsed} = useCollapsible({ + const { collapsed, toggleCollapsed, setCollapsed } = useCollapsible({ initialState: () => !containsActive, }); @@ -150,21 +155,23 @@ function DropdownNavbarItemMobile({ return (
  • + className={clsx("menu__list-item", { + "menu__list-item--collapsed": collapsed, + })} + > { e.preventDefault(); toggleCollapsed(); - }}> + }} + > {props.children ?? props.label} @@ -180,4 +187,4 @@ export default function DropdownNavbarItem({ }: Props): JSX.Element { const Comp = mobile ? DropdownNavbarItemMobile : DropdownNavbarItemDesktop; return ; -} \ No newline at end of file +} diff --git a/docs-website/src/components/CardDropdown/styles.module.scss b/docs-website/src/components/CardDropdown/styles.module.scss index 9b5fff816e5752..59814e0366be12 100644 --- a/docs-website/src/components/CardDropdown/styles.module.scss +++ b/docs-website/src/components/CardDropdown/styles.module.scss @@ -17,51 +17,52 @@ align-items: flex-start; gap: 0.98219rem; border-radius: var(--number-scales-2s-20, 1.25rem); - background: #FFF; + background: #fff; box-shadow: 0px 16px 16px 0px rgba(0, 0, 0, 0.25); } - .wrapper { display: flex; flex-direction: column; - gap: 0.6rem; + gap: 0.6rem; } .card { display: flex; - align-items: center; + align-items: center; padding: 1rem 0.8rem; text-align: left; - gap: 0.5rem; + gap: 0.5rem; border-radius: 0.72681rem; - background: #F7F7F7; + background: #f7f7f7; width: 12rem; - transition: transform 0.2s, box-shadow 0.2s; + transition: + transform 0.2s, + box-shadow 0.2s; &:hover { - transform: translateY(-5px); - box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1); - text-decoration: none; - color: inherit; - } + transform: translateY(-5px); + box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1); + text-decoration: none; + color: inherit; + } } .icon { - flex-shrink: 0; - width: 1.3rem; - height: 1.3rem; - display: flex; - justify-content: center; - align-items: center; + flex-shrink: 0; + width: 1.3rem; + height: 1.3rem; + display: flex; + justify-content: center; + align-items: center; } .title { - flex-grow: 1; - color: #1E1E1E; - font-family: Manrope, sans-serif; - font-size: 0.9rem; - font-weight: 600; - line-height: 1.5rem; - letter-spacing: -0.011rem; + flex-grow: 1; + color: #1e1e1e; + font-family: Manrope, sans-serif; + font-size: 0.9rem; + font-weight: 600; + line-height: 1.5rem; + letter-spacing: -0.011rem; } diff --git a/docs-website/src/components/DataHubEntityCard/index.jsx b/docs-website/src/components/DataHubEntityCard/index.jsx index 813827912ddd46..8ddc683eb7052d 100644 --- a/docs-website/src/components/DataHubEntityCard/index.jsx +++ b/docs-website/src/components/DataHubEntityCard/index.jsx @@ -1,105 +1,111 @@ -import React from 'react'; -import styles from './styles.module.css'; +import React from "react"; +import styles from "./styles.module.css"; // Health icon components matching DataHub's HealthIcon (same as lineage nodes) const HealthIcon = ({ health, size = 14 }) => { const iconStyle = { width: `${size}px`, height: `${size}px`, - display: 'inline-block', - marginLeft: '6px', - verticalAlign: 'middle', + display: "inline-block", + marginLeft: "6px", + verticalAlign: "middle", }; - if (health === 'Good') { + if (health === "Good") { return ( - + ); } - - if (health === 'Warning' || health === 'Critical') { - const color = health === 'Critical' ? '#ff4d4f' : '#faad14'; + + if (health === "Warning" || health === "Critical") { + const color = health === "Critical" ? 
"#ff4d4f" : "#faad14"; return ( - + ); } - + return null; }; // Simplified version of DataHub's DefaultPreviewCard for tutorials const DataHubEntityCard = ({ name, - type = 'Dataset', - platform = 'Hive', + type = "Dataset", + platform = "Hive", description, owners = [], tags = [], glossaryTerms = [], assertions = { passing: 0, failing: 0, total: 0 }, - health = 'Good', - url = '#', - className = '', + health = "Good", + url = "#", + className = "", }) => { // Use actual DataHub platform logos from the docs website const getPlatformLogo = (platformName) => { const logoMap = { - 'Hive': '/img/logos/platforms/hive.svg', - 'Kafka': '/img/logos/platforms/kafka.svg', - 'HDFS': '/img/logos/platforms/hadoop.svg', - 'Snowflake': '/img/logos/platforms/snowflake.svg', - 'BigQuery': '/img/logos/platforms/bigquery.svg', - 'Spark': '/img/logos/platforms/spark.svg', - 'PostgreSQL': '/img/logos/platforms/postgres.svg', - 'Postgres': '/img/logos/platforms/postgres.svg', - 'postgres': '/img/logos/platforms/postgres.svg', - 'MySQL': '/img/logos/platforms/mysql.svg', - 'MongoDB': '/img/logos/platforms/mongodb.svg', - 'Elasticsearch': '/img/logos/platforms/elasticsearch.svg', - 'Redshift': '/img/logos/platforms/redshift.svg', - 'Databricks': '/img/logos/platforms/databricks.png', - 'dbt': '/img/logos/platforms/dbt.svg', - 'Airflow': '/img/logos/platforms/airflow.svg', - 'Looker': '/img/logos/platforms/looker.svg', - 'Tableau': '/img/logos/platforms/tableau.png', - 'PowerBI': '/img/logos/platforms/powerbi.png', - 'Superset': '/img/logos/platforms/superset.svg', + Hive: "/img/logos/platforms/hive.svg", + Kafka: "/img/logos/platforms/kafka.svg", + HDFS: "/img/logos/platforms/hadoop.svg", + Snowflake: "/img/logos/platforms/snowflake.svg", + BigQuery: "/img/logos/platforms/bigquery.svg", + Spark: "/img/logos/platforms/spark.svg", + PostgreSQL: "/img/logos/platforms/postgres.svg", + Postgres: "/img/logos/platforms/postgres.svg", + postgres: "/img/logos/platforms/postgres.svg", + MySQL: "/img/logos/platforms/mysql.svg", + MongoDB: "/img/logos/platforms/mongodb.svg", + Elasticsearch: "/img/logos/platforms/elasticsearch.svg", + Redshift: "/img/logos/platforms/redshift.svg", + Databricks: "/img/logos/platforms/databricks.png", + dbt: "/img/logos/platforms/dbt.svg", + Airflow: "/img/logos/platforms/airflow.svg", + Looker: "/img/logos/platforms/looker.svg", + Tableau: "/img/logos/platforms/tableau.png", + PowerBI: "/img/logos/platforms/powerbi.png", + Superset: "/img/logos/platforms/superset.svg", }; - return logoMap[platformName] || '/img/logos/platforms/acryl.svg'; + return logoMap[platformName] || "/img/logos/platforms/acryl.svg"; }; const healthColors = { - 'Good': '#52c41a', - 'Warning': '#faad14', - 'Critical': '#ff4d4f', + Good: "#52c41a", + Warning: "#faad14", + Critical: "#ff4d4f", }; // Get ownership type icon based on type const getOwnershipTypeIcon = (ownershipType) => { switch (ownershipType) { - case 'Technical Owner': - return '👨‍💻'; - case 'Business Owner': - return '👔'; - case 'Data Steward': - return '🛡️'; - case 'Data Owner': - return '📊'; + case "Technical Owner": + return "👨‍💻"; + case "Business Owner": + return "👔"; + case "Data Steward": + return "🛡️"; + case "Data Owner": + return "📊"; default: - return '👤'; + return "👤"; } }; // Get assertion status icon const getAssertionStatusIcon = (assertions) => { if (assertions.total === 0) return null; - if (assertions.failing > 0) return '❌'; - if (assertions.passing === assertions.total) return '✅'; - return '⚠️'; + if (assertions.failing > 0) return 
"❌"; + if (assertions.passing === assertions.total) return "✅"; + return "⚠️"; }; // Generate color hash for tags (matching DataHub's ColorHash) @@ -107,7 +113,7 @@ const DataHubEntityCard = ({ let hash = 0; for (let i = 0; i < tagName.length; i++) { const char = tagName.charCodeAt(i); - hash = ((hash << 5) - hash) + char; + hash = (hash << 5) - hash + char; hash = hash & hash; } const hue = Math.abs(hash) % 360; @@ -117,12 +123,20 @@ const DataHubEntityCard = ({ // Generate color for glossary terms const generateTermColor = (termName) => { const colors = [ - '#1890ff', '#52c41a', '#faad14', '#f5222d', '#722ed1', - '#fa541c', '#13c2c2', '#eb2f96', '#a0d911', '#fadb14' + "#1890ff", + "#52c41a", + "#faad14", + "#f5222d", + "#722ed1", + "#fa541c", + "#13c2c2", + "#eb2f96", + "#a0d911", + "#fadb14", ]; let hash = 0; for (let i = 0; i < termName.length; i++) { - hash = ((hash << 5) - hash) + termName.charCodeAt(i); + hash = (hash << 5) - hash + termName.charCodeAt(i); } return colors[Math.abs(hash) % colors.length]; }; @@ -130,7 +144,7 @@ const DataHubEntityCard = ({ // Tag component matching DataHub's StyledTag const DataHubTag = ({ tag }) => (
    -
    @@ -141,7 +155,7 @@ const DataHubEntityCard = ({ // Glossary term component matching DataHub's Term const DataHubTerm = ({ term }) => (
    -
    @@ -153,8 +167,8 @@ const DataHubEntityCard = ({
    - {`${platform} @@ -163,7 +177,7 @@ const DataHubEntityCard = ({ {platform}
    - +

    @@ -171,13 +185,9 @@ const DataHubEntityCard = ({ {health && }

    - - {description && ( -

    - {description} -

    - )} - + + {description &&

    {description}

    } + {(tags.length > 0 || glossaryTerms.length > 0) && (
    {tags.map((tag, index) => ( @@ -188,19 +198,19 @@ const DataHubEntityCard = ({ ))}
    )} - + {owners.length > 0 && (
    Ownership
    {(() => { // Group owners by type const ownersByType = {}; - owners.forEach(owner => { - const type = owner.type || 'Technical Owner'; + owners.forEach((owner) => { + const type = owner.type || "Technical Owner"; if (!ownersByType[type]) ownersByType[type] = []; ownersByType[type].push(owner); }); - + return Object.entries(ownersByType).map(([type, typeOwners]) => (
    @@ -241,63 +251,60 @@ const DataHubEntityCard = ({ // Pre-configured sample entities for tutorials export const SampleEntities = { userCreatedTable: { - name: 'fct_users_created', - type: 'Table', - platform: 'Hive', - description: 'Fact table tracking user creation events with timestamps and attribution', + name: "fct_users_created", + type: "Table", + platform: "Hive", + description: + "Fact table tracking user creation events with timestamps and attribution", owners: [ - { name: 'john.doe@company.com', type: 'Technical Owner' }, - { name: 'sarah.smith@company.com', type: 'Business Owner' } + { name: "john.doe@company.com", type: "Technical Owner" }, + { name: "sarah.smith@company.com", type: "Business Owner" }, ], - tags: ['PII', 'User Analytics', 'Daily'], - glossaryTerms: ['User Metrics', 'Fact Table'], + tags: ["PII", "User Analytics", "Daily"], + glossaryTerms: ["User Metrics", "Fact Table"], assertions: { passing: 8, failing: 0, total: 8 }, - health: 'Good', + health: "Good", }, - + userDeletedTable: { - name: 'fct_users_deleted', - type: 'Table', - platform: 'Hive', - description: 'Fact table tracking user deletion events and reasons', - owners: [ - { name: 'john.doe@company.com', type: 'Technical Owner' } - ], - tags: ['User Analytics', 'Daily'], - glossaryTerms: ['User Metrics'], + name: "fct_users_deleted", + type: "Table", + platform: "Hive", + description: "Fact table tracking user deletion events and reasons", + owners: [{ name: "john.doe@company.com", type: "Technical Owner" }], + tags: ["User Analytics", "Daily"], + glossaryTerms: ["User Metrics"], assertions: { passing: 5, failing: 1, total: 6 }, - health: 'Good', + health: "Good", }, - + kafkaUserEvents: { - name: 'user_events', - type: 'Topic', - platform: 'Kafka', - description: 'Real-time stream of user activity events', + name: "user_events", + type: "Topic", + platform: "Kafka", + description: "Real-time stream of user activity events", owners: [ - { name: 'data.engineering@company.com', type: 'Technical Owner' }, - { name: 'mike.wilson@company.com', type: 'Data Steward' } + { name: "data.engineering@company.com", type: "Technical Owner" }, + { name: "mike.wilson@company.com", type: "Data Steward" }, ], - tags: ['Streaming', 'Real-time', 'PII'], - glossaryTerms: ['User Activity', 'Event Data'], + tags: ["Streaming", "Real-time", "PII"], + glossaryTerms: ["User Activity", "Event Data"], assertions: { passing: 12, failing: 0, total: 12 }, - health: 'Good', + health: "Good", }, - + rawUserData: { - name: 'raw_user_data', - type: 'Dataset', - platform: 'HDFS', - description: 'Raw user registration and profile data from application database', - owners: [ - { name: 'data.platform@company.com', type: 'Data Owner' } - ], - tags: ['Raw', 'PII', 'Hourly'], - glossaryTerms: ['Source Data', 'User Information'], + name: "raw_user_data", + type: "Dataset", + platform: "HDFS", + description: + "Raw user registration and profile data from application database", + owners: [{ name: "data.platform@company.com", type: "Data Owner" }], + tags: ["Raw", "PII", "Hourly"], + glossaryTerms: ["Source Data", "User Information"], assertions: { passing: 3, failing: 2, total: 5 }, - health: 'Warning', + health: "Warning", }, }; - export default DataHubEntityCard; diff --git a/docs-website/src/components/DataHubEntityCard/styles.module.css b/docs-website/src/components/DataHubEntityCard/styles.module.css index d87d559c25ed8f..0a8c0b32279f54 100644 --- a/docs-website/src/components/DataHubEntityCard/styles.module.css +++ 
b/docs-website/src/components/DataHubEntityCard/styles.module.css @@ -1,19 +1,19 @@ /* Auto-generated DataHub Design Tokens */ :root { - --datahub-primary: #533FD1; - --datahub-primary-dark: #4C39BE; - --datahub-primary-light: #7565DA; - --datahub-primary-lightest: #F1F3FD; - --datahub-gray-100: #EBECF0; + --datahub-primary: #533fd1; + --datahub-primary-dark: #4c39be; + --datahub-primary-light: #7565da; + --datahub-primary-lightest: #f1f3fd; + --datahub-gray-100: #ebecf0; --datahub-gray-600: #374066; - --datahub-gray-1700: #5F6685; - --datahub-gray-1800: #8088A3; - --datahub-gray-1500: #F9FAFC; - --datahub-white: #FFFFFF; - --datahub-success: #77B750; - --datahub-warning: #EEAE09; - --datahub-error: #CD0D24; - --datahub-border: #E9EAEE; + --datahub-gray-1700: #5f6685; + --datahub-gray-1800: #8088a3; + --datahub-gray-1500: #f9fafc; + --datahub-white: #ffffff; + --datahub-success: #77b750; + --datahub-warning: #eeae09; + --datahub-error: #cd0d24; + --datahub-border: #e9eaee; --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07); --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15); --datahub-node-width: 320px; @@ -22,212 +22,242 @@ } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; 
- --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + 
--datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + 
--datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; + --datahub-border: #374066; +} + +/* Dark mode colors */ +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: #81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } @@ -236,36 +266,36 @@ /* Import DataHub color variables */ :root { /* DataHub Alchemy Design System Colors */ - --datahub-primary: #533FD1; - --datahub-primary-dark: #4C39BE; - --datahub-primary-light: #7565DA; - --datahub-primary-lightest: #F1F3FD; - --datahub-gray-100: #EBECF0; + --datahub-primary: #533fd1; + --datahub-primary-dark: #4c39be; + --datahub-primary-light: #7565da; + --datahub-primary-lightest: #f1f3fd; + --datahub-gray-100: #ebecf0; --datahub-gray-600: #374066; - --datahub-gray-1700: #5F6685; - --datahub-gray-1800: #8088A3; - --datahub-gray-1500: #F9FAFC; - --datahub-white: #FFFFFF; - --datahub-success: #77B750; - --datahub-warning: #EEAE09; - --datahub-error: #CD0D24; - --datahub-border: #E9EAEE; + --datahub-gray-1700: #5f6685; + --datahub-gray-1800: #8088a3; + --datahub-gray-1500: #f9fafc; + --datahub-white: #ffffff; + --datahub-success: #77b750; + --datahub-warning: #eeae09; + --datahub-error: #cd0d24; + --datahub-border: #e9eaee; --datahub-shadow: 0px 1px 2px 0px rgba(33, 23, 95, 0.07); --datahub-shadow-hover: 0 2px 8px rgba(83, 63, 209, 0.15); } /* Dark mode colors */ -[data-theme='dark'] { - --datahub-primary: #7565DA; - --datahub-primary-dark: #533FD1; - --datahub-primary-light: #8C7EE0; - --datahub-primary-lightest: #2E2373; - --datahub-gray-100: #2F3657; - --datahub-gray-600: #CFD1DA; - --datahub-gray-1700: #A9ADBD; - --datahub-gray-1800: #81879F; - --datahub-gray-1500: #1E2338; - --datahub-white: #272D48; +[data-theme="dark"] { + --datahub-primary: #7565da; + --datahub-primary-dark: #533fd1; + --datahub-primary-light: #8c7ee0; + --datahub-primary-lightest: #2e2373; + --datahub-gray-100: #2f3657; + --datahub-gray-600: #cfd1da; + --datahub-gray-1700: #a9adbd; + --datahub-gray-1800: 
#81879f; + --datahub-gray-1500: #1e2338; + --datahub-white: #272d48; --datahub-border: #374066; } @@ -535,14 +565,14 @@ margin: 8px 0; padding: 12px; } - + .header { flex-direction: column; align-items: flex-start; gap: 8px; } - + .tags { gap: 4px; } -} \ No newline at end of file +} diff --git a/docs-website/src/components/DataHubLineageNode/index.jsx b/docs-website/src/components/DataHubLineageNode/index.jsx index b67c94fecdcda5..c89bdc8b190dac 100644 --- a/docs-website/src/components/DataHubLineageNode/index.jsx +++ b/docs-website/src/components/DataHubLineageNode/index.jsx @@ -1,131 +1,133 @@ -import React, { useState } from 'react'; -import styles from './styles.module.css'; -import LineageLayoutGrid from '../LineageLayoutGrid'; +import React, { useState } from "react"; +import styles from "./styles.module.css"; +import { TagPill } from "../Pills/TagPill"; +import { GlossaryTermPill } from "../Pills/GlossaryTermPill"; +import LineageLayoutGrid from "../LineageLayoutGrid"; // Simplified version of DataHub's LineageEntityNode for tutorials const DataHubLineageNode = ({ name, - type = 'Dataset', - entityType = 'Dataset', // DataHub entity type (Dataset, DataJob, etc.) - platform = 'Hive', + type = "Dataset", + entityType = "Dataset", // DataHub entity type (Dataset, DataJob, etc.) + platform = "Hive", isSelected = false, isCenter = false, - health = 'Good', + health = "Good", isExpanded = false, columns = [], tags = [], glossaryTerms = [], onClick, onToggleExpand, - className = '', + className = "", }) => { // Use actual DataHub platform logos from the docs website const getPlatformLogo = (platformName) => { const logoMap = { // Analytics & BI - 'Looker': '/img/logos/platforms/looker.svg', - 'Tableau': '/img/logos/platforms/tableau.png', - 'PowerBI': '/img/logos/platforms/powerbi.png', - 'Metabase': '/img/logos/platforms/metabase.svg', - 'Superset': '/img/logos/platforms/superset.svg', - 'Mode': '/img/logos/platforms/mode.png', - 'Preset': '/img/logos/platforms/presetlogo.svg', - 'Sigma': '/img/logos/platforms/sigma.png', - 'Qlik': '/img/logos/platforms/qlik.png', - 'Redash': '/img/logos/platforms/redash.svg', - + Looker: "/img/logos/platforms/looker.svg", + Tableau: "/img/logos/platforms/tableau.png", + PowerBI: "/img/logos/platforms/powerbi.png", + Metabase: "/img/logos/platforms/metabase.svg", + Superset: "/img/logos/platforms/superset.svg", + Mode: "/img/logos/platforms/mode.png", + Preset: "/img/logos/platforms/presetlogo.svg", + Sigma: "/img/logos/platforms/sigma.png", + Qlik: "/img/logos/platforms/qlik.png", + Redash: "/img/logos/platforms/redash.svg", + // Cloud Data Warehouses - 'Snowflake': '/img/logos/platforms/snowflake.svg', - 'BigQuery': '/img/logos/platforms/bigquery.svg', - 'Redshift': '/img/logos/platforms/redshift.svg', - 'Databricks': '/img/logos/platforms/databricks.png', - 'Synapse': '/img/logos/platforms/mssql.svg', - + Snowflake: "/img/logos/platforms/snowflake.svg", + BigQuery: "/img/logos/platforms/bigquery.svg", + Redshift: "/img/logos/platforms/redshift.svg", + Databricks: "/img/logos/platforms/databricks.png", + Synapse: "/img/logos/platforms/mssql.svg", + // Databases - 'PostgreSQL': '/img/logos/platforms/postgres.svg', - 'Postgres': '/img/logos/platforms/postgres.svg', - 'postgres': '/img/logos/platforms/postgres.svg', - 'MySQL': '/img/logos/platforms/mysql.svg', - 'Oracle': '/img/logos/platforms/oracle.svg', - 'SQL Server': '/img/logos/platforms/mssql.svg', - 'MongoDB': '/img/logos/platforms/mongodb.svg', - 'Cassandra': '/img/logos/platforms/cassandra.png', 
- 'Neo4j': '/img/logos/platforms/neo4j.png', - 'DynamoDB': '/img/logos/platforms/dynamodb.png', - 'ClickHouse': '/img/logos/platforms/clickhouse.svg', - 'CockroachDB': '/img/logos/platforms/cockroachdb.png', - 'MariaDB': '/img/logos/platforms/mariadb.png', - 'Teradata': '/img/logos/platforms/teradata.svg', - 'Vertica': '/img/logos/platforms/vertica.svg', - 'SAP HANA': '/img/logos/platforms/hana.svg', - 'Couchbase': '/img/logos/platforms/couchbase.svg', - + PostgreSQL: "/img/logos/platforms/postgres.svg", + Postgres: "/img/logos/platforms/postgres.svg", + postgres: "/img/logos/platforms/postgres.svg", + MySQL: "/img/logos/platforms/mysql.svg", + Oracle: "/img/logos/platforms/oracle.svg", + "SQL Server": "/img/logos/platforms/mssql.svg", + MongoDB: "/img/logos/platforms/mongodb.svg", + Cassandra: "/img/logos/platforms/cassandra.png", + Neo4j: "/img/logos/platforms/neo4j.png", + DynamoDB: "/img/logos/platforms/dynamodb.png", + ClickHouse: "/img/logos/platforms/clickhouse.svg", + CockroachDB: "/img/logos/platforms/cockroachdb.png", + MariaDB: "/img/logos/platforms/mariadb.png", + Teradata: "/img/logos/platforms/teradata.svg", + Vertica: "/img/logos/platforms/vertica.svg", + "SAP HANA": "/img/logos/platforms/hana.svg", + Couchbase: "/img/logos/platforms/couchbase.svg", + // Big Data & Processing - 'Hive': '/img/logos/platforms/hive.svg', - 'Spark': '/img/logos/platforms/spark.svg', - 'Hadoop': '/img/logos/platforms/hadoop.svg', - 'Kafka': '/img/logos/platforms/kafka.svg', - 'Pulsar': '/img/logos/platforms/pulsar.png', - 'Presto': '/img/logos/platforms/presto.svg', - 'Trino': '/img/logos/platforms/trino.png', - 'Druid': '/img/logos/platforms/druid.svg', - 'Pinot': '/img/logos/platforms/pinot.svg', - 'Kusto': '/img/logos/platforms/kusto.svg', - 'Iceberg': '/img/logos/platforms/iceberg.png', - 'Delta Lake': '/img/logos/platforms/deltalake.svg', - 'Hudi': '/img/logos/platforms/hudi.png', - + Hive: "/img/logos/platforms/hive.svg", + Spark: "/img/logos/platforms/spark.svg", + Hadoop: "/img/logos/platforms/hadoop.svg", + Kafka: "/img/logos/platforms/kafka.svg", + Pulsar: "/img/logos/platforms/pulsar.png", + Presto: "/img/logos/platforms/presto.svg", + Trino: "/img/logos/platforms/trino.png", + Druid: "/img/logos/platforms/druid.svg", + Pinot: "/img/logos/platforms/pinot.svg", + Kusto: "/img/logos/platforms/kusto.svg", + Iceberg: "/img/logos/platforms/iceberg.png", + "Delta Lake": "/img/logos/platforms/deltalake.svg", + Hudi: "/img/logos/platforms/hudi.png", + // Cloud Storage - 'S3': '/img/logos/platforms/s3.svg', - 'GCS': '/img/logos/platforms/gcs.svg', - 'ADLS': '/img/logos/platforms/adls.svg', - + S3: "/img/logos/platforms/s3.svg", + GCS: "/img/logos/platforms/gcs.svg", + ADLS: "/img/logos/platforms/adls.svg", + // ETL & Orchestration - 'Airflow': '/img/logos/platforms/airflow.svg', - 'dbt': '/img/logos/platforms/dbt.svg', - 'Fivetran': '/img/logos/platforms/fivetran.png', - 'Dagster': '/img/logos/platforms/dagster.svg', - 'Prefect': '/img/logos/platforms/prefect.svg', - 'Snaplogic': '/img/logos/platforms/snaplogic.svg', - 'Nifi': '/img/logos/platforms/nifi.svg', - + Airflow: "/img/logos/platforms/airflow.svg", + dbt: "/img/logos/platforms/dbt.svg", + Fivetran: "/img/logos/platforms/fivetran.png", + Dagster: "/img/logos/platforms/dagster.svg", + Prefect: "/img/logos/platforms/prefect.svg", + Snaplogic: "/img/logos/platforms/snaplogic.svg", + Nifi: "/img/logos/platforms/nifi.svg", + // ML & AI - 'MLflow': '/img/logos/platforms/mlflow.svg', - 'SageMaker': '/img/logos/platforms/sagemaker.svg', - 
'Vertex AI': '/img/logos/platforms/vertexai.png', - + MLflow: "/img/logos/platforms/mlflow.svg", + SageMaker: "/img/logos/platforms/sagemaker.svg", + "Vertex AI": "/img/logos/platforms/vertexai.png", + // Cloud Platforms - 'AWS Athena': '/img/logos/platforms/athena.svg', - 'AWS Glue': '/img/logos/platforms/glue.svg', - 'Azure': '/img/logos/platforms/azure-ad.svg', - 'Elasticsearch': '/img/logos/platforms/elasticsearch.svg', - + "AWS Athena": "/img/logos/platforms/athena.svg", + "AWS Glue": "/img/logos/platforms/glue.svg", + Azure: "/img/logos/platforms/azure-ad.svg", + Elasticsearch: "/img/logos/platforms/elasticsearch.svg", + // Data Quality & Governance - 'Great Expectations': '/img/logos/platforms/great-expectations.png', - 'Feast': '/img/logos/platforms/feast.svg', - 'Dremio': '/img/logos/platforms/dremio.png', - + "Great Expectations": "/img/logos/platforms/great-expectations.png", + Feast: "/img/logos/platforms/feast.svg", + Dremio: "/img/logos/platforms/dremio.png", + // File Formats & Others - 'OpenAPI': '/img/logos/platforms/openapi.png', - 'Salesforce': '/img/logos/platforms/salesforce.png', - 'Okta': '/img/logos/platforms/okta.png', - 'SAC': '/img/logos/platforms/sac.svg', - 'Hex': '/img/logos/platforms/hex.png', - 'SQLAlchemy': '/img/logos/platforms/sqlalchemy.png', - 'Protobuf': '/img/logos/platforms/protobuf.png', - + OpenAPI: "/img/logos/platforms/openapi.png", + Salesforce: "/img/logos/platforms/salesforce.png", + Okta: "/img/logos/platforms/okta.png", + SAC: "/img/logos/platforms/sac.svg", + Hex: "/img/logos/platforms/hex.png", + SQLAlchemy: "/img/logos/platforms/sqlalchemy.png", + Protobuf: "/img/logos/platforms/protobuf.png", + // DataHub & Default - 'DataHub': '/img/logos/platforms/acryl.svg', - 'API': '/img/logos/platforms/acryl.svg', // Generic for API - 'Unknown': '/img/logos/platforms/acryl.svg', + DataHub: "/img/logos/platforms/acryl.svg", + API: "/img/logos/platforms/acryl.svg", // Generic for API + Unknown: "/img/logos/platforms/acryl.svg", }; - return logoMap[platformName] || '/img/logos/platforms/acryl.svg'; + return logoMap[platformName] || "/img/logos/platforms/acryl.svg"; }; const healthColors = { - 'Good': '#52c41a', - 'Warning': '#faad14', - 'Critical': '#ff4d4f', + Good: "#52c41a", + Warning: "#faad14", + Critical: "#ff4d4f", }; // Health icon components matching DataHub's HealthIcon @@ -133,21 +135,27 @@ const DataHubLineageNode = ({ const iconStyle = { width: `${size}px`, height: `${size}px`, - display: 'inline-block', + display: "inline-block", }; - if (health === 'Good') { + if (health === "Good") { return ( - + ); } - - if (health === 'Warning' || health === 'Critical') { + + if (health === "Warning" || health === "Critical") { return ( - + ); } @@ -157,75 +165,83 @@ const DataHubLineageNode = ({ // Column type icons matching DataHub's exact TypeIcon component const getColumnTypeIcon = (columnType) => { - const iconStyle = { - width: '16px', - height: '16px', - display: 'flex', - alignItems: 'center', - justifyContent: 'center', - fontSize: '14px', - fontWeight: 'bold' + const iconStyle = { + width: "16px", + height: "16px", + display: "flex", + alignItems: "center", + justifyContent: "center", + fontSize: "14px", + fontWeight: "bold", }; - + switch (columnType?.toLowerCase()) { - case 'string': - case 'varchar': - case 'text': + case "string": + case "varchar": + case "text": // String icon - A with underline (exactly like DataHub) return ( -
    - A +
    + + A +
    ); - case 'int': - case 'integer': - case 'bigint': - case 'number': + case "int": + case "integer": + case "bigint": + case "number": // Number icon - # symbol (exactly like DataHub) return ( -
    - # +
    + #
    ); - case 'date': - case 'datetime': - case 'timestamp': + case "date": + case "datetime": + case "timestamp": // Calendar icon (simple calendar symbol) return ( -
    +
    - +
    ); - case 'boolean': - case 'bool': + case "boolean": + case "bool": // Boolean icon - simple T/F return ( -
    +
    T/F
    ); - case 'struct': - case 'object': + case "struct": + case "object": // Struct icon - curly brackets (exactly like DataHub) return ( -
    - { } +
    + {}
    ); - case 'array': - case 'list': + case "array": + case "list": // Array icon - square brackets return ( -
    +
    [ ]
    ); default: // Question mark for unknown types return ( -
    +
    ?
    ); @@ -238,10 +254,10 @@ const DataHubLineageNode = ({ let hash = 0; for (let i = 0; i < tagName.length; i++) { const char = tagName.charCodeAt(i); - hash = ((hash << 5) - hash) + char; + hash = (hash << 5) - hash + char; hash = hash & hash; // Convert to 32bit integer } - + // Convert to HSL with high saturation for vibrant colors const hue = Math.abs(hash) % 360; return `hsl(${hue}, 70%, 45%)`; @@ -250,55 +266,45 @@ const DataHubLineageNode = ({ // Generate color for glossary terms (matching DataHub's glossary colors) const generateTermColor = (termName) => { const colors = [ - '#1890ff', '#52c41a', '#faad14', '#f5222d', '#722ed1', - '#fa541c', '#13c2c2', '#eb2f96', '#a0d911', '#fadb14' + "#1890ff", + "#52c41a", + "#faad14", + "#f5222d", + "#722ed1", + "#fa541c", + "#13c2c2", + "#eb2f96", + "#a0d911", + "#fadb14", ]; let hash = 0; for (let i = 0; i < termName.length; i++) { - hash = ((hash << 5) - hash) + termName.charCodeAt(i); + hash = (hash << 5) - hash + termName.charCodeAt(i); } return colors[Math.abs(hash) % colors.length]; }; - // Tag component matching DataHub's StyledTag - const DataHubTag = ({ tag }) => ( -
    -
    - {tag} -
    - ); - - // Glossary term component matching DataHub's Term - const DataHubTerm = ({ term }) => ( -
    -
    - {term} -
    - ); + // Use shared pill components for consistency // Tags and terms group component const TagTermGroup = ({ tags, glossaryTerms, maxShow = 3 }) => { const allItems = [ - ...tags.map(tag => ({ type: 'tag', value: tag })), - ...glossaryTerms.map(term => ({ type: 'term', value: term })) + ...tags.map((tag) => ({ type: "tag", value: tag })), + ...glossaryTerms.map((term) => ({ type: "term", value: term })), ]; - + const visibleItems = allItems.slice(0, maxShow); const remainingCount = allItems.length - maxShow; - + return (
    - {visibleItems.map((item, index) => ( - item.type === 'tag' ? - : - - ))} + {visibleItems.map((item, index) => + item.type === "tag" ? ( + + ) : ( + + ), + )} {remainingCount > 0 && (
    +{remainingCount}
    )} @@ -307,19 +313,24 @@ const DataHubLineageNode = ({ }; // Determine if this is a transformation node (DataJob, Query, etc.) - const isTransformationNode = entityType === 'DataJob' || entityType === 'Query' || entityType === 'DataProcessInstance'; - + const isTransformationNode = + entityType === "DataJob" || + entityType === "Query" || + entityType === "DataProcessInstance"; + const nodeClasses = [ isTransformationNode ? styles.transformationNode : styles.lineageNode, isSelected && styles.selected, isCenter && styles.center, - className - ].filter(Boolean).join(' '); + className, + ] + .filter(Boolean) + .join(" "); // Render transformation node (circular, smaller) if (isTransformationNode) { return ( -
    - {`${platform} @@ -342,18 +353,13 @@ const DataHubLineageNode = ({ // Render entity node (rectangular, larger) return ( -
    +
    {/* Main card content - matches DataHub's CardWrapper structure */}
    - {`${platform} @@ -367,37 +373,49 @@ const DataHubLineageNode = ({ e.stopPropagation(); onToggleExpand && onToggleExpand(); }} - title={isExpanded ? 'Hide columns' : 'Show columns'} + title={isExpanded ? "Hide columns" : "Show columns"} > - {isExpanded ? '−' : '+'} + {isExpanded ? "−" : "+"} )}
    - +
    -
    {name}
    +
    + {name} +
    {platform}
    {(tags.length > 0 || glossaryTerms.length > 0) && ( - + )}
    - + {/* Expandable columns section */} {isExpanded && columns.length > 0 && (
    - Columns ({columns.length}) + + Columns ({columns.length}) +
    {columns.map((column, index) => ( -
    +
    {/* Left handle for incoming connections */}
    @@ -408,7 +426,10 @@ const DataHubLineageNode = ({ {column.type}
    {column.hasLineage && ( -
    +
    )} @@ -424,17 +445,17 @@ const DataHubLineageNode = ({ }; // Component for showing lineage connections with interactive expansion and column-level lineage -export const DataHubLineageFlow = ({ - nodes = [], - title, - className = '', +export const DataHubLineageFlow = ({ + nodes = [], + title, + className = "", showColumnLineage = false, - layout = 'linear', // 'linear', 'hierarchical', 'layers' + layout = "linear", // 'linear', 'hierarchical', 'layers' layers = null, // For hierarchical layout: [{ name: 'sources', nodes: [...] }, ...] showConnections = false, - connectionColor = 'var(--datahub-primary)', + connectionColor = "var(--datahub-primary)", connectionColors = {}, - defaultColors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6'] + defaultColors = ["#533FD1", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6"], }) => { const [allExpanded, setAllExpanded] = React.useState(false); @@ -445,25 +466,27 @@ export const DataHubLineageFlow = ({ // Build connection map for hierarchical layouts const buildConnectionMap = () => { const connections = new Map(); - + if (layers) { // Build connections from layer structure layers.forEach((layer, layerIndex) => { - layer.nodes.forEach(node => { - if (node.downstreamConnections) { - connections.set(node.name, node.downstreamConnections); - } - }); + if (layer && layer.nodes && Array.isArray(layer.nodes)) { + layer.nodes.forEach((node) => { + if (node.downstreamConnections) { + connections.set(node.name, node.downstreamConnections); + } + }); + } }); } else if (nodes.length > 0 && nodes[0].downstreamConnections) { // Build connections from node structure - nodes.forEach(node => { + nodes.forEach((node) => { if (node.downstreamConnections) { connections.set(node.name, node.downstreamConnections); } }); } - + return connections; }; @@ -474,17 +497,17 @@ export const DataHubLineageFlow = ({
    {nodes.map((node, index) => ( - {index < nodes.length - 1 && ( -
    @@ -503,15 +526,17 @@ export const DataHubLineageFlow = ({ // Keep all layers as they are - DataJobs can be in layers alongside data assets const dataAssetLayers = layers; - + // Find DataJobs for connection logic (but they stay in their assigned layers) const allDataJobs = []; - layers.forEach(layer => { - layer.nodes.forEach(node => { - if (node.entityType === 'DataJob') { - allDataJobs.push(node); - } - }); + layers.forEach((layer) => { + if (layer && layer.nodes && Array.isArray(layer.nodes)) { + layer.nodes.forEach((node) => { + if (node && node.entityType === "DataJob") { + allDataJobs.push(node); + } + }); + } }); return ( @@ -522,8 +547,10 @@ export const DataHubLineageFlow = ({ {layer.title && (
    {layer.title}
    )} -
    - {layer.nodes.map((node, nodeIndex) => ( +
    + {layer.nodes && + Array.isArray(layer.nodes) && + layer.nodes.map((node, nodeIndex) => ( ))} -
    +
    - + {layerIndex < dataAssetLayers.length - 1 && (
    - { const sourceNodes = dataAssetLayers[layerIndex].nodes; const targetNodes = dataAssetLayers[layerIndex + 1].nodes; const nodeHeight = 120; const nodeSpacing = 20; const layerPadding = 20; - const totalSourceContentHeight = sourceNodes.length * nodeHeight + (sourceNodes.length - 1) * nodeSpacing; - const totalTargetContentHeight = targetNodes.length * nodeHeight + (targetNodes.length - 1) * nodeSpacing; - return Math.max(totalSourceContentHeight + (layerPadding * 2), totalTargetContentHeight + (layerPadding * 2), 300); + const totalSourceContentHeight = + sourceNodes.length * nodeHeight + + (sourceNodes.length - 1) * nodeSpacing; + const totalTargetContentHeight = + targetNodes.length * nodeHeight + + (targetNodes.length - 1) * nodeSpacing; + return Math.max( + totalSourceContentHeight + layerPadding * 2, + totalTargetContentHeight + layerPadding * 2, + 300, + ); })()}`} preserveAspectRatio="none" > - {renderLayerConnections(dataAssetLayers[layerIndex], dataAssetLayers[layerIndex + 1], layerIndex, allDataJobs)} + {renderLayerConnections( + dataAssetLayers[layerIndex], + dataAssetLayers[layerIndex + 1], + layerIndex, + allDataJobs, + )}
    )} @@ -561,14 +601,22 @@ export const DataHubLineageFlow = ({ }; // Render DataJobs in intermediate positions between layers - const renderIntermediateDataJobs = (sourceLayer, targetLayer, allDataJobs) => { + const renderIntermediateDataJobs = ( + sourceLayer, + targetLayer, + allDataJobs, + ) => { // Find DataJobs that connect these layers - const relevantDataJobs = allDataJobs.filter(dataJob => { - const hasSourceConnection = sourceLayer.nodes.some(sourceNode => - sourceNode.downstreamConnections?.includes(dataJob.name) + const relevantDataJobs = allDataJobs.filter((dataJob) => { + const sourceNodes = sourceLayer?.nodes || []; + const targetNodes = targetLayer?.nodes || []; + + const hasSourceConnection = sourceNodes.some((sourceNode) => + sourceNode?.downstreamConnections?.includes(dataJob.name), ); - const hasTargetConnection = dataJob.downstreamConnections?.some(targetName => - targetLayer.nodes.some(targetNode => targetNode.name === targetName) + const hasTargetConnection = dataJob?.downstreamConnections?.some( + (targetName) => + targetNodes.some((targetNode) => targetNode?.name === targetName), ); return hasSourceConnection && hasTargetConnection; }); @@ -590,56 +638,85 @@ export const DataHubLineageFlow = ({ }; // Render connections between layers - const renderLayerConnections = (sourceLayer, targetLayer, layerIndex, allDataJobs = []) => { + const renderLayerConnections = ( + sourceLayer, + targetLayer, + layerIndex, + allDataJobs = [], + ) => { const connections = []; - const sourceNodes = sourceLayer.nodes; - const targetNodes = targetLayer.nodes; - + const sourceNodes = sourceLayer?.nodes || []; + const targetNodes = targetLayer?.nodes || []; + // Calculate actual node positions based on CSS layout // Nodes are centered with justify-content: center and have gap: 20px const nodeHeight = 120; // Approximate height of a collapsed node const nodeSpacing = 20; // Gap between nodes (from CSS: gap: 20px) const layerPadding = 20; // Padding from CSS: padding: 20px 0 - + // Calculate total content height for each layer - const totalSourceContentHeight = sourceNodes.length * nodeHeight + (sourceNodes.length - 1) * nodeSpacing; - const totalTargetContentHeight = targetNodes.length * nodeHeight + (targetNodes.length - 1) * nodeSpacing; - + const totalSourceContentHeight = + sourceNodes.length * nodeHeight + (sourceNodes.length - 1) * nodeSpacing; + const totalTargetContentHeight = + targetNodes.length * nodeHeight + (targetNodes.length - 1) * nodeSpacing; + // SVG height should match the layer height including padding - const svgHeight = Math.max(totalSourceContentHeight + (layerPadding * 2), totalTargetContentHeight + (layerPadding * 2), 300); - + const svgHeight = Math.max( + totalSourceContentHeight + layerPadding * 2, + totalTargetContentHeight + layerPadding * 2, + 300, + ); + // Calculate starting Y position - nodes are centered in the available space - const sourceStartY = layerPadding + (svgHeight - totalSourceContentHeight - (layerPadding * 2)) / 2; - const targetStartY = layerPadding + (svgHeight - totalTargetContentHeight - (layerPadding * 2)) / 2; - + const sourceStartY = + layerPadding + + (svgHeight - totalSourceContentHeight - layerPadding * 2) / 2; + const targetStartY = + layerPadding + + (svgHeight - totalTargetContentHeight - layerPadding * 2) / 2; + sourceNodes.forEach((sourceNode, sourceIndex) => { if (sourceNode.downstreamConnections) { - sourceNode.downstreamConnections.forEach(targetNodeName => { + sourceNode.downstreamConnections.forEach((targetNodeName) => { 
// Find target node in the target layer - const targetIndex = targetNodes.findIndex(node => node.name === targetNodeName); - + const targetIndex = targetNodes.findIndex( + (node) => node.name === targetNodeName, + ); + if (targetIndex !== -1) { // Calculate actual vertical center of each node - const sourceY = sourceStartY + (sourceIndex * (nodeHeight + nodeSpacing)) + (nodeHeight / 2); - const targetY = targetStartY + (targetIndex * (nodeHeight + nodeSpacing)) + (nodeHeight / 2); - + const sourceY = + sourceStartY + + sourceIndex * (nodeHeight + nodeSpacing) + + nodeHeight / 2; + const targetY = + targetStartY + + targetIndex * (nodeHeight + nodeSpacing) + + nodeHeight / 2; + // Use different colors for different source nodes - const colors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']; + const colors = [ + "#533FD1", + "#10b981", + "#f59e0b", + "#ef4444", + "#8b5cf6", + ]; const connectionColor = colors[sourceIndex % colors.length]; - + // Connection positioning - from right edge of source to left edge of target const startX = 200; // Right edge of source layer const endX = 0; // Left edge of target layer - + // Create smooth curves with proper horizontal arrow positioning const cp1X = startX + (endX - startX) * 0.3; const cp1Y = sourceY; const cp2X = startX + (endX - startX) * 0.7; const cp2Y = targetY; - + const pathData = `M ${startX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${endX} ${targetY}`; - + connections.push( {/* Main connection path */} @@ -655,9 +732,21 @@ export const DataHubLineageFlow = ({ strokeLinejoin="round" /> {/* Connection points */} - - - + + + , ); } }); @@ -666,8 +755,8 @@ export const DataHubLineageFlow = ({ // Create unique markers for each source node color with horizontal orientation const markers = []; - const colors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']; - + const colors = ["#533FD1", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6"]; + sourceNodes.forEach((sourceNode, sourceIndex) => { const connectionColor = colors[sourceIndex % colors.length]; markers.push( @@ -687,15 +776,13 @@ export const DataHubLineageFlow = ({ opacity="0.9" stroke="none" /> - + , ); }); return ( <> - - {markers} - + {markers} {connections} ); @@ -704,8 +791,8 @@ export const DataHubLineageFlow = ({ // Choose layout based on props const renderLayout = () => { switch (layout) { - case 'hierarchical': - case 'layers': + case "hierarchical": + case "layers": return ( ); - case 'linear': + case "linear": default: return renderLinearLayout(); } @@ -725,35 +812,58 @@ export const DataHubLineageFlow = ({ return (
    - {title &&

    {title}

    } {renderLayout()}
    ); }; // Component for showing column-level lineage connections -const ColumnLineageConnections = ({ sourceNode, targetNode, connections, hasDataJob = false }) => { +const ColumnLineageConnections = ({ + sourceNode, + targetNode, + connections, + hasDataJob = false, +}) => { if (!connections.length) return null; return (
    - + {connections.map((connection, index) => { - // When hasDataJob is true, the sourceNode is the DataJob and we need to show + // When hasDataJob is true, the sourceNode is the DataJob and we need to show // connections from the previous dataset through the DataJob to the target let sourceY, targetY; - + if (hasDataJob) { // Source is DataJob, target is Dataset - show transformation output - targetY = 50 + (targetNode.columns?.findIndex(col => col.name === connection.target) || 0) * 36; + targetY = + 50 + + (targetNode.columns?.findIndex( + (col) => col.name === connection.target, + ) || 0) * + 36; // For DataJob source, we'll position the connection at the center sourceY = 125; // Center of the DataJob } else { // Normal dataset to dataset connection - sourceY = 50 + (sourceNode.columns?.findIndex(col => col.name === connection.source) || 0) * 36; - targetY = 50 + (targetNode.columns?.findIndex(col => col.name === connection.target) || 0) * 36; + sourceY = + 50 + + (sourceNode.columns?.findIndex( + (col) => col.name === connection.source, + ) || 0) * + 36; + targetY = + 50 + + (targetNode.columns?.findIndex( + (col) => col.name === connection.target, + ) || 0) * + 36; } - + return ( {/* Connection line */} @@ -766,9 +876,21 @@ const ColumnLineageConnections = ({ sourceNode, targetNode, connections, hasData markerEnd="url(#arrowhead)" /> {/* Connection points */} - - - + + + {/* Label showing the transformation */} (
    Feature Availability
    - - Self-Hosted DataHub {saasOnly ? : } + + Self-Hosted DataHub{" "} + {saasOnly ? : }
    - + DataHub Cloud {ossOnly ? : }
    diff --git a/docs-website/src/components/Feedback/index.js b/docs-website/src/components/Feedback/index.js index ecabca445bd48c..990df7ce1c444f 100644 --- a/docs-website/src/components/Feedback/index.js +++ b/docs-website/src/components/Feedback/index.js @@ -2,7 +2,11 @@ import React, { useState, useMemo } from "react"; import clsx from "clsx"; import { supabase } from "./supabase"; import styles from "./styles.module.scss"; -import { LikeOutlined, DislikeOutlined, CheckCircleOutlined } from "@ant-design/icons"; +import { + LikeOutlined, + DislikeOutlined, + CheckCircleOutlined, +} from "@ant-design/icons"; import { v4 as uuidv4 } from "uuid"; const Feedback = () => { @@ -67,10 +71,22 @@ const Feedback = () => {
    Is this page helpful?
    - -
diff --git a/docs-website/src/components/InteractiveDiagram/index.jsx b/docs-website/src/components/InteractiveDiagram/index.jsx index f1d5136de5a748..dc7bf21972057a 100644 --- a/docs-website/src/components/InteractiveDiagram/index.jsx +++ b/docs-website/src/components/InteractiveDiagram/index.jsx @@ -10,7 +10,7 @@ import ReactFlow, { import 'reactflow/dist/style.css'; import styles from './styles.module.css'; -const InteractiveDiagram = ({ +const InteractiveDiagram = ({ nodes: initialNodes = [], edges: initialEdges = [], title,
diff --git a/docs-website/src/components/InteractiveDiagram/styles.module.css b/docs-website/src/components/InteractiveDiagram/styles.module.css index 0b3a140338ea11..6ce55ad010f023 100644 --- a/docs-website/src/components/InteractiveDiagram/styles.module.css +++ b/docs-website/src/components/InteractiveDiagram/styles.module.css @@ -176,23 +176,23 @@ } /* Dark Mode Adjustments */ -[data-theme='dark'] .diagramContainer { +[data-theme="dark"] .diagramContainer { border-color: var(--ifm-color-emphasis-300); box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3); } -[data-theme='dark'] .diagramTitle { +[data-theme="dark"] .diagramTitle { background: var(--ifm-color-emphasis-200); color: var(--ifm-color-content); } -[data-theme='dark'] .reactFlow :global(.react-flow__node) { +[data-theme="dark"] .reactFlow :global(.react-flow__node) { background: var(--ifm-color-emphasis-100); border-color: var(--ifm-color-emphasis-400); color: var(--ifm-color-content); } -[data-theme='dark'] .reactFlow :global(.react-flow__edge-text) { +[data-theme="dark"] .reactFlow :global(.react-flow__edge-text) { fill: var(--ifm-color-content); } @@ -208,12 +208,12 @@ .diagramContainer { margin: 16px 0; } - + .diagramTitle { padding: 12px 16px; font-size: 14px; } - + .reactFlow :global(.react-flow__node) { font-size: 11px; padding: 6px 8px;
diff --git a/docs-website/src/components/LineageLayoutGrid/index.jsx b/docs-website/src/components/LineageLayoutGrid/index.jsx index 0511ddeb7bc949..9d833821b9999c 100644 --- a/docs-website/src/components/LineageLayoutGrid/index.jsx +++ b/docs-website/src/components/LineageLayoutGrid/index.jsx @@ -1,49 +1,74 @@ -import React, { useRef, useEffect, useState } from 'react'; -import styles from './styles.module.css'; -import DataHubLineageNode from '../DataHubLineageNode'; +import React, { useRef, useEffect, useState } from "react"; +import styles from "./styles.module.css"; +import DataHubLineageNode from "../DataHubLineageNode"; -const LineageLayoutGrid = ({ - title, - layers = [], +const LineageLayoutGrid = ({ + title, + layers = [], showConnections = true, allExpanded = false, onToggleExpand = () => {}, connectionColors = {}, - defaultColors = ['#533FD1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6'] + defaultColors = ["#533FD1", "#10b981", "#f59e0b", "#ef4444", "#8b5cf6"], }) => { const containerRef = useRef(null); const [connections, setConnections] = useState([]); - // Build a map of all nodes with their positions + // Build a map of all nodes with their positions (supports nested sub-layers) const buildNodeMap = () => { const nodeMap = new Map(); - - layers.forEach((layer, layerIndex) => { - layer.nodes.forEach((node, nodeIndex) => { - const element = containerRef.current.querySelector(`[data-node-id="${node.name}"]`); - if (element) { - const containerRect = containerRef.current.getBoundingClientRect(); - const nodeRect = element.getBoundingClientRect(); - const containerScrollLeft = containerRef.current.scrollLeft; - const containerScrollTop = 
containerRef.current.scrollTop; - - nodeMap.set(node.name, { - node, - layerIndex, - nodeIndex, - x: nodeRect.left - containerRect.left + containerScrollLeft, - y: nodeRect.top - containerRect.top + containerScrollTop, - width: nodeRect.width, - height: nodeRect.height, - centerX: nodeRect.left - containerRect.left + nodeRect.width / 2 + containerScrollLeft, - centerY: nodeRect.top - containerRect.top + nodeRect.height / 2 + containerScrollTop, - rightEdge: nodeRect.right - containerRect.left + containerScrollLeft, - leftEdge: nodeRect.left - containerRect.left + containerScrollLeft - }); - } - }); - }); - + + const addNodesFromLayer = (layer, layerIndex) => { + // Add standalone nodes if present + if (Array.isArray(layer.nodes)) { + layer.nodes.forEach((node, nodeIndex) => { + const element = containerRef.current.querySelector( + `[data-node-id="${node.name}"]`, + ); + if (element) { + const containerRect = containerRef.current.getBoundingClientRect(); + const nodeElement = element.firstElementChild || element; + const nodeRect = nodeElement.getBoundingClientRect(); + const containerScrollLeft = containerRef.current.scrollLeft; + const containerScrollTop = containerRef.current.scrollTop; + + nodeMap.set(node.name, { + node, + layerIndex, + nodeIndex, + x: nodeRect.left - containerRect.left + containerScrollLeft, + y: nodeRect.top - containerRect.top + containerScrollTop, + width: nodeRect.width, + height: nodeRect.height, + centerX: + nodeRect.left - + containerRect.left + + nodeRect.width / 2 + + containerScrollLeft, + centerY: + nodeRect.top - + containerRect.top + + nodeRect.height / 2 + + containerScrollTop, + rightEdge: + nodeRect.right - containerRect.left + containerScrollLeft, + leftEdge: + nodeRect.left - containerRect.left + containerScrollLeft, + }); + } + }); + } + + // Recurse into sub-layers if present + if (Array.isArray(layer.subLayers)) { + layer.subLayers.forEach((subLayer) => + addNodesFromLayer(subLayer, layerIndex), + ); + } + }; + + layers.forEach((layer, layerIndex) => addNodesFromLayer(layer, layerIndex)); + return nodeMap; }; @@ -54,47 +79,56 @@ const LineageLayoutGrid = ({ const nodeMap = buildNodeMap(); const newConnections = []; - // Find all connections across all layers - layers.forEach((layer, layerIndex) => { - layer.nodes.forEach((sourceNode, sourceIndex) => { + // Find all connections across all layers (including nested sub-layers) + const processNodes = (nodes) => { + nodes.forEach((sourceNode, sourceIndex) => { if (!sourceNode.downstreamConnections) return; - sourceNode.downstreamConnections.forEach(targetNodeName => { + sourceNode.downstreamConnections.forEach((targetNodeName) => { const sourceNodeData = nodeMap.get(sourceNode.name); const targetNodeData = nodeMap.get(targetNodeName); - - if (sourceNodeData && targetNodeData) { - // Get connection color from props or use default - const connectionColor = connectionColors[sourceNode.name] || defaultColors[sourceIndex % defaultColors.length]; - - // Calculate routing path that avoids intermediate nodes - const path = calculateRoutingPath(sourceNodeData, targetNodeData, nodeMap); - // Debug: Log DataJob connections specifically - if (sourceNode.entityType === 'DataJob' || targetNodeData.node.entityType === 'DataJob') { - console.log(`DataJob connection: ${sourceNode.name} (${sourceNode.entityType}) → ${targetNodeName} (${targetNodeData.node.entityType})`); - } + if (sourceNodeData && targetNodeData) { + const connectionColor = + connectionColors[sourceNode.name] || + defaultColors[sourceIndex % 
defaultColors.length]; + const path = calculateRoutingPath( + sourceNodeData, + targetNodeData, + nodeMap, + ); + const arrowMarkerWidth = 10; // keep in sync with marker path and viewBox size + const backoffPx = 10; // small gap to avoid overlapping the node border newConnections.push({ id: `${sourceNode.name}-${targetNodeName}`, sourceX: sourceNodeData.rightEdge, sourceY: sourceNodeData.centerY, - targetX: targetNodeData.leftEdge - 16, // Account for arrowhead + // End the path at the center of the back of the arrowhead, with a slight gap before the node. + targetX: targetNodeData.leftEdge - (arrowMarkerWidth + backoffPx), targetY: targetNodeData.centerY, color: connectionColor, path: path, layerIndex: sourceNodeData.layerIndex, - sourceIndex + sourceIndex, }); } }); }); - }); + }; + + const traverseForConnections = (layer) => { + if (Array.isArray(layer.nodes)) processNodes(layer.nodes); + if (Array.isArray(layer.subLayers)) + layer.subLayers.forEach(traverseForConnections); + }; + + layers.forEach(traverseForConnections); setConnections(newConnections); }; - // Calculate routing path that avoids nodes + // Calculate routing path that avoids nodes with proper collision detection const calculateRoutingPath = (sourceData, targetData, nodeMap) => { const sourceX = sourceData.rightEdge; const sourceY = sourceData.centerY; @@ -102,37 +136,73 @@ const LineageLayoutGrid = ({ const targetY = targetData.centerY; // Check if there are nodes between source and target that we need to route around - const intermediateNodes = Array.from(nodeMap.values()).filter(nodeData => { - // Only consider nodes that are between source and target horizontally - return nodeData.centerX > sourceX && nodeData.centerX < targetX && - nodeData.node.name !== sourceData.node.name && - nodeData.node.name !== targetData.node.name; - }); + const intermediateNodes = Array.from(nodeMap.values()).filter( + (nodeData) => { + // Add buffer zones around nodes to ensure we don't clip them + const nodeBuffer = 25; // Extra space around nodes + const nodeLeft = nodeData.x - nodeBuffer; + const nodeRight = nodeData.x + nodeData.width + nodeBuffer; + const nodeTop = nodeData.y - nodeBuffer; + const nodeBottom = nodeData.y + nodeData.height + nodeBuffer; + + // Check if this node is in the horizontal path between source and target + const isInHorizontalPath = nodeLeft < targetX && nodeRight > sourceX; + const isNotSourceOrTarget = + nodeData.node.name !== sourceData.node.name && + nodeData.node.name !== targetData.node.name; + + // Also check if the direct line from source to target would intersect this node + const directLineIntersectsNode = + // Line passes through the node's Y range + (sourceY <= nodeBottom && sourceY >= nodeTop) || + (targetY <= nodeBottom && targetY >= nodeTop) || + (sourceY <= nodeTop && targetY >= nodeBottom) || + (sourceY >= nodeBottom && targetY <= nodeTop); + + return ( + isInHorizontalPath && isNotSourceOrTarget && directLineIntersectsNode + ); + }, + ); if (intermediateNodes.length === 0) { // Direct path if no obstacles return null; // Will use default curve } - // Find routing level that avoids all intermediate nodes - const allNodeYs = intermediateNodes.map(n => [n.y, n.y + n.height]).flat(); - allNodeYs.push(sourceY, targetY); - - const minY = Math.min(...allNodeYs); - const maxY = Math.max(...allNodeYs); - - // Route above or below based on which has more space and is more natural - const routingOffset = 40; - const routeAbove = minY - routingOffset; - const routeBelow = maxY + routingOffset; - - // 
Choose the route that's closer to the average of source and target + // Calculate routing paths that avoid all intermediate nodes + const nodeObstacles = intermediateNodes.map((nodeData) => ({ + top: nodeData.y - 25, // Buffer above node + bottom: nodeData.y + nodeData.height + 25, // Buffer below node + left: nodeData.x - 25, + right: nodeData.x + nodeData.width + 25, + centerY: nodeData.centerY, + name: nodeData.node.name, + })); + + // Find the best routing level (above or below obstacles) + const allTops = nodeObstacles.map((n) => n.top); + const allBottoms = nodeObstacles.map((n) => n.bottom); + + const highestTop = Math.min(...allTops); + const lowestBottom = Math.max(...allBottoms); + + // Calculate routing options with more clearance + const routingOffset = 60; // Larger offset to more clearly bend around nodes + const routeAbove = highestTop - routingOffset; + const routeBelow = lowestBottom + routingOffset; + + // Choose the route that's closer to the average of source and target Y positions const avgY = (sourceY + targetY) / 2; - const routingY = Math.abs(routeAbove - avgY) < Math.abs(routeBelow - avgY) ? routeAbove : routeBelow; + const routingY = + Math.abs(routeAbove - avgY) < Math.abs(routeBelow - avgY) + ? routeAbove + : routeBelow; return { - type: 'routed', - routingY + type: "routed", + routingY, + obstacles: nodeObstacles, }; }; @@ -146,18 +216,18 @@ const LineageLayoutGrid = ({ useEffect(() => { const handleResize = () => calculateConnections(); const handleScroll = () => calculateConnections(); - - window.addEventListener('resize', handleResize); - + + window.addEventListener("resize", handleResize); + // Add scroll listener to the container if (containerRef.current) { - containerRef.current.addEventListener('scroll', handleScroll); + containerRef.current.addEventListener("scroll", handleScroll); } - + return () => { - window.removeEventListener('resize', handleResize); + window.removeEventListener("resize", handleResize); if (containerRef.current) { - containerRef.current.removeEventListener('scroll', handleScroll); + containerRef.current.removeEventListener("scroll", handleScroll); } }; }, []); @@ -165,75 +235,145 @@ const LineageLayoutGrid = ({ // Generate path that routes around nodes when needed const generatePath = (connection) => { const { sourceX, sourceY, targetX, targetY, path } = connection; - - if (!path || path.type !== 'routed') { + + if (!path || path.type !== "routed") { // Simple Bezier curve for direct connections const horizontalDistance = targetX - sourceX; const cp1X = sourceX + horizontalDistance * 0.5; const cp1Y = sourceY; const cp2X = sourceX + horizontalDistance * 0.5; const cp2Y = targetY; - + return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${targetX} ${targetY}`; } - - // Routed path that curves above or below obstacles + + // Routed path using two cubic segments that pass through a safe routing level const { routingY } = path; - const horizontalDistance = targetX - sourceX; - - // Create a smooth S-curve that goes through the routing level - const midX = sourceX + horizontalDistance * 0.5; - - // Control points for smooth routing curve - const cp1X = sourceX + horizontalDistance * 0.25; - const cp1Y = sourceY + (routingY - sourceY) * 0.3; - - const cp2X = sourceX + horizontalDistance * 0.75; - const cp2Y = targetY + (routingY - targetY) * 0.3; - - return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${targetX} ${targetY}`; + const midX = sourceX + (targetX - sourceX) * 0.5; + const bend = 40; // 
Horizontal control point offset for smooth bends + + // First curve: from source to mid point at routingY + const cp1X = sourceX + bend; + const cp1Y = sourceY; + const cp2X = midX - bend; + const cp2Y = routingY; + + // Second curve: from mid point at routingY to target + const cp3X = midX + bend; + const cp3Y = routingY; + const cp4X = targetX - bend; + const cp4Y = targetY; + + return `M ${sourceX} ${sourceY} C ${cp1X} ${cp1Y}, ${cp2X} ${cp2Y}, ${midX} ${routingY} C ${cp3X} ${cp3Y}, ${cp4X} ${cp4Y}, ${targetX} ${targetY}`; + }; + + // Recursive renderer for layers and sublayers + const renderLayerContent = (layer) => { + const hasSubLayers = + Array.isArray(layer.subLayers) && layer.subLayers.length > 0; + + if (!hasSubLayers) { + // Render standalone nodes + return ( +
    + {(layer.nodes || []).map((node, nodeIndex) => ( +
    + +
    + ))} +
    + ); + } + + // Render sublayers: support horizontal columns or vertical stacks at each level + if (layer.subLayersLayout === "columns") { + return ( +
    + {/* Leftmost column for standalone nodes at this level */} + {Array.isArray(layer.nodes) && layer.nodes.length > 0 && ( +
    +
    + {layer.nodes.map((node, nodeIndex) => ( +
    + +
    + ))} +
    +
    + )} + {layer.subLayers.map((subLayer, subLayerIndex) => ( +
    + {subLayer.title && ( +
    {subLayer.title}
    + )} + {renderLayerContent(subLayer)} +
    + ))} +
    + ); + } + + // Default vertical stack + return ( +
    + {layer.subLayers.map((subLayer, subLayerIndex) => ( +
    + {subLayer.title && ( +
    {subLayer.title}
    + )} + {renderLayerContent(subLayer)} +
    + ))} +
    + ); }; return (
    {title &&

    {title}

    } - +
    {layers.map((layer, layerIndex) => (
    {layer.title && (
    {layer.title}
    )} -
    - {layer.nodes.map((node, nodeIndex) => ( -
    - -
    - ))} -
    + + {renderLayerContent(layer)}
    ))}
    {/* SVG overlay for connections */} {showConnections && connections.length > 0 && ( - @@ -243,7 +383,7 @@ const LineageLayoutGrid = ({ id={`arrowhead-${connection.id}`} markerWidth="10" markerHeight="8" - refX="0" + refX="0" /* anchor the center of the back of the arrow at the path end */ refY="4" orient="0" markerUnits="strokeWidth" @@ -256,7 +396,7 @@ const LineageLayoutGrid = ({ ))} - + {connections.map((connection) => ( - - ))} diff --git a/docs-website/src/components/LineageLayoutGrid/styles.module.css b/docs-website/src/components/LineageLayoutGrid/styles.module.css index 875b1751e3effb..b189d70f66bc3a 100644 --- a/docs-website/src/components/LineageLayoutGrid/styles.module.css +++ b/docs-website/src/components/LineageLayoutGrid/styles.module.css @@ -11,11 +11,21 @@ } .title { + position: sticky; + top: 0; /* stick to top of scroll container */ + left: 50%; /* horizontally center within visible area */ + transform: translateX(-50%); + z-index: 2; /* above connections overlay */ text-align: center; margin: 0 0 24px 0; font-size: 18px; font-weight: 600; color: var(--ifm-color-content); + background: var( + --ifm-background-color + ); /* readable over content while scrolling */ + padding: 8px 12px; + border-radius: 12px; } /* CSS Grid Layout for Layers */ @@ -57,6 +67,54 @@ white-space: nowrap; } +/* Sub-layers Container */ +.subLayersContainer { + display: flex; + flex-direction: column; + gap: 32px; + width: 100%; + align-items: center; +} + +/* Horizontal sub-layer columns within a single layer */ +.subLayersRowContainer { + display: grid; + grid-auto-flow: column; + grid-auto-columns: minmax(320px, 1fr); + gap: 24px; + width: 100%; + align-items: start; + justify-content: start; /* ensure first subcolumn starts at left edge of layer */ + justify-items: start; /* ensure subcolumn content aligns to left */ +} + +.subLayerColumn { + display: flex; + flex-direction: column; + align-items: center; + width: 100%; +} + +.subLayer { + display: flex; + flex-direction: column; + align-items: center; + width: 100%; +} + +.subLayerTitle { + font-size: 12px; + font-weight: 500; + color: var(--ifm-color-emphasis-600); + text-align: center; + margin-bottom: 16px; + padding: 6px 12px; + background: var(--ifm-color-emphasis-100); + border-radius: 16px; + letter-spacing: 0.3px; + white-space: nowrap; +} + /* Nodes within each layer */ .layerNodes { display: flex; @@ -68,6 +126,17 @@ justify-content: center; } +/* Left-aligned variant for sublayer columns */ +.layerNodesLeft { + display: flex; + flex-direction: column; + gap: 20px; + align-items: flex-start; + width: 100%; + flex: 1; + justify-content: flex-start; +} + .nodeWrapper { width: 100%; max-width: 300px; @@ -106,20 +175,30 @@ grid-auto-columns: minmax(280px, 1fr); gap: 60px; } - + .nodeWrapper { max-width: 260px; } + + .subLayersRowContainer { + grid-auto-columns: minmax(260px, 1fr); + gap: 16px; + } } /* Dark mode support */ -[data-theme='dark'] .lineageContainer { +[data-theme="dark"] .lineageContainer { background: var(--ifm-background-color); border-color: var(--ifm-color-emphasis-300); } -[data-theme='dark'] .layerTitle { +[data-theme="dark"] .layerTitle { background: var(--ifm-background-color); border-color: var(--ifm-color-emphasis-300); color: var(--ifm-color-emphasis-800); } + +[data-theme="dark"] .subLayerTitle { + background: var(--ifm-color-emphasis-200); + color: var(--ifm-color-emphasis-700); +} diff --git a/docs-website/src/components/NextStepButton/index.jsx b/docs-website/src/components/NextStepButton/index.jsx index 
bb8c4415c6f691..b62e31e9b18a76 100644 --- a/docs-website/src/components/NextStepButton/index.jsx +++ b/docs-website/src/components/NextStepButton/index.jsx @@ -1,29 +1,29 @@ -import React from 'react'; -import Link from '@docusaurus/Link'; -import styles from './styles.module.css'; +import React from "react"; +import Link from "@docusaurus/Link"; +import styles from "./styles.module.css"; -const NextStepButton = ({ - to, - children, - tutorialId, - currentStep, - variant = 'primary', - icon = '→' +const NextStepButton = ({ + to, + children, + tutorialId, + currentStep, + variant = "primary", + icon = "→", }) => { const handleClick = () => { if (tutorialId && currentStep !== undefined) { const storageKey = `datahub-tutorial-${tutorialId}`; const savedProgress = localStorage.getItem(storageKey); let completedSteps = new Set(); - + if (savedProgress) { try { completedSteps = new Set(JSON.parse(savedProgress)); } catch (e) { - console.warn('Failed to parse tutorial progress:', e); + console.warn("Failed to parse tutorial progress:", e); } } - + // Mark current step as completed completedSteps.add(`step-${currentStep}`); localStorage.setItem(storageKey, JSON.stringify([...completedSteps])); @@ -31,8 +31,8 @@ const NextStepButton = ({ }; return ( - diff --git a/docs-website/src/components/NextStepButton/styles.module.css b/docs-website/src/components/NextStepButton/styles.module.css index 26028d37f54339..e24bb0dd6b2fdc 100644 --- a/docs-website/src/components/NextStepButton/styles.module.css +++ b/docs-website/src/components/NextStepButton/styles.module.css @@ -54,12 +54,12 @@ } /* Dark mode support */ -[data-theme='dark'] .secondary { +[data-theme="dark"] .secondary { border-color: var(--ifm-color-primary-light); color: var(--ifm-color-primary-light); } -[data-theme='dark'] .secondary:hover { +[data-theme="dark"] .secondary:hover { background: var(--ifm-color-primary-light); color: var(--ifm-color-primary-darkest); } diff --git a/docs-website/src/components/OSDetectionTabs/index.jsx b/docs-website/src/components/OSDetectionTabs/index.jsx index d345694c7a09e7..75b948c7081eb4 100644 --- a/docs-website/src/components/OSDetectionTabs/index.jsx +++ b/docs-website/src/components/OSDetectionTabs/index.jsx @@ -1,43 +1,53 @@ -import React, { useState, useEffect } from 'react'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import styles from './styles.module.css'; +import React, { useState, useEffect } from "react"; +import Tabs from "@theme/Tabs"; +import TabItem from "@theme/TabItem"; +import styles from "./styles.module.css"; const OSDetectionTabs = ({ children, defaultOS = null }) => { // Detect OS immediately during initialization const detectOS = () => { - if (typeof window === 'undefined') return 'linux'; // SSR fallback - + if (typeof window === "undefined") return "linux"; // SSR fallback + const userAgent = window.navigator.userAgent; const platform = window.navigator.platform; - - console.log('Detecting OS - UserAgent:', userAgent, 'Platform:', platform); - + + console.log("Detecting OS - UserAgent:", userAgent, "Platform:", platform); + // More specific macOS detection - if (platform.indexOf('Mac') !== -1 || - userAgent.indexOf('Mac') !== -1 || - userAgent.indexOf('macOS') !== -1 || - platform === 'MacIntel' || - platform === 'MacPPC') { - return 'macos'; - } else if (userAgent.indexOf('Win') !== -1 || platform.indexOf('Win') !== -1) { - return 'windows'; - } else if (userAgent.indexOf('Linux') !== -1 || platform.indexOf('Linux') !== -1) { - return 'linux'; + if ( + 
platform.indexOf("Mac") !== -1 || + userAgent.indexOf("Mac") !== -1 || + userAgent.indexOf("macOS") !== -1 || + platform === "MacIntel" || + platform === "MacPPC" + ) { + return "macos"; + } else if ( + userAgent.indexOf("Win") !== -1 || + platform.indexOf("Win") !== -1 + ) { + return "windows"; + } else if ( + userAgent.indexOf("Linux") !== -1 || + platform.indexOf("Linux") !== -1 + ) { + return "linux"; } else { - return 'linux'; // Default fallback + return "linux"; // Default fallback } }; const [detectedOS, setDetectedOS] = useState(() => detectOS()); - const [defaultValue, setDefaultValue] = useState(() => defaultOS || detectOS()); + const [defaultValue, setDefaultValue] = useState( + () => defaultOS || detectOS(), + ); useEffect(() => { // Re-detect OS on client side to handle SSR const os = detectOS(); - console.log('Detected OS:', os); + console.log("Detected OS:", os); setDetectedOS(os); - + // Set default tab to detected OS if no explicit default provided if (!defaultOS) { setDefaultValue(os); @@ -47,36 +57,45 @@ const OSDetectionTabs = ({ children, defaultOS = null }) => { // Get OS icon const getOSIcon = (osValue) => { switch (osValue) { - case 'windows': return '🪟'; - case 'macos': return '🍎'; - case 'linux': return '🐧'; - default: return ''; + case "windows": + return "🪟"; + case "macos": + return "🍎"; + case "linux": + return "🐧"; + default: + return ""; } }; // Add OS detection info to child components - const enhancedChildren = React.Children.map(children, child => { + const enhancedChildren = React.Children.map(children, (child) => { if (React.isValidElement(child) && child.type === TabItem) { const isDetected = child.props.value === detectedOS; const icon = getOSIcon(child.props.value); - const label = isDetected - ? `${icon} ${child.props.label} (Your OS)` + const label = isDetected + ? `${icon} ${child.props.label} (Your OS)` : `${icon} ${child.props.label}`; - + return React.cloneElement(child, { ...child.props, label, - className: isDetected ? styles.detectedTab : '' + className: isDetected ? styles.detectedTab : "", }); } return child; }); - console.log('Rendering OSDetectionTabs with defaultValue:', defaultValue, 'detectedOS:', detectedOS); - + console.log( + "Rendering OSDetectionTabs with defaultValue:", + defaultValue, + "detectedOS:", + detectedOS, + ); + return ( - diff --git a/docs-website/src/components/Pills/GlossaryTermPill.jsx b/docs-website/src/components/Pills/GlossaryTermPill.jsx new file mode 100644 index 00000000000000..4ec71be7b14e38 --- /dev/null +++ b/docs-website/src/components/Pills/GlossaryTermPill.jsx @@ -0,0 +1,35 @@ +import React from "react"; +import styles from "./styles.module.css"; + +const generateTermColor = (termName) => { + const colors = [ + "#1890ff", + "#52c41a", + "#faad14", + "#f5222d", + "#722ed1", + "#fa541c", + "#13c2c2", + "#eb2f96", + "#a0d911", + "#fadb14", + ]; + let hash = 0; + for (let i = 0; i < termName.length; i++) { + hash = (hash << 5) - hash + termName.charCodeAt(i); + } + return colors[Math.abs(hash) % colors.length]; +}; + +export const GlossaryTermPill = ({ term }) => ( +
    +
    + {term} +
    +); + +export default GlossaryTermPill; diff --git a/docs-website/src/components/Pills/TagPill.jsx b/docs-website/src/components/Pills/TagPill.jsx new file mode 100644 index 00000000000000..e9a963decc6b37 --- /dev/null +++ b/docs-website/src/components/Pills/TagPill.jsx @@ -0,0 +1,25 @@ +import React from "react"; +import styles from "./styles.module.css"; + +const generateTagColor = (tagName) => { + let hash = 0; + for (let i = 0; i < tagName.length; i++) { + const char = tagName.charCodeAt(i); + hash = (hash << 5) - hash + char; + hash = hash & hash; + } + const hue = Math.abs(hash) % 360; + return `hsl(${hue}, 70%, 45%)`; +}; + +export const TagPill = ({ tag }) => ( +
    +
    + {tag} +
    +); + +export default TagPill; diff --git a/docs-website/src/components/Pills/styles.module.css b/docs-website/src/components/Pills/styles.module.css new file mode 100644 index 00000000000000..dd4f289e4983f7 --- /dev/null +++ b/docs-website/src/components/Pills/styles.module.css @@ -0,0 +1,63 @@ +.tagPill { + display: flex; + align-items: center; + background: var(--ifm-background-color, #fff); + border: 1px solid var(--ifm-color-emphasis-300, #e9eaee); + border-radius: 4px; + padding: 2px 6px; + font-size: 11px; + color: var(--ifm-color-content, #374066); + max-width: 140px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.tagColorDot { + width: 6px; + height: 6px; + border-radius: 50%; + margin-right: 4px; + flex-shrink: 0; +} + +.tagText { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.termPill { + position: relative; + display: inline-flex; + align-items: center; + background: #f8f8f8; + border: 1px solid #ccd1dd; + border-radius: 5px; + padding: 3px 8px; + font-size: 12px; + font-weight: 400; + color: #565657; + max-width: 200px; + overflow: hidden; + cursor: pointer; + margin-left: 8px; /* Make room for ribbon */ +} + +.termRibbon { + position: absolute; + left: -20px; + top: 4px; + width: 50px; + transform: rotate(-45deg); + padding: 4px; + opacity: 1; + background: var(--pill-color, #1890ff); +} + +.termText { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + margin-left: 8px; +} diff --git a/docs-website/src/components/ProcessFlow/index.jsx b/docs-website/src/components/ProcessFlow/index.jsx index 239249568d749e..5c7a86068f143f 100644 --- a/docs-website/src/components/ProcessFlow/index.jsx +++ b/docs-website/src/components/ProcessFlow/index.jsx @@ -1,18 +1,19 @@ -import React from 'react'; -import styles from './styles.module.css'; +import React from "react"; +import styles from "./styles.module.css"; -const ProcessFlow = ({ - title, - steps, - type = 'horizontal', // 'horizontal', 'vertical', 'circular' +const ProcessFlow = ({ + title, + steps, + type = "horizontal", // 'horizontal', 'vertical', 'circular' showNumbers = true, - animated = true + animated = true, }) => { const renderStep = (step, index) => ( -
    - {showNumbers && ( -
    {index + 1}
    - )} +
    + {showNumbers &&
    {index + 1}
    }
    {step.title}
    {step.description && ( @@ -21,7 +22,9 @@ const ProcessFlow = ({ {step.details && (
    {step.details.map((detail, i) => ( -
    • {detail}
    +
    + • {detail} +
    ))}
    )} @@ -31,17 +34,19 @@ const ProcessFlow = ({ const renderConnector = (index) => (
    - {type === 'horizontal' ? '→' : '↓'} + {type === "horizontal" ? "→" : "↓"}
    ); // Detect if we might have overflow (4+ steps in horizontal layout) - const hasOverflow = type === 'horizontal' && steps.length >= 4; + const hasOverflow = type === "horizontal" && steps.length >= 4; return ( -
    +
    {title &&
    {title}
    } - +
    {steps.map((step, index) => ( @@ -62,55 +67,83 @@ export const DataHubWorkflows = { { title: "Requirements Analysis", description: "Define business objectives", - details: ["Identify data needs", "Set success criteria", "Define scope"] + details: [ + "Identify data needs", + "Set success criteria", + "Define scope", + ], }, { title: "Strategic Search", description: "Apply targeted queries", - details: ["Use business terms", "Apply filters", "Refine results"] + details: ["Use business terms", "Apply filters", "Refine results"], }, { title: "Asset Evaluation", description: "Assess data quality", - details: ["Check freshness", "Review schema", "Validate completeness"] + details: ["Check freshness", "Review schema", "Validate completeness"], }, { title: "Access Planning", description: "Understand requirements", - details: ["Check permissions", "Review documentation", "Plan integration"] - } - ] + details: [ + "Check permissions", + "Review documentation", + "Plan integration", + ], + }, + ], }, - + lineageAnalysis: { title: "5-Hop Lineage Analysis Method", steps: [ { title: "Start at Target", description: "Begin with dataset of interest", - details: ["Open lineage view", "Identify current dataset", "Note business context"] + details: [ + "Open lineage view", + "Identify current dataset", + "Note business context", + ], }, { title: "Trace Upstream", description: "Follow data backwards", - details: ["Identify transformations", "Check data sources", "Document dependencies"] + details: [ + "Identify transformations", + "Check data sources", + "Document dependencies", + ], }, { title: "Analyze Hops", description: "Examine each connection", - details: ["Understand business logic", "Check quality gates", "Note critical points"] + details: [ + "Understand business logic", + "Check quality gates", + "Note critical points", + ], }, { title: "Impact Assessment", description: "Evaluate change effects", - details: ["Identify affected systems", "Assess risk levels", "Plan mitigation"] + details: [ + "Identify affected systems", + "Assess risk levels", + "Plan mitigation", + ], }, { title: "Validate Understanding", description: "Confirm analysis", - details: ["Review with data owners", "Test assumptions", "Document findings"] - } - ] + details: [ + "Review with data owners", + "Test assumptions", + "Document findings", + ], + }, + ], }, ingestionProcess: { @@ -119,30 +152,42 @@ export const DataHubWorkflows = { { title: "Connection", description: "Establish secure connections", - details: ["Configure credentials", "Test connectivity", "Set up authentication"] + details: [ + "Configure credentials", + "Test connectivity", + "Set up authentication", + ], }, { title: "Discovery", description: "Scan data structures", - details: ["Identify schemas", "Map relationships", "Detect patterns"] + details: ["Identify schemas", "Map relationships", "Detect patterns"], }, { title: "Extraction", description: "Pull comprehensive metadata", - details: ["Schema information", "Statistics", "Lineage data"] + details: ["Schema information", "Statistics", "Lineage data"], }, { title: "Transformation", description: "Standardize metadata format", - details: ["Apply business rules", "Enrich with context", "Validate quality"] + details: [ + "Apply business rules", + "Enrich with context", + "Validate quality", + ], }, { title: "Loading", description: "Store in DataHub", - details: ["Update knowledge graph", "Index for search", "Enable discovery"] - } - ] - } + details: [ + "Update knowledge graph", + "Index for search", + "Enable 
discovery", + ], + }, + ], + }, }; export default ProcessFlow; diff --git a/docs-website/src/components/ProcessFlow/styles.module.css b/docs-website/src/components/ProcessFlow/styles.module.css index 024765eb19f71c..b41b3bbbe60449 100644 --- a/docs-website/src/components/ProcessFlow/styles.module.css +++ b/docs-website/src/components/ProcessFlow/styles.module.css @@ -181,11 +181,21 @@ opacity: 0; } -.animated:nth-child(1) { animation-delay: 0.1s; } -.animated:nth-child(3) { animation-delay: 0.2s; } -.animated:nth-child(5) { animation-delay: 0.3s; } -.animated:nth-child(7) { animation-delay: 0.4s; } -.animated:nth-child(9) { animation-delay: 0.5s; } +.animated:nth-child(1) { + animation-delay: 0.1s; +} +.animated:nth-child(3) { + animation-delay: 0.2s; +} +.animated:nth-child(5) { + animation-delay: 0.3s; +} +.animated:nth-child(7) { + animation-delay: 0.4s; +} +.animated:nth-child(9) { + animation-delay: 0.5s; +} @keyframes slideIn { from { @@ -204,8 +214,13 @@ } @keyframes pulse { - 0%, 100% { opacity: 0.6; } - 50% { opacity: 1; } + 0%, + 100% { + opacity: 0.6; + } + 50% { + opacity: 1; + } } /* Responsive Design */ @@ -214,36 +229,36 @@ flex-direction: column; align-items: center; } - + .horizontal .step { width: 100%; max-width: 400px; } - + .horizontal .connector { transform: rotate(90deg); width: 100%; height: 30px; } - + .stepNumber { left: 12px; } } /* Dark Mode Support */ -[data-theme='dark'] .processFlow { +[data-theme="dark"] .processFlow { background: linear-gradient(135deg, #1e293b 0%, #334155 100%); border-color: var(--ifm-color-primary-dark); } -[data-theme='dark'] .step { +[data-theme="dark"] .step { background: var(--ifm-color-emphasis-100); border-color: var(--ifm-color-primary-dark); color: var(--ifm-color-emphasis-800); } -[data-theme='dark'] .step:hover { +[data-theme="dark"] .step:hover { border-color: var(--ifm-color-primary); background: var(--ifm-color-emphasis-200); } diff --git a/docs-website/src/components/SlackUtm/index.js b/docs-website/src/components/SlackUtm/index.js index 8cfda600135dad..f50af0498f3fcc 100644 --- a/docs-website/src/components/SlackUtm/index.js +++ b/docs-website/src/components/SlackUtm/index.js @@ -1,14 +1,23 @@ import React, { useState, useMemo } from "react"; import styles from "./styles.module.scss"; -import { LikeOutlined, DislikeOutlined, CheckCircleOutlined } from "@ant-design/icons"; +import { + LikeOutlined, + DislikeOutlined, + CheckCircleOutlined, +} from "@ant-design/icons"; import { v4 as uuidv4 } from "uuid"; const SlackUtm = ({ pageId }) => { return (
    -
    - Need more help? Join the conversation in Slack! +
    + Need more help? Join the conversation in{" "} + + Slack! +
    ); diff --git a/docs-website/src/components/SlackUtm/styles.module.scss b/docs-website/src/components/SlackUtm/styles.module.scss index b1e8938dc9d086..d53613d0a5a45f 100644 --- a/docs-website/src/components/SlackUtm/styles.module.scss +++ b/docs-website/src/components/SlackUtm/styles.module.scss @@ -1,3 +1,3 @@ .slackUtm { - padding: 0.5rem 0rem; -} \ No newline at end of file + padding: 0.5rem 0rem; +} diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx b/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx index bbb38355ec0594..bfa058af2128f8 100644 --- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx +++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdown.tsx @@ -5,22 +5,25 @@ * LICENSE file in the root directory of this source tree. */ -import React, {useState, useRef, useEffect} from 'react'; -import clsx from 'clsx'; +import React, { useState, useRef, useEffect } from "react"; +import clsx from "clsx"; import { isRegexpStringMatch, useCollapsible, Collapsible, -} from '@docusaurus/theme-common'; -import {isSamePath, useLocalPathname} from '@docusaurus/theme-common/internal'; -import NavbarNavLink from '@theme/NavbarItem/NavbarNavLink'; -import NavbarItem, {type LinkLikeNavbarItemProps} from '@theme/NavbarItem'; +} from "@docusaurus/theme-common"; +import { + isSamePath, + useLocalPathname, +} from "@docusaurus/theme-common/internal"; +import NavbarNavLink from "@theme/NavbarItem/NavbarNavLink"; +import NavbarItem, { type LinkLikeNavbarItemProps } from "@theme/NavbarItem"; import type { DesktopOrMobileNavBarItemProps, Props, -} from '@theme/NavbarItem/DropdownNavbarItem'; -import styles from './styles.module.css'; -import SolutionsDropdownContent from './SolutionsDropdownContent'; +} from "@theme/NavbarItem/DropdownNavbarItem"; +import styles from "./styles.module.css"; +import SolutionsDropdownContent from "./SolutionsDropdownContent"; function isItemActive( item: LinkLikeNavbarItemProps, @@ -53,7 +56,7 @@ function DropdownNavbarItemDesktop({ ...props }: DesktopOrMobileNavBarItemProps) { const dropdownRef = useRef(null); - const [showDropdown, setShowDropdown] = useState(false); + const [showDropdown, setShowDropdown] = useState(false); useEffect(() => { const handleClickOutside = ( @@ -68,24 +71,25 @@ function DropdownNavbarItemDesktop({ setShowDropdown(false); }; - document.addEventListener('mousedown', handleClickOutside); - document.addEventListener('touchstart', handleClickOutside); - document.addEventListener('focusin', handleClickOutside); + document.addEventListener("mousedown", handleClickOutside); + document.addEventListener("touchstart", handleClickOutside); + document.addEventListener("focusin", handleClickOutside); return () => { - document.removeEventListener('mousedown', handleClickOutside); - document.removeEventListener('touchstart', handleClickOutside); - document.removeEventListener('focusin', handleClickOutside); + document.removeEventListener("mousedown", handleClickOutside); + document.removeEventListener("touchstart", handleClickOutside); + document.removeEventListener("focusin", handleClickOutside); }; }, [dropdownRef]); return (
    + className={clsx("navbar__item", "dropdown", "dropdown--hoverable", { + "dropdown--right": position === "right", + "dropdown--show": showDropdown, + })} + > tag focusable in case no link target // See https://github.com/facebook/docusaurus/pull/6003 // There's probably a better solution though... - href={props.to ? undefined : '#'} - className={clsx('navbar__link', className)} + href={props.to ? undefined : "#"} + className={clsx("navbar__link", className)} {...props} onClick={props.to ? undefined : (e) => e.preventDefault()} onKeyDown={(e) => { - if (e.key === 'Enter') { + if (e.key === "Enter") { e.preventDefault(); setShowDropdown(!showDropdown); } - }}> + }} + > {props.children ?? props.label}
      @@ -132,7 +137,7 @@ function DropdownNavbarItemMobile({ const localPathname = useLocalPathname(); const containsActive = containsActiveItems(items, localPathname); - const {collapsed, toggleCollapsed, setCollapsed} = useCollapsible({ + const { collapsed, toggleCollapsed, setCollapsed } = useCollapsible({ initialState: () => !containsActive, }); @@ -145,21 +150,23 @@ function DropdownNavbarItemMobile({ return (
    • + className={clsx("menu__list-item", { + "menu__list-item--collapsed": collapsed, + })} + > { e.preventDefault(); toggleCollapsed(); - }}> + }} + > {props.children ?? props.label} @@ -175,4 +182,4 @@ export default function DropdownNavbarItem({ }: Props): JSX.Element { const Comp = mobile ? DropdownNavbarItemMobile : DropdownNavbarItemDesktop; return ; -} \ No newline at end of file +} diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js index 79481c52f279e6..44431acdaad22f 100644 --- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js +++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/index.js @@ -1,8 +1,8 @@ -import React from 'react'; -import styles from './styles.module.scss'; -import clsx from 'clsx'; -import Link from '@docusaurus/Link'; -import solutionsDropdownContent from './solutionsDropdownContent'; +import React from "react"; +import styles from "./styles.module.scss"; +import clsx from "clsx"; +import Link from "@docusaurus/Link"; +import solutionsDropdownContent from "./solutionsDropdownContent"; function SolutionsDropdownContent() { const { fullSizeCards, halfSizeCards } = solutionsDropdownContent; @@ -20,14 +20,16 @@ function SolutionsDropdownContent() {
      {item.title}
      {item.title}
      -
      {item.description}
      +
      + {item.description} +
      @@ -37,7 +39,10 @@ function SolutionsDropdownContent() { {/* Half-size cards */}
      {halfSizeCards.map((item, index) => ( -
      +
      {item.title}
      -
      {item.description}
      +
      + {item.description} +
      diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js index ad7278a438cf81..43ca27e8957c1d 100644 --- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js +++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/solutionsDropdownContent.js @@ -1,38 +1,38 @@ const solutionsDropdownContent = { - fullSizeCards: [ - { - title: "Data Discovery", - description: "Search, Browse, Lineage, and more.", - iconImage: "/img/solutions/icon-dropdown-discovery.png", - href: "/solutions/discovery", - }, - { - title: "Data Observability", - description: "Detect and Resolve Data Quality issues", - iconImage: "/img/solutions/icon-dropdown-observe.png", - href: "/solutions/observability", - }, - { - title: "Data Governance", - description: "Automate Classifying and Governing data.", - iconImage: "/img/solutions/icon-dropdown-governance.png", - href: "/solutions/governance", - }, - ], - halfSizeCards: [ - { - title: "DataHub Core", - description: "Get started with the Open Source platform.", - iconImage: "/img/solutions/icon-dropdown-core.png", - href: "/docs/quickstart", - }, - { - title: "Cloud vs Core", - description: "Understand the differences.", - iconImage: "/img/solutions/icon-dropdown-cloud.png", - href: "/cloud", - }, - ], + fullSizeCards: [ + { + title: "Data Discovery", + description: "Search, Browse, Lineage, and more.", + iconImage: "/img/solutions/icon-dropdown-discovery.png", + href: "/solutions/discovery", + }, + { + title: "Data Observability", + description: "Detect and Resolve Data Quality issues", + iconImage: "/img/solutions/icon-dropdown-observe.png", + href: "/solutions/observability", + }, + { + title: "Data Governance", + description: "Automate Classifying and Governing data.", + iconImage: "/img/solutions/icon-dropdown-governance.png", + href: "/solutions/governance", + }, + ], + halfSizeCards: [ + { + title: "DataHub Core", + description: "Get started with the Open Source platform.", + iconImage: "/img/solutions/icon-dropdown-core.png", + href: "/docs/quickstart", + }, + { + title: "Cloud vs Core", + description: "Understand the differences.", + iconImage: "/img/solutions/icon-dropdown-cloud.png", + href: "/cloud", + }, + ], }; - -export default solutionsDropdownContent \ No newline at end of file + +export default solutionsDropdownContent; diff --git a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss index b156c3342f24ab..d32ef9a5359f10 100644 --- a/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss +++ b/docs-website/src/components/SolutionsDropdown/SolutionsDropdownContent/styles.module.scss @@ -1,137 +1,137 @@ .container { - display: flex; + display: flex; +} + +.row { + display: flex; + gap: 1rem; +} + +.card { + display: flex; + width: 12.4375rem; + height: 12.5rem; + padding: 0; + flex-direction: column; + justify-content: center; + align-items: center; + flex-shrink: 0; + border-radius: 0.72681rem; + background: #f7f7f7; + text-align: left; + text-decoration: none; + transition: + transform 0.3s ease, + box-shadow 0.3s ease; +} + +.header { + display: inline-flex; +} + +.title { + color: #1e1e1e; + font-family: Manrope; + font-style: normal; + font-weight: 600; +} + 
+.description { + color: #757575; + font-family: Manrope; + font-style: normal; + font-weight: 300; +} + +.fullSizeCard { + background-repeat: no-repeat; + background-size: contain; + background-position: bottom right; + height: 100%; + padding: 1.4rem; + + .icon { + width: 1.7rem; + height: 1.7rem; + display: block; } - - .row { - display: flex; - gap: 1rem; + + .title { + font-size: 1.1rem; + font-weight: 600; + line-height: 150%; /* 1.6875rem */ + letter-spacing: -0.01238rem; + margin-top: 0.5rem; } - - .card { + + .description { + font-size: 0.95rem; + line-height: 150%; /* 1.5rem */ + letter-spacing: -0.011rem; + } +} + +.halfSizeWrapper { + display: flex; + flex-direction: column; + gap: 0.98rem; +} + +.halfSizeCard { + display: flex; + height: 5.75rem; + padding: 1.4rem; + flex-direction: column; + align-items: center; + flex-shrink: 0; + align-self: stretch; + + .icon { display: flex; - width: 12.4375rem; - height: 12.5rem; - padding: 0; - flex-direction: column; + width: 1.26806rem; + height: 1.26806rem; + padding: 0.13206rem 0.13725rem 0.13213rem 0.13213rem; justify-content: center; align-items: center; flex-shrink: 0; - border-radius: 0.72681rem; - background: #F7F7F7; - text-align: left; - text-decoration: none; - transition: transform 0.3s ease, box-shadow 0.3s ease; - } - - .header { - display: inline-flex; + margin-right: 0.65rem; } .title { - color: #1E1E1E; + color: #1e1e1e; font-family: Manrope; + font-size: 0.95rem; font-style: normal; font-weight: 600; + line-height: 150%; /* 1.5rem */ + letter-spacing: -0.011rem; } .description { - color: #757575; - font-family: Manrope; - font-style: normal; - font-weight: 300; + font-size: 0.75rem; + line-height: 150%; /* 1.125rem */ + letter-spacing: -0.00825rem; + margin-left: 2rem; } +} - .fullSizeCard { - background-repeat: no-repeat; - background-size: contain; - background-position: bottom right; - height: 100%; - padding: 1.4rem; - - .icon { - width: 1.7rem; - height: 1.7rem; - display: block; - } - - .title { - font-size: 1.1rem; - font-weight: 600; - line-height: 150%; /* 1.6875rem */ - letter-spacing: -0.01238rem; - margin-top: 0.5rem; - } - - .description { - font-size: 0.95rem; - line-height: 150%; /* 1.5rem */ - letter-spacing: -0.011rem; - }; - } - - .halfSizeWrapper { - display: flex; - flex-direction: column; - gap: 0.98rem; - } +.card:hover { + transform: translateY(-5px); + box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1); + text-decoration: none; + color: inherit; +} - .halfSizeCard { - display: flex; - height: 5.75rem; - padding: 1.4rem; - flex-direction: column; - align-items: center; - flex-shrink: 0; - align-self: stretch; - - .icon { - display: flex; - width: 1.26806rem; - height: 1.26806rem; - padding: 0.13206rem 0.13725rem 0.13213rem 0.13213rem; - justify-content: center; - align-items: center; - flex-shrink: 0; - margin-right: 0.65rem; - } - - .title { - color: #1E1E1E; - font-family: Manrope; - font-size: 0.95rem; - font-style: normal; - font-weight: 600; - line-height: 150%; /* 1.5rem */ - letter-spacing: -0.011rem; - } - - .description { - font-size: 0.75rem; - line-height: 150%; /* 1.125rem */ - letter-spacing: -0.00825rem; - margin-left: 2rem; - } +@media (max-width: 768px) { + .col { + flex: 1 1 48%; + max-width: 48%; } - - .card:hover { - transform: translateY(-5px); - box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1); - text-decoration: none; - color: inherit; - } - - - @media (max-width: 768px) { - .col { - flex: 1 1 48%; - max-width: 48%; - } - } - - @media (max-width: 480px) { - .col { - flex: 1 1 100%; - 
max-width: 100%; - } +} + +@media (max-width: 480px) { + .col { + flex: 1 1 100%; + max-width: 100%; } - \ No newline at end of file +} diff --git a/docs-website/src/components/SolutionsDropdown/styles.module.css b/docs-website/src/components/SolutionsDropdown/styles.module.css index 09c71edf0b1850..01e6f6373c3c4a 100644 --- a/docs-website/src/components/SolutionsDropdown/styles.module.css +++ b/docs-website/src/components/SolutionsDropdown/styles.module.css @@ -5,7 +5,7 @@ * LICENSE file in the root directory of this source tree. */ - .dropdownNavbarItemMobile { +.dropdownNavbarItemMobile { cursor: pointer; } @@ -17,6 +17,6 @@ align-items: flex-start; gap: 0.98219rem; border-radius: var(--number-scales-2s-20, 1.25rem); - background: #FFF; + background: #fff; box-shadow: 0px 16px 16px 0px rgba(0, 0, 0, 0.25); -} \ No newline at end of file +} diff --git a/docs-website/src/components/StepCompletion/index.jsx b/docs-website/src/components/StepCompletion/index.jsx index dd8a9bd51fcc75..bdb8abb4b2457a 100644 --- a/docs-website/src/components/StepCompletion/index.jsx +++ b/docs-website/src/components/StepCompletion/index.jsx @@ -1,14 +1,18 @@ -import React, { useState, useEffect } from 'react'; -import styles from './styles.module.css'; +import React, { useState, useEffect } from "react"; +import styles from "./styles.module.css"; -const StepCompletion = ({ stepId, children, completionText = "✅ Completed!" }) => { +const StepCompletion = ({ + stepId, + children, + completionText = "✅ Completed!", +}) => { const [isCompleted, setIsCompleted] = useState(false); const storageKey = `datahub-step-${stepId}`; // Load completion status from localStorage useEffect(() => { const saved = localStorage.getItem(storageKey); - if (saved === 'true') { + if (saved === "true") { setIsCompleted(true); } }, [storageKey]); @@ -23,10 +27,10 @@ const StepCompletion = ({ stepId, children, completionText = "✅ Completed!" }) }; return ( -
      -
      - {children} -
      +
      +
      {children}
      diff --git a/docs-website/src/components/StepCompletion/styles.module.css b/docs-website/src/components/StepCompletion/styles.module.css index ca212b85e36a2b..b9f1b1c97e26f6 100644 --- a/docs-website/src/components/StepCompletion/styles.module.css +++ b/docs-website/src/components/StepCompletion/styles.module.css @@ -53,24 +53,24 @@ } /* Dark mode support */ -[data-theme='dark'] .stepCompletion { +[data-theme="dark"] .stepCompletion { background: #2d2d2d; border-color: #444; } -[data-theme='dark'] .stepCompletion.completed { +[data-theme="dark"] .stepCompletion.completed { background: #1e3a1e; border-color: #28a745; } -[data-theme='dark'] .completionControl { +[data-theme="dark"] .completionControl { border-top-color: #444; } -[data-theme='dark'] .completionText { +[data-theme="dark"] .completionText { color: #e9ecef; } -[data-theme='dark'] .completed .completionText { +[data-theme="dark"] .completed .completionText { color: #90ee90; } diff --git a/docs-website/src/components/TutorialExercise/index.jsx b/docs-website/src/components/TutorialExercise/index.jsx index 30455597a26052..6500f9a3d0bb20 100644 --- a/docs-website/src/components/TutorialExercise/index.jsx +++ b/docs-website/src/components/TutorialExercise/index.jsx @@ -1,40 +1,40 @@ -import React from 'react'; -import styles from './styles.module.css'; +import React from "react"; +import styles from "./styles.module.css"; -const TutorialExercise = ({ - title, - type = 'search', - icon, - children, - difficulty = 'beginner', +const TutorialExercise = ({ + title, + type = "search", + icon, + children, + difficulty = "beginner", timeEstimate, - platform = 'DataHub' + platform = "DataHub", }) => { const getTypeIcon = () => { switch (type) { - case 'search': - return '🔍'; - case 'hands-on': - return '💻'; - case 'analysis': - return '📊'; - case 'exercise': - return '🎯'; + case "search": + return "🔍"; + case "hands-on": + return "💻"; + case "analysis": + return "📊"; + case "exercise": + return "🎯"; default: - return '📝'; + return "📝"; } }; const getDifficultyColor = () => { switch (difficulty) { - case 'beginner': - return 'var(--datahub-success)'; - case 'intermediate': - return 'var(--datahub-warning)'; - case 'advanced': - return 'var(--datahub-error)'; + case "beginner": + return "var(--datahub-success)"; + case "intermediate": + return "var(--datahub-warning)"; + case "advanced": + return "var(--datahub-error)"; default: - return 'var(--datahub-primary)'; + return "var(--datahub-primary)"; } }; @@ -42,33 +42,25 @@ const TutorialExercise = ({
      -
      - {icon || getTypeIcon()} -
      +
      {icon || getTypeIcon()}

      {title}

      - {difficulty} {timeEstimate && ( - - ⏱️ {timeEstimate} - + ⏱️ {timeEstimate} )} - - {platform} - + {platform}
      -
      - {children} -
      +
      {children}
      ); }; @@ -128,9 +120,7 @@ export const HandsOnExercise = ({ title, steps, children, ...props }) => ( export const InteractiveDemo = ({ title, children, ...props }) => ( -
      - {children} -
      +
      {children}
      ); diff --git a/docs-website/src/components/TutorialExercise/styles.module.css b/docs-website/src/components/TutorialExercise/styles.module.css index 35f58e06af89b9..583b244c56cf3f 100644 --- a/docs-website/src/components/TutorialExercise/styles.module.css +++ b/docs-website/src/components/TutorialExercise/styles.module.css @@ -14,7 +14,11 @@ } .exerciseHeader { - background: linear-gradient(135deg, var(--ifm-color-emphasis-100) 0%, var(--ifm-background-color) 100%); + background: linear-gradient( + 135deg, + var(--ifm-color-emphasis-100) 0%, + var(--ifm-background-color) 100% + ); border-bottom: 1px solid var(--ifm-color-emphasis-300); padding: 16px 20px; display: flex; @@ -205,7 +209,11 @@ /* Interactive Demo Styles */ .interactiveContent { - background: linear-gradient(135deg, var(--ifm-color-primary-lightest) 0%, var(--ifm-background-color) 100%); + background: linear-gradient( + 135deg, + var(--ifm-color-primary-lightest) 0%, + var(--ifm-background-color) 100% + ); border: 1px solid var(--ifm-color-primary-light); border-radius: 8px; padding: 20px; @@ -216,39 +224,39 @@ .exerciseHeader { padding: 12px 16px; } - + .headerLeft { gap: 8px; } - + .typeIcon { width: 32px; height: 32px; font-size: 16px; } - + .exerciseTitle { font-size: 14px; } - + .exerciseContent { padding: 16px; } - + .metadata { gap: 6px; } - + .searchQuery code { min-width: auto; font-size: 13px; padding: 6px 10px; } - + .stepItem { gap: 8px; } - + .stepNumber { width: 24px; height: 24px; @@ -257,25 +265,33 @@ } /* Dark mode support */ -[data-theme='dark'] .exerciseContainer { +[data-theme="dark"] .exerciseContainer { background: var(--ifm-background-surface-color); border-color: var(--ifm-color-emphasis-300); } -[data-theme='dark'] .exerciseHeader { - background: linear-gradient(135deg, var(--ifm-color-emphasis-200) 0%, var(--ifm-background-surface-color) 100%); +[data-theme="dark"] .exerciseHeader { + background: linear-gradient( + 135deg, + var(--ifm-color-emphasis-200) 0%, + var(--ifm-background-surface-color) 100% + ); border-bottom-color: var(--ifm-color-emphasis-300); } -[data-theme='dark'] .searchItem { +[data-theme="dark"] .searchItem { background: var(--ifm-color-emphasis-200); border-color: var(--ifm-color-emphasis-300); } -[data-theme='dark'] .searchItem:hover { +[data-theme="dark"] .searchItem:hover { background: var(--ifm-color-emphasis-300); } -[data-theme='dark'] .interactiveContent { - background: linear-gradient(135deg, var(--ifm-color-primary-dark) 0%, var(--ifm-background-surface-color) 100%); +[data-theme="dark"] .interactiveContent { + background: linear-gradient( + 135deg, + var(--ifm-color-primary-dark) 0%, + var(--ifm-background-surface-color) 100% + ); } diff --git a/docs-website/src/components/TutorialProgress/index.jsx b/docs-website/src/components/TutorialProgress/index.jsx index ef84bae6933712..4e4c70568f29fd 100644 --- a/docs-website/src/components/TutorialProgress/index.jsx +++ b/docs-website/src/components/TutorialProgress/index.jsx @@ -1,15 +1,21 @@ -import React, { useState, useEffect } from 'react'; -import { useHistory, useLocation } from '@docusaurus/router'; -import styles from './styles.module.css'; - -const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) => { +import React, { useState, useEffect } from "react"; +import { useHistory, useLocation } from "@docusaurus/router"; +import styles from "./styles.module.css"; + +const TutorialProgress = ({ + tutorialId, + steps, + currentStep, + compact = false, +}) => { const [completedSteps, 
setCompletedSteps] = useState(new Set()); const [isMinimized, setIsMinimized] = useState(false); const [isScrolled, setIsScrolled] = useState(false); - + // Handle both old and new formats - const actualTutorialId = tutorialId || 'tutorial'; - const actualCurrentStep = typeof currentStep === 'string' ? currentStep : `step-${currentStep}`; + const actualTutorialId = tutorialId || "tutorial"; + const actualCurrentStep = + typeof currentStep === "string" ? currentStep : `step-${currentStep}`; const storageKey = `datahub-tutorial-${actualTutorialId}`; // Load progress from localStorage on component mount @@ -20,7 +26,7 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) = const parsed = JSON.parse(savedProgress); setCompletedSteps(new Set(parsed)); } catch (e) { - console.warn('Failed to parse tutorial progress:', e); + console.warn("Failed to parse tutorial progress:", e); } } }, [storageKey]); @@ -31,14 +37,14 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) = }, [completedSteps, storageKey]); const toggleStep = (stepId) => { - setCompletedSteps(prev => { + setCompletedSteps((prev) => { const newSet = new Set(prev); if (newSet.has(stepId)) { newSet.delete(stepId); } else { newSet.add(stepId); // Auto-mark previous steps as completed - const stepIndex = parseInt(stepId.split('-')[1]); + const stepIndex = parseInt(stepId.split("-")[1]); for (let i = 0; i < stepIndex; i++) { newSet.add(`step-${i}`); } @@ -55,7 +61,7 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) = // Auto-mark current step as completed when user navigates useEffect(() => { if (currentStep !== undefined) { - setCompletedSteps(prev => { + setCompletedSteps((prev) => { const newSet = new Set(prev); newSet.add(actualCurrentStep); return newSet; @@ -66,28 +72,33 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) = // Handle scroll behavior for auto-minimizing useEffect(() => { const handleScroll = () => { - const scrollTop = window.pageYOffset || document.documentElement.scrollTop; + const scrollTop = + window.pageYOffset || document.documentElement.scrollTop; setIsScrolled(scrollTop > 100); // Auto-minimize after scrolling 100px }; - window.addEventListener('scroll', handleScroll); - return () => window.removeEventListener('scroll', handleScroll); + window.addEventListener("scroll", handleScroll); + return () => window.removeEventListener("scroll", handleScroll); }, []); const toggleMinimized = () => { setIsMinimized(!isMinimized); }; - const completionPercentage = Math.round((completedSteps.size / steps.length) * 100); + const completionPercentage = Math.round( + (completedSteps.size / steps.length) * 100, + ); if (compact) { return (
      - 📋 Progress: {completedSteps.size}/{steps.length} + + 📋 Progress: {completedSteps.size}/{steps.length} +
      -
      @@ -101,20 +112,26 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) = if (shouldShowMinimized) { return ( -
      +
      - 📋 {completedSteps.size}/{steps.length} completed ({completionPercentage}%) + 📋 {completedSteps.size}/{steps.length} completed ( + {completionPercentage}%)
      -
      -
      @@ -127,7 +144,7 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) =

      📋 Tutorial Progress

      -
      -
      - {completedSteps.size} of {steps.length} completed ({completionPercentage}%) + {completedSteps.size} of {steps.length} completed ( + {completionPercentage}%)
      - +
      {steps.map((step, index) => { // Handle both old format (step-${index}) and new format (step.id) const stepId = step.id || `step-${index}`; const isCompleted = completedSteps.has(stepId); const isCurrent = actualCurrentStep === stepId; - + return ( -
      {step.description && ( @@ -183,7 +205,7 @@ const TutorialProgress = ({ tutorialId, steps, currentStep, compact = false }) =
      -
      diff --git a/docs-website/src/components/TutorialProgress/styles.module.css b/docs-website/src/components/TutorialProgress/styles.module.css index d70b8de5150c77..b161d2e31301c6 100644 --- a/docs-website/src/components/TutorialProgress/styles.module.css +++ b/docs-website/src/components/TutorialProgress/styles.module.css @@ -34,7 +34,11 @@ } .progressFill { - background: linear-gradient(90deg, var(--ifm-color-primary) 0%, var(--ifm-color-primary-light) 100%); + background: linear-gradient( + 90deg, + var(--ifm-color-primary) 0%, + var(--ifm-color-primary-light) 100% + ); height: 100%; border-radius: 10px; transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1); @@ -169,48 +173,48 @@ } /* Dark mode support */ -[data-theme='dark'] .tutorialProgress { +[data-theme="dark"] .tutorialProgress { background: #1e1e1e; border-color: #444; color: #e9ecef; } -[data-theme='dark'] .header h4 { +[data-theme="dark"] .header h4 { color: #e9ecef; } -[data-theme='dark'] .progressBar { +[data-theme="dark"] .progressBar { background: #444; } -[data-theme='dark'] .progressText { +[data-theme="dark"] .progressText { color: #e9ecef; text-shadow: 0 0 3px rgba(0, 0, 0, 0.8); } -[data-theme='dark'] .step:hover { +[data-theme="dark"] .step:hover { background: rgba(0, 123, 255, 0.15); } -[data-theme='dark'] .step.current { +[data-theme="dark"] .step.current { background: rgba(0, 123, 255, 0.2); } -[data-theme='dark'] .stepText strong { +[data-theme="dark"] .stepText strong { color: #e9ecef; } -[data-theme='dark'] .actions { +[data-theme="dark"] .actions { border-top-color: #444; } -[data-theme='dark'] .resetButton { +[data-theme="dark"] .resetButton { background: #2d2d2d; border-color: #444; color: #adb5bd; } -[data-theme='dark'] .resetButton:hover { +[data-theme="dark"] .resetButton:hover { background: #3d3d3d; border-color: #555; } @@ -251,7 +255,7 @@ border-radius: 4px; } -[data-theme='dark'] .compact { +[data-theme="dark"] .compact { background: var(--ifm-color-emphasis-200); border-color: var(--ifm-color-emphasis-300); } @@ -346,16 +350,16 @@ } /* Dark mode adjustments for minimized state */ -[data-theme='dark'] .minimized { +[data-theme="dark"] .minimized { background: var(--ifm-color-emphasis-200); border-color: var(--ifm-color-emphasis-400); } -[data-theme='dark'] .minimizedHeader:hover { +[data-theme="dark"] .minimizedHeader:hover { background: var(--ifm-color-emphasis-300); } -[data-theme='dark'] .minimizedBar { +[data-theme="dark"] .minimizedBar { background: var(--ifm-color-emphasis-400); } @@ -368,7 +372,7 @@ width: 100%; margin: 16px 0; } - + .minimized.scrolled { position: fixed; top: 60px; /* Account for mobile header */ diff --git a/docs-website/src/css/custom.css b/docs-website/src/css/custom.css index 0d842f3abdd266..9fb35fe41d9a34 100644 --- a/docs-website/src/css/custom.css +++ b/docs-website/src/css/custom.css @@ -58,4 +58,4 @@ body { 100% { background-position: 0% 50%; } -} \ No newline at end of file +} diff --git a/docs-website/src/css/mermaid-custom.css b/docs-website/src/css/mermaid-custom.css index c3c987c42ec84f..43a675279e3da2 100644 --- a/docs-website/src/css/mermaid-custom.css +++ b/docs-website/src/css/mermaid-custom.css @@ -12,7 +12,7 @@ } /* Dark mode adjustments */ -[data-theme='dark'] .mermaid { +[data-theme="dark"] .mermaid { background: var(--ifm-color-emphasis-100); border-color: var(--ifm-color-emphasis-300); box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3); @@ -55,8 +55,8 @@ text-shadow: 0 1px 2px rgba(255, 255, 255, 0.8); } -[data-theme='dark'] .mermaid .nodeLabel, 
-[data-theme='dark'] .mermaid .edgeLabel {
+[data-theme="dark"] .mermaid .nodeLabel,
+[data-theme="dark"] .mermaid .edgeLabel {
   text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8);
 }
 
@@ -65,7 +65,7 @@
   fill: var(--ifm-color-primary-lightest);
   stroke: var(--ifm-color-primary-light);
   stroke-width: 2px;
-  stroke-dasharray: 5,5;
+  stroke-dasharray: 5, 5;
   rx: 8px;
   ry: 8px;
 }
diff --git a/docs-website/src/learn/_components/LearnItemCard/index.jsx b/docs-website/src/learn/_components/LearnItemCard/index.jsx
index 9c6b6cfdc98d87..545557d9e494b7 100644
--- a/docs-website/src/learn/_components/LearnItemCard/index.jsx
+++ b/docs-website/src/learn/_components/LearnItemCard/index.jsx
@@ -6,7 +6,8 @@ import styles from "./styles.module.scss";
 
 export default function LearnItemCard() {
   const { metadata } = useBlogPost();
-  const { permalink, title, description, formattedDate, frontMatter } = metadata;
+  const { permalink, title, description, formattedDate, frontMatter } =
+    metadata;
 
   return (
@@ -23,8 +24,10 @@ export default function LearnItemCard() {
           {description}
-          Published on {formattedDate}
+
+            Published on {formattedDate}
+
   );
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/learn/_components/LearnItemCard/styles.module.scss b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss
index 2bfaabdc06d498..35a5c93c348c26 100644
--- a/docs-website/src/learn/_components/LearnItemCard/styles.module.scss
+++ b/docs-website/src/learn/_components/LearnItemCard/styles.module.scss
@@ -50,4 +50,4 @@
     width: 100%;
     height: auto;
   }
-}
\ No newline at end of file
+}
diff --git a/docs-website/src/learn/_components/LearnListPage/index.jsx b/docs-website/src/learn/_components/LearnListPage/index.jsx
index 1ceec9afa1e8a3..4fa75be98dc62f 100644
--- a/docs-website/src/learn/_components/LearnListPage/index.jsx
+++ b/docs-website/src/learn/_components/LearnListPage/index.jsx
@@ -2,7 +2,11 @@ import React, { useState } from "react";
 import clsx from "clsx";
 import useDocusaurusContext from "@docusaurus/useDocusaurusContext";
-import { PageMetadata, HtmlClassNameProvider, ThemeClassNames } from "@docusaurus/theme-common";
+import {
+  PageMetadata,
+  HtmlClassNameProvider,
+  ThemeClassNames,
+} from "@docusaurus/theme-common";
 import BlogListPaginator from "@theme/BlogListPaginator";
 import SearchMetadata from "@theme/SearchMetadata";
 import { BlogPostProvider } from "@docusaurus/theme-common/internal";
@@ -30,10 +34,20 @@ function BlogListPageContent(props) {
   const { metadata, items } = props;
   const [activeFilters, setActiveFilters] = useState([]);
   // These are currently hardcoded, check the frontmatter of the blog posts to see what audiences are available
-  const audiences = ["Data Governance Leads", "Data Engineers", "Data Architects", "Data Platform Leads", "Data Analysts"];
+  const audiences = [
+    "Data Governance Leads",
+    "Data Engineers",
+    "Data Architects",
+    "Data Platform Leads",
+    "Data Analysts",
+  ];
 
   const filteredItems = activeFilters?.length
-    ? (items || []).filter((post) => activeFilters.some((activeFilter) => post?.content?.frontMatter?.audience?.some((a) => a === activeFilter)))
+    ? (items || []).filter((post) =>
+        activeFilters.some((activeFilter) =>
+          post?.content?.frontMatter?.audience?.some((a) => a === activeFilter),
+        ),
+      )
     : items;
 
   const handleFilterToggle = (audience) => {
@@ -51,14 +65,19 @@ function BlogListPageContent(props) {
             DataHub Learn
-            Learn about the hot topics in the data ecosystem and how DataHub can help you with your data journey.
+
+              Learn about the hot topics in the data ecosystem and how DataHub
+              can help you with your data journey.
+
             For: {audiences.map((audience) => (