Commits
38 commits
5e6471b
feat(rdf): add RDF ingestion source with simplified architecture
stephengoldbaum Dec 9, 2025
f39b974
Remove relationship logic from glossary_term entity
stephengoldbaum Dec 10, 2025
2c75c4f
Bringing RDF ingestion to DataHub standards compliance
stephengoldbaum Dec 10, 2025
807b3eb
Refactor RDF ingestion source for improved error handling and perform…
stephengoldbaum Dec 10, 2025
ffea4d0
Enhance RDF source configuration with helper text for improved user g…
stephengoldbaum Dec 11, 2025
48171a9
Add RDF capabilities to capability summary
stephengoldbaum Dec 11, 2025
200b85b
fix(ingestion): update warning filters and add RDF dependency
stephengoldbaum Dec 12, 2025
e796c34
feat(rdf): add documentation and update type hints for context parame…
stephengoldbaum Dec 12, 2025
470f634
chore(setup): add RDF dependency to base and test requirements
stephengoldbaum Dec 12, 2025
5ee8089
Update capability_summary.json with RDF plugin
stephengoldbaum Dec 13, 2025
07913ed
Update RDF documentation link and ensure newline at end of capability…
stephengoldbaum Dec 13, 2025
487e39a
Fix capability_summary.py to ensure consistent plugin ordering
stephengoldbaum Dec 13, 2025
bd20f53
Revert "Fix capability_summary.py to ensure consistent plugin ordering"
stephengoldbaum Dec 13, 2025
8f6c3da
Fix broken markdown links in RDF documentation
stephengoldbaum Dec 13, 2025
5f53936
Enhance RDF source with improved error handling and validation tests
stephengoldbaum Dec 15, 2025
7831233
Refactor RDF test cases for improved clarity and consistency
stephengoldbaum Dec 15, 2025
df16737
Enhance RDF test cases with improved validation and error handling
stephengoldbaum Dec 16, 2025
419f36a
Add RDF source configuration and enhance glossary term MCP creation
stephengoldbaum Dec 16, 2025
c70c3ad
feat(rdf): enhance RDF loader to support zip files and web folder URLs
stephengoldbaum Dec 17, 2025
f28d9c0
Restore capability_summary.json from master to include dataplex (will…
stephengoldbaum Dec 19, 2025
b3fa41a
Regenerate capability_summary.json to include RDF source
stephengoldbaum Dec 19, 2025
08e1431
feat(rdf): implement custom property extraction for FIBO dialect and …
stephengoldbaum Jan 6, 2026
da1c33f
feat(rdf): add support for provisional terms in FIBO dialect and conf…
stephengoldbaum Jan 6, 2026
8e7935d
feat(rdf): add integration and unit tests for FIBO dialect and NamedI…
stephengoldbaum Jan 6, 2026
9a68991
feat(tests): add example RDF maturity terms and configuration for pro…
stephengoldbaum Jan 6, 2026
96024d4
fix(rdf): update dialect handling in GlossaryTermExtractor and improv…
stephengoldbaum Jan 18, 2026
493a6b0
fix(rdf): refine SKOS concept handling in GenericDialect to allow OWL…
stephengoldbaum Jan 18, 2026
ad5fa52
fix(rdf): restrict rdflib version to ensure compatibility
stephengoldbaum Jan 18, 2026
c531592
fix(rdf): pin rdflib version to 6.3.2 for compatibility
stephengoldbaum Jan 18, 2026
9ab0d98
feat(tests): add comprehensive edge case tests for RDF dialects, load…
stephengoldbaum Jan 18, 2026
1ef4b99
fix(rdf): update RDF dependencies in setup.py to include requests and…
stephengoldbaum Jan 18, 2026
8dbfbf0
fix(docgen): enhance error handling for platform and plugin validatio…
stephengoldbaum Jan 18, 2026
545aad0
fix(docgen): improve handling of missing capability data in documenta…
stephengoldbaum Feb 1, 2026
aaa46c0
feat(metadata): update capability summary for ABS and Athena sources
stephengoldbaum Feb 1, 2026
d36906c
feat(tests): add comprehensive tests for RDF loader, EntityRegistry, …
stephengoldbaum Feb 1, 2026
f582f21
fix(tests): add type hints for EntityProcessor in comprehensive RDF t…
stephengoldbaum Feb 1, 2026
5eec34f
feat(tests): add comprehensive tests for RDF loader and URN generator
stephengoldbaum Feb 1, 2026
8c9e53d
fix(tests): add type ignores for invalid type checks in URN generator…
stephengoldbaum Feb 1, 2026
Files changed
@@ -225,6 +225,14 @@ import {
PRESTO_PASSWORD,
PRESTO_USERNAME,
} from '@app/ingestV2/source/builder/RecipeForm/presto';
import {
RDF_DIALECT,
RDF_ENVIRONMENT,
RDF_EXTENSIONS,
RDF_FORMAT,
RDF_RECURSIVE,
RDF_SOURCE,
} from '@app/ingestV2/source/builder/RecipeForm/rdf';
import {
REDSHIFT_DATABASE,
REDSHIFT_HOST_PORT,
@@ -298,6 +306,7 @@ import {
NOTION,
OKTA,
POWER_BI,
RDF,
SAC,
VERTICA,
} from '@app/ingestV2/source/builder/constants';
@@ -751,6 +760,13 @@ export const RECIPE_FIELDS: RecipeFields = {
],
hasDynamicFields: true,
},
[RDF]: {
fields: [RDF_SOURCE],
filterFields: [],
advancedFields: [RDF_FORMAT, RDF_EXTENSIONS, RDF_RECURSIVE, RDF_ENVIRONMENT, RDF_DIALECT],
connectionSectionTooltip: 'Configure the RDF source location and basic settings.',
advancedSectionTooltip: 'Advanced options for RDF format, file processing, and dialect selection.',
},
};

const ALL_CONNECTORS_WITH_FORM = Object.keys(RECIPE_FIELDS);
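For context: each field's fieldPath declares where its value lands in the generated recipe. A minimal sketch of that mapping, assuming a lodash-style set helper (the form builder's actual internals are not part of this diff):

import { set } from 'lodash';

import { RDF_FORMAT, RDF_SOURCE } from '@app/ingestV2/source/builder/RecipeForm/rdf';

// Write each field's value at its declared fieldPath to build the recipe object.
const recipe: Record<string, unknown> = { source: { type: 'rdf' } };
set(recipe, RDF_SOURCE.fieldPath, '/path/to/glossary.ttl'); // -> source.config.source
set(recipe, RDF_FORMAT.fieldPath, 'turtle'); // -> source.config.format
// recipe is now:
// { source: { type: 'rdf', config: { source: '/path/to/glossary.ttl', format: 'turtle' } } }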
@@ -0,0 +1,89 @@
import { FieldType, RecipeField } from '@app/ingestV2/source/builder/RecipeForm/common';

export const RDF_SOURCE: RecipeField = {
name: 'source',
label: 'Source',
helper: 'RDF source: file, folder, URL, or comma-separated files',
tooltip:
'Source to process: file path, folder path, server URL, or comma-separated files. Examples: /path/to/file.ttl, /path/to/folder, https://example.com/data.ttl, file1.ttl,file2.ttl',
type: FieldType.TEXT,
fieldPath: 'source.config.source',
placeholder: '/path/to/file.ttl or /path/to/folder or https://example.com/data.ttl',
required: true,
rules: null,
};

export const RDF_FORMAT: RecipeField = {
name: 'format',
label: 'RDF Format',
helper: 'RDF format (auto-detected if not specified)',
tooltip: 'RDF format (auto-detected if not specified). Examples: turtle, xml, n3, nt, json-ld',
type: FieldType.SELECT,
fieldPath: 'source.config.format',
placeholder: 'Auto-detect',
options: [
{ label: 'Auto-detect', value: '' },
{ label: 'Turtle', value: 'turtle' },
{ label: 'RDF/XML', value: 'xml' },
{ label: 'N3', value: 'n3' },
{ label: 'N-Triples', value: 'nt' },
{ label: 'JSON-LD', value: 'json-ld' },
],
rules: null,
};

export const RDF_EXTENSIONS: RecipeField = {
name: 'extensions',
label: 'File Extensions',
helper: 'File extensions to process when source is a folder',
tooltip: 'File extensions to process when source is a folder. Default: .ttl, .rdf, .owl, .n3, .nt',
type: FieldType.LIST,
fieldPath: 'source.config.extensions',
placeholder: '.ttl',
buttonLabel: 'Add extension',
rules: null,
};

export const RDF_RECURSIVE: RecipeField = {
name: 'recursive',
label: 'Recursive Folder Processing',
helper: 'Enable recursive folder processing when source is a folder',
tooltip: 'Enable recursive folder processing when source is a folder (default: true)',
type: FieldType.BOOLEAN,
fieldPath: 'source.config.recursive',
rules: null,
};

export const RDF_ENVIRONMENT: RecipeField = {
name: 'environment',
label: 'DataHub Environment',
helper: 'DataHub environment (PROD, DEV, TEST, etc.)',
tooltip: 'DataHub environment (PROD, DEV, TEST, etc.)',
type: FieldType.SELECT,
fieldPath: 'source.config.environment',
placeholder: 'PROD',
options: [
{ label: 'PROD', value: 'PROD' },
{ label: 'DEV', value: 'DEV' },
{ label: 'TEST', value: 'TEST' },
{ label: 'UAT', value: 'UAT' },
],
rules: null,
};

export const RDF_DIALECT: RecipeField = {
name: 'dialect',
label: 'RDF Dialect',
helper: 'Force a specific RDF dialect (default: auto-detect)',
tooltip: 'Force a specific RDF dialect (default: auto-detect). Options: default, fibo, generic',
type: FieldType.SELECT,
fieldPath: 'source.config.dialect',
placeholder: 'Auto-detect',
options: [
{ label: 'Auto-detect', value: '' },
{ label: 'Default', value: 'default' },
{ label: 'FIBO', value: 'fibo' },
{ label: 'Generic', value: 'generic' },
],
rules: null,
};
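All six fields above target the source.config subtree of the recipe, which a hypothetical unit test (not part of this PR) could assert directly:

import {
    RDF_DIALECT,
    RDF_ENVIRONMENT,
    RDF_EXTENSIONS,
    RDF_FORMAT,
    RDF_RECURSIVE,
    RDF_SOURCE,
} from '@app/ingestV2/source/builder/RecipeForm/rdf';

// Every RDF recipe field should write under source.config in the recipe.
const rdfFields = [RDF_SOURCE, RDF_FORMAT, RDF_EXTENSIONS, RDF_RECURSIVE, RDF_ENVIRONMENT, RDF_DIALECT];
rdfFields.forEach((field) => {
    if (!field.fieldPath.startsWith('source.config.')) {
        throw new Error(`Unexpected fieldPath for ${field.name}: ${field.fieldPath}`);
    }
});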
@@ -187,6 +187,8 @@ export const SNAPLOGIC = 'snaplogic';
export const SNAPLOGIC_URN = `urn:li:dataPlatform:${SNAPLOGIC}`;
export const FABRIC_ONELAKE = 'fabric-onelake';
export const FABRIC_ONELAKE_URN = `urn:li:dataPlatform:${FABRIC_ONELAKE}`;
export const RDF = 'rdf';
export const RDF_URN = `urn:li:dataPlatform:${RDF}`;

export const PLATFORM_URN_TO_LOGO = {
[AIRFLOW_URN]: airflowLogo,
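The template literal resolves to a fixed URN; a quick check (hypothetical, for reference only):

import { RDF_URN } from '@app/ingestV2/source/builder/constants';

// RDF_URN resolves to the same literal used by the quickstart entry below.
console.assert(RDF_URN === 'urn:li:dataPlatform:rdf');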
@@ -552,5 +552,13 @@
"recipe": "source:\n type: feast\n config:\n # Coordinates\n path: \"/path/to/repository/\"\n # Options\n environment: \"PROD\"",
"category": "ML Platforms",
"isPopular": false
},
{
"urn": "urn:li:dataPlatform:rdf",
"name": "rdf",
"displayName": "RDF",
"description": "Import glossary terms, term groups, and relationships from RDF/OWL ontologies (SKOS, Turtle, RDF/XML).",
"docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/rdf",
"recipe": "source:\n type: rdf\n config:\n source: path/to/glossary.ttl\n environment: PROD\n export_only:\n - glossary"
}
]
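For readability, the escaped recipe string in the new entry unescapes to the following YAML (nesting reconstructed conventionally, since the escaped string does not preserve exact indentation):

source:
  type: rdf
  config:
    source: path/to/glossary.ttl
    environment: PROD
    export_only:
      - glossary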
@@ -993,6 +993,46 @@ def test_airflow_plugin(
# Verify that no MCPs were generated.
assert not os.path.exists(airflow_instance.metadata_file)
else:
# Check if metadata file exists before trying to read it
if not os.path.exists(airflow_instance.metadata_file):
# Try to get more diagnostic information
print(
f"ERROR: Metadata file does not exist: {airflow_instance.metadata_file}"
)
print("Checking if DAG completed successfully...")
try:
api_version = get_api_version()
res = _make_api_request(
airflow_instance.session,
f"{airflow_instance.airflow_url}/api/{api_version}/dags/{dag_id}/dagRuns",
)
dag_runs = res.json()["dag_runs"]
if dag_runs:
dag_run = dag_runs[0]
print(f"DAG run state: {dag_run['state']}")
print(f"DAG run ID: {dag_run['dag_run_id']}")
except Exception as e:
print(f"Failed to get DAG run info: {e}")

# Check if the connection was configured correctly
print(f"Expected metadata file path: {airflow_instance.metadata_file}")
print(
f"Metadata file parent directory exists: {airflow_instance.metadata_file.parent.exists()}"
)
print(
f"Metadata file parent directory: {airflow_instance.metadata_file.parent}"
)
if airflow_instance.metadata_file.parent.exists():
print(
f"Files in metadata directory: {list(airflow_instance.metadata_file.parent.iterdir())}"
)

raise FileNotFoundError(
f"Metadata file not found: {airflow_instance.metadata_file}. "
f"This usually means the DataHub plugin did not emit any metadata. "
f"Check Airflow logs for plugin errors."
)

_sanitize_output_file(airflow_instance.metadata_file)

check_golden_file(