|
3 | 3 | from typing import Any |
4 | 4 |
|
5 | 5 | from pydantic import BaseModel, Field, ValidationInfo, field_validator |
| 6 | +from rdflib import Graph, Namespace, RDF, RDFS, OWL, XSD, Literal, URIRef |
6 | 7 |
|
7 | 8 | NODE_COLOR_PALETTE = [ |
8 | 9 | ("#e3f2fd", "#1976d2"), # Light Blue / Blue |
@@ -551,6 +552,189 @@ def to_arrows_dict(self) -> dict[str, Any]: |
def to_arrows_json_str(self) -> str:
    """Serialize the data model as an Arrows Data Model JSON string.

    The dictionary returned by ``to_arrows_dict`` is dumped as JSON with a
    two-space indent.
    """
    arrows_payload = self.to_arrows_dict()
    return json.dumps(arrows_payload, indent=2)
| 555 | + |
def to_owl_turtle_str(self) -> str:
    """
    Convert the data model to an OWL ontology serialized as Turtle.

    Mapping applied:
    - Node labels become OWL Classes.
    - A node's key property and its other properties become OWL
      DatatypeProperties with the node class as ``rdfs:domain`` and the
      mapped XSD datatype as ``rdfs:range``. The property type is
      upper-cased before lookup; types without a mapping fall back to
      ``xsd:string``.
    - Relationship types become OWL ObjectProperties with the start node
      class as ``rdfs:domain`` and the end node class as ``rdfs:range``.

    The conversion is lossy:
    - Relationship properties are NOT exported, because an OWL
      ObjectProperty cannot carry properties of its own.
    - The key property is emitted like any other property, so the ontology
      does not record which property was the key.

    NOTE: URIs are built from the bare property / relationship name, so
    same-named properties on different nodes (or same-typed relationships
    between different node pairs) share one URI and accumulate several
    ``rdfs:domain`` / ``rdfs:range`` statements.

    Returns:
        The result of serializing the graph with ``format="turtle"``.
    """
    g = Graph()

    # Generic namespace for every class and property of the ontology.
    base_ns = Namespace("http://voc.neo4j.com/datamodel#")
    g.bind("", base_ns)
    g.bind("owl", OWL)
    g.bind("rdfs", RDFS)
    g.bind("xsd", XSD)

    # Ontology declaration.
    ontology_uri = URIRef("http://voc.neo4j.com/datamodel")
    g.add((ontology_uri, RDF.type, OWL.Ontology))

    # Neo4j property types -> XSD datatypes.
    type_mapping = {
        "STRING": XSD.string,
        "INTEGER": XSD.integer,
        "FLOAT": XSD.float,
        "BOOLEAN": XSD.boolean,
        "DATE": XSD.date,
        "DATETIME": XSD.dateTime,
        "TIME": XSD.time,
        "DURATION": XSD.duration,
        "LONG": XSD.long,
        "DOUBLE": XSD.double,
    }

    # Nodes -> OWL Classes, node properties -> OWL DatatypeProperties.
    for node in self.nodes:
        class_uri = base_ns[node.label]
        g.add((class_uri, RDF.type, OWL.Class))

        # The key property (when present) is emitted first, followed by the
        # remaining properties; all of them are handled identically.
        node_props = [node.key_property] if node.key_property else []
        node_props.extend(node.properties)

        for prop in node_props:
            prop_uri = base_ns[prop.name]
            g.add((prop_uri, RDF.type, OWL.DatatypeProperty))
            g.add((prop_uri, RDFS.domain, class_uri))
            xsd_type = type_mapping.get(prop.type.upper(), XSD.string)
            g.add((prop_uri, RDFS.range, xsd_type))

    # Relationships -> OWL ObjectProperties. Relationship properties are
    # intentionally dropped here (see the docstring).
    for rel in self.relationships:
        rel_uri = base_ns[rel.type]
        g.add((rel_uri, RDF.type, OWL.ObjectProperty))
        g.add((rel_uri, RDFS.domain, base_ns[rel.start_node_label]))
        g.add((rel_uri, RDFS.range, base_ns[rel.end_node_label]))

    return g.serialize(format="turtle")
| 630 | + |
@classmethod
def from_owl_turtle_str(cls, owl_turtle_str: str) -> "DataModel":
    """
    Build a Neo4j data model from an OWL ontology given as Turtle text.

    Mapping applied:
    - Every named OWL Class becomes a Node whose label is the local name of
      the class URI (the part after the last ``#`` or ``/``).
    - Every OWL DatatypeProperty becomes a property of each named class
      listed as its ``rdfs:domain``. The first ``rdfs:range`` is translated
      from XSD to a Neo4j type; unknown or missing ranges become ``STRING``.
      Properties whose domain is not a declared class are dropped.
    - Every OWL ObjectProperty with at least one named domain and one named
      range becomes a Relationship from the first domain to the first range.

    The conversion is lossy:
    - OWL has no notion of a key property, so the first property found for
      a class is used as the key; a class without properties gets a
      synthetic ``<label lower-cased>Id`` STRING key.
    - Blank-node class expressions (classes, domains or ranges that are not
      URIs) are ignored because they have no usable label.
    - Additional domains / ranges of an ObjectProperty beyond the first are
      ignored.

    Nodes are emitted sorted by label so the result does not depend on set
    iteration order. The order of properties within a node follows the
    iteration order of the parsed graph.

    Args:
        owl_turtle_str: The ontology in Turtle syntax.

    Returns:
        A new instance of ``cls`` with the extracted nodes and relationships.

    Errors raised by the Turtle parser or by model validation are not
    caught here and propagate to the caller.
    """
    g = Graph()
    g.parse(data=owl_turtle_str, format="turtle")

    # XSD datatypes -> Neo4j property types (keyed by the URI string).
    xsd_to_neo4j = {
        str(XSD.string): "STRING",
        str(XSD.integer): "INTEGER",
        str(XSD.float): "FLOAT",
        str(XSD.boolean): "BOOLEAN",
        str(XSD.date): "DATE",
        str(XSD.dateTime): "DATETIME",
        str(XSD.time): "TIME",
        str(XSD.duration): "DURATION",
        str(XSD.long): "LONG",
        str(XSD.double): "DOUBLE",
    }

    def local_name(term: object) -> str:
        """Return the part of a URI after its last '#' or '/'."""
        return str(term).split("#")[-1].split("/")[-1]

    def named(terms) -> list:
        """Keep only URI terms; blank nodes carry no meaningful label."""
        return [term for term in terms if isinstance(term, URIRef)]

    # OWL Classes -> node labels. Sorting makes the node order reproducible
    # (iterating a set of strings varies with hash randomization).
    class_names = sorted(
        {local_name(c) for c in named(g.subjects(RDF.type, OWL.Class))}
    )

    # OWL DatatypeProperties -> (name, type) pairs grouped by domain class.
    props_by_class: dict[str, list[tuple[str, str]]] = {}
    for prop in g.subjects(RDF.type, OWL.DatatypeProperty):
        prop_name = local_name(prop)
        first_range = next(iter(g.objects(prop, RDFS.range)), None)
        if first_range is None:
            prop_type = "STRING"
        else:
            prop_type = xsd_to_neo4j.get(str(first_range), "STRING")

        # A property may declare several domains (the exporter of this model
        # does so when nodes share a property name): attach it to each one.
        # dict.fromkeys de-duplicates domains that share a local name.
        domain_names = dict.fromkeys(
            local_name(d) for d in named(g.objects(prop, RDFS.domain))
        )
        for domain_name in domain_names:
            if domain_name:
                props_by_class.setdefault(domain_name, []).append(
                    (prop_name, prop_type)
                )

    # Build the nodes.
    nodes = []
    for class_name in class_names:
        class_props = [
            Property(name=name, type=neo4j_type)
            for name, neo4j_type in props_by_class.get(class_name, [])
        ]
        if class_props:
            # No key information in OWL: the first property stands in.
            key_prop, *other_props = class_props
        else:
            key_prop = Property(name=f"{class_name.lower()}Id", type="STRING")
            other_props = []

        nodes.append(
            Node(
                label=class_name,
                key_property=key_prop,
                properties=other_props,
            )
        )

    # OWL ObjectProperties -> relationships (first named domain / range).
    relationships = []
    for prop in g.subjects(RDF.type, OWL.ObjectProperty):
        domains = named(g.objects(prop, RDFS.domain))
        ranges = named(g.objects(prop, RDFS.range))
        if not (domains and ranges):
            continue
        relationships.append(
            Relationship(
                type=local_name(prop),
                start_node_label=local_name(domains[0]),
                end_node_label=local_name(ranges[0]),
            )
        )

    return cls(nodes=nodes, relationships=relationships)
554 | 738 |
|
555 | 739 | def get_node_cypher_ingest_query_for_many_records(self, node_label: str) -> str: |
556 | 740 | "Generate a Cypher query to ingest a list of Node records into a Neo4j database." |
|
0 commit comments