Improve clarity of diagnosis example (#117)

jmsfltchr · web-flow · commit 6b028b5ed0af · 2019-12-16T16:04:34.000Z
## What is the goal of this PR?

Reformat the diagnosis example to make it as clear as possible to users how they can build their own KGCN.

## What are the changes implemented in this PR?

- Move main parameters to be easily visible
- Move out generic util methods for retrieving types and roles
- Improve docstrings
- Update some docstrings from the outdated `:param:`/`:return:` format to Google format
diff --git a/kglib/kgcn/README.md b/kglib/kgcn/README.md
@@ -1,6 +1,6 @@
 # Knowledge Graph Convolutional Networks
 
-This project introduces a novel model: the *Knowledge Graph Convolutional Network* (KGCN). This work is in its second major iteration since inception.
+This project introduces a novel model: the *Knowledge Graph Convolutional Network* (KGCN).
 
 ### Getting Started - Running the Machine Learning Pipeline
 
diff --git a/kglib/kgcn/examples/diagnosis/BUILD b/kglib/kgcn/examples/diagnosis/BUILD
@@ -26,6 +26,7 @@ py_library(
         "//kglib/kgcn/plot",
         "//kglib/kgcn/models",
         "//kglib/utils/grakn/synthetic",
+        "//kglib/utils/grakn/type",
         "@graknlabs_client_python//:client_python",
     ],
     visibility=['//visibility:public']
diff --git a/kglib/kgcn/examples/diagnosis/diagnosis.py b/kglib/kgcn/examples/diagnosis/diagnosis.py
@@ -25,16 +25,58 @@
 
 from kglib.kgcn.pipeline.pipeline import pipeline
 from kglib.utils.grakn.synthetic.examples.diagnosis.generate import generate_example_graphs
+from kglib.utils.grakn.type.type import get_thing_types, get_role_types
 from kglib.utils.graph.iterate import multidigraph_data_iterator
 from kglib.utils.graph.query.query_graph import QueryGraph
 from kglib.utils.graph.thing.queries_to_graph import build_graph_from_queries
 
+KEYSPACE = "diagnosis"
+URI = "localhost:48555"
+
+# Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
+PREEXISTS = dict(solution=0)
+
+# Candidates are neither present in the input nor in the solution, they are negative samples
+CANDIDATE = dict(solution=1)
+
+# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive samples
+TO_INFER = dict(solution=2)
+
+# Categorical Attribute types and the values of their categories
+CATEGORICAL_ATTRIBUTES = {'name': ['Diabetes Type II', 'Multiple Sclerosis', 'Blurred vision', 'Fatigue', 'Cigarettes',
+                                   'Alcohol']}
+# Continuous Attribute types and their min and max values
+CONTINUOUS_ATTRIBUTES = {'severity': (0, 1), 'age': (7, 80), 'units-per-week': (3, 29)}
+
+TYPES_TO_IGNORE = ['candidate-diagnosis', 'example-id', 'probability-exists', 'probability-non-exists', 'probability-preexists']
+ROLES_TO_IGNORE = ['candidate-patient', 'candidate-diagnosed-disease']
+
+# The learner should see candidate relations the same as the ground truth relations, so adjust these candidates to
+# look like their ground truth counterparts
+TYPES_AND_ROLES_TO_OBFUSCATE = {'candidate-diagnosis': 'diagnosis',
+                                'candidate-patient': 'patient',
+                                'candidate-diagnosed-disease': 'diagnosed-disease'}
+
 
 def diagnosis_example(num_graphs=200,
                       num_processing_steps_tr=5,
                       num_processing_steps_ge=5,
                       num_training_iterations=1000,
-                      keyspace="diagnosis", uri="localhost:48555"):
+                      keyspace=KEYSPACE, uri=URI):
+    """
+    Run the diagnosis example from start to finish, including traceably ingesting predictions back into Grakn
+
+    Args:
+        num_graphs: Number of graphs to use for training and testing combined
+        num_processing_steps_tr: The number of message-passing steps for training
+        num_processing_steps_ge: The number of message-passing steps for testing
+        num_training_iterations: The number of training epochs
+        keyspace: The name of the keyspace to retrieve example subgraphs from
+        uri: The uri of the running Grakn instance
+
+    Returns:
+        Final accuracies for training and for testing
+    """
 
     tr_ge_split = int(num_graphs*0.5)
 
@@ -48,7 +90,10 @@ def diagnosis_example(num_graphs=200,
     with session.transaction().read() as tx:
         # Change the terminology here onwards from thing -> node and role -> edge
         node_types = get_thing_types(tx)
+        [node_types.remove(el) for el in TYPES_TO_IGNORE]
+
         edge_types = get_role_types(tx)
+        [edge_types.remove(el) for el in ROLES_TO_IGNORE]
         print(f'Found node types: {node_types}')
         print(f'Found edge types: {edge_types}')
 
@@ -72,12 +117,17 @@ def diagnosis_example(num_graphs=200,
     return solveds_tr, solveds_ge
 
 
-CATEGORICAL_ATTRIBUTES = {'name': ['Diabetes Type II', 'Multiple Sclerosis', 'Blurred vision', 'Fatigue', 'Cigarettes',
-                                   'Alcohol']}
-CONTINUOUS_ATTRIBUTES = {'severity': (0, 1), 'age': (7, 80), 'units-per-week': (3, 29)}
+def create_concept_graphs(example_indices, grakn_session):
+    """
+    Builds an in-memory graph for each example, with an example_id as an anchor for each example subgraph.
+    Args:
+        example_indices: The values used to anchor the subgraph queries within the entire knowledge graph
+        grakn_session: Grakn Session
 
+    Returns:
+        In-memory graphs of Grakn subgraphs
+    """
 
-def create_concept_graphs(example_indices, grakn_session):
     graphs = []
     infer = True
 
@@ -90,37 +140,28 @@ def create_concept_graphs(example_indices, grakn_session):
 
         # Remove label leakage - change type labels that indicate candidates into non-candidates
         for data in multidigraph_data_iterator(graph):
-            typ = data['type']
-            if typ == 'candidate-diagnosis':
-                data.update(type='diagnosis')
-            elif typ == 'candidate-patient':
-                data.update(type='patient')
-            elif typ == 'candidate-diagnosed-disease':
-                data.update(type='diagnosed-disease')
+            for label_to_obfuscate, with_label in TYPES_AND_ROLES_TO_OBFUSCATE.items():
+                if data['type'] == label_to_obfuscate:
+                    data.update(type=with_label)
+                    break
 
         graph.name = example_id
         graphs.append(graph)
 
     return graphs
 
 
-# Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist
-PREEXISTS = dict(solution=0)
-
-# Candidates are neither present in the input nor in the solution, they are negative samples
-CANDIDATE = dict(solution=1)
-
-# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive samples
-TO_INFER = dict(solution=2)
-
-
 def get_query_handles(example_id):
     """
-    1. Supply a query
-    2. Supply a `QueryGraph` object to represent that query. That itself is a subclass of a networkx graph
-    3. Execute the query
-    4. Make a graph of the query results by taking the variables you got back and arranging the concepts as they are in the `QueryGraph`. This gives one graph for each result, for each query.
-    5. Combine all of these graphs into one single graph, and that’s your example subgraph
+    Creates an iterable, each element containing a Graql query, a function to sample the answers, and a QueryGraph
+    object which must be the Grakn graph representation of the query. This tuple is termed a "query_handle"
+
+    Args:
+        example_id: A uniquely identifiable attribute value used to anchor the results of the queries to a specific
+                    subgraph
+
+    Returns:
+        query handles
     """
 
     # === Hereditary Feature ===
@@ -165,7 +206,6 @@ def get_query_handles(example_id):
             $p isa person, has example-id {example_id}, has age $a; 
             get;''')
 
-
     vars = p, a = 'p', 'a'
     g = QueryGraph()
     g.add_vars(*vars, **PREEXISTS)
@@ -248,48 +288,6 @@ def get_query_handles(example_id):
     ]
 
 
-def get_thing_types(tx):
-    """
-    Get all schema types, excluding those for implicit attribute relations, base types, and candidate types
-    Args:
-        tx: Grakn transaction
-
-    Returns:
-        Grakn types
-    """
-    schema_concepts = tx.query(
-        "match $x sub thing; "
-        "not {$x sub @has-attribute;}; "
-        "not {$x sub @key-attribute;}; "
-        "get;")
-    thing_types = [schema_concept.get('x').label() for schema_concept in schema_concepts]
-    [thing_types.remove(el) for el in
-     ['thing', 'relation', 'entity', 'attribute', 'candidate-diagnosis', 'example-id', 'probability-exists',
-      'probability-non-exists', 'probability-preexists']]
-    return thing_types
-
-
-def get_role_types(tx):
-    """
-    Get all schema roles, excluding those for implicit attribute relations, the base role type, and candidate roles
-    Args:
-        tx: Grakn transaction
-
-    Returns:
-        Grakn roles
-    """
-    schema_concepts = tx.query(
-        "match $x sub role; "
-        "not{$x sub @key-attribute-value;}; "
-        "not{$x sub @key-attribute-owner;}; "
-        "not{$x sub @has-attribute-value;}; "
-        "not{$x sub @has-attribute-owner;};"
-        "get;")
-    role_types = ['has'] + [role.get('x').label() for role in schema_concepts]
-    [role_types.remove(el) for el in ['role', 'candidate-patient', 'candidate-diagnosed-disease']]
-    return role_types
-
-
 def write_predictions_to_grakn(graphs, tx):
     """
     Take predictions from the ML model, and insert representations of those predictions back into the graph.
diff --git a/kglib/kgcn/pipeline/utils.py b/kglib/kgcn/pipeline/utils.py
@@ -23,8 +23,11 @@ def duplicate_edges_in_reverse(graph):
     Takes in a directed multi graph, and creates duplicates of all edges, the duplicates having reversed direction to
     the originals. This is useful since directed edges constrain the direction of messages passed. We want to permit
     omni-directional message passing.
-    :param graph: The graph
-    :return: The graph with duplicated edges, reversed, with all original edge properties attached to the duplicates
+    Args:
+        graph: The graph
+
+    Returns:
+        The graph with duplicated edges, reversed, with all original edge properties attached to the duplicates
     """
     for sender, receiver, keys, data in graph.edges(data=True, keys=True):
         graph.add_edge(receiver, sender, keys, **data)
diff --git a/kglib/utils/grakn/BUILD b/kglib/utils/grakn/BUILD
@@ -8,6 +8,7 @@ py_library(
         '//kglib/utils/grakn/test',
         '//kglib/utils/grakn/object',
         '//kglib/utils/grakn/synthetic',
+        '//kglib/utils/grakn/type',
     ],
     visibility=['//visibility:public']
 )
diff --git a/kglib/utils/grakn/type/BUILD b/kglib/utils/grakn/type/BUILD
@@ -0,0 +1,11 @@
+load("@io_bazel_rules_python//python:python.bzl", "py_library")
+load("@pypi_dependencies//:requirements.bzl", "requirement")
+
+
+py_library(
+    name = "type",
+    srcs = [
+        'type.py',
+    ],
+    visibility=['//visibility:public']
+)
diff --git a/kglib/utils/grakn/type/type.py b/kglib/utils/grakn/type/type.py
@@ -0,0 +1,58 @@
+#
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+
+
+def get_thing_types(tx):
+    """
+    Get all schema types, excluding those for implicit attribute relations and base types
+    Args:
+        tx: Grakn transaction
+
+    Returns:
+        Grakn types
+    """
+    schema_concepts = tx.query(
+        "match $x sub thing; "
+        "not {$x sub @has-attribute;}; "
+        "not {$x sub @key-attribute;}; "
+        "get;")
+    thing_types = [schema_concept.get('x').label() for schema_concept in schema_concepts]
+    [thing_types.remove(el) for el in ['thing', 'relation', 'entity', 'attribute']]
+    return thing_types
+
+
+def get_role_types(tx):
+    """
+    Get all schema roles, excluding those for implicit attribute relations and the base role type
+    Args:
+        tx: Grakn transaction
+
+    Returns:
+        Grakn roles
+    """
+    schema_concepts = tx.query(
+        "match $x sub role; "
+        "not{$x sub @key-attribute-value;}; "
+        "not{$x sub @key-attribute-owner;}; "
+        "not{$x sub @has-attribute-value;}; "
+        "not{$x sub @has-attribute-owner;};"
+        "get;")
+    role_types = ['has'] + [role.get('x').label() for role in schema_concepts]
+    role_types.remove('role')
+    return role_types
diff --git a/kglib/utils/graph/thing/queries_to_graph.py b/kglib/utils/graph/thing/queries_to_graph.py
@@ -29,18 +29,26 @@ def concept_dict_from_concept_map(concept_map):
     """
     Given a concept map, build a dictionary of the variables present and the concepts they refer to, locally storing any
     information required about those concepts.
-    :param concept_map: A dict of Concepts provided by Grakn keyed by query variables
-    :return: A dictionary of concepts keyed by query variables
+
+    Args:
+        concept_map: A dict of Concepts provided by Grakn keyed by query variables
+
+    Returns:
+        A dictionary of concepts keyed by query variables
     """
     return {variable: build_thing(grakn_concept) for variable, grakn_concept in concept_map.map().items()}
 
 
 def combine_2_graphs(graph1, graph2):
     """
     Combine two graphs into one. Do this by recognising common nodes between the two.
-    :param graph1: Graph to compare
-    :param graph2: Graph to compare
-    :return: Combined graph
+
+    Args:
+        graph1: Graph to compare
+        graph2: Graph to compare
+
+    Returns:
+        Combined graph
     """
 
     for node, data in graph1.nodes(data=True):
@@ -67,8 +75,12 @@ def combine_2_graphs(graph1, graph2):
 def combine_n_graphs(graphs_list):
     """
     Combine N graphs into one. Do this by recognising common nodes between the two.
-    :param graphs_list: List of graphs to combine
-    :return: Combined graph
+
+    Args:
+        graphs_list: List of graphs to combine
+
+    Returns:
+        Combined graph
     """
     return reduce(lambda x, y: combine_2_graphs(x, y), graphs_list)
 
@@ -78,14 +90,19 @@ def build_graph_from_queries(query_sampler_variable_graph_tuples, grakn_transact
     """
     Builds a graph of Things, interconnected by roles (and *has*), from a set of queries and graphs representing those
     queries (variable graphs)of those queries, over a Grakn transaction
-    :param infer:
-    :param query_sampler_variable_graph_tuples: A list of tuples, each tuple containing a query, a sampling function,
-    and a variable_graph
-    :param grakn_transaction: A Grakn transaction
-    :param concept_dict_converter: The function to use to convert from concept_dicts to a Grakn model. This could be
-    a typical model or a mathematical model
-    :return: A networkx graph
+
+    Args:
+        infer: whether to use Grakn's inference engine
+        query_sampler_variable_graph_tuples: A list of tuples, each tuple containing a query, a sampling function,
+            and a variable_graph
+        grakn_transaction: A Grakn transaction
+        concept_dict_converter: The function to use to convert from concept_dicts to a Grakn model. This could be
+            a typical model or a mathematical model
+
+    Returns:
+        A networkx graph
     """
+
     query_concept_graphs = []
 
     for query, sampler, variable_graph in query_sampler_variable_graph_tuples:

Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ py_library(`
`8`	`8`	`'//kglib/utils/grakn/test',`
`9`	`9`	`'//kglib/utils/grakn/object',`
`10`	`10`	`'//kglib/utils/grakn/synthetic',`
	`11`	`+ '//kglib/utils/grakn/type',`
`11`	`12`	`],`
`12`	`13`	`visibility=['//visibility:public']`
`13`	`14`	`)`