This repository was archived by the owner on Nov 8, 2022. It is now read-only.

Commit d510f81

Author: Alon Eirew (committed)
minor changes and fixes in Cross Doc Coref API
1 parent bd37204 commit d510f81

9 files changed (+64 / -49 lines)

examples/cross_doc_coref/cross_doc_coref_sieves.py
Lines changed: 26 additions & 17 deletions

@@ -15,8 +15,11 @@
 # ******************************************************************************

 import logging
+from typing import List

 from nlp_architect import LIBRARY_ROOT
+from nlp_architect.common.cdc.cluster import Clusters
+from nlp_architect.common.cdc.topics import Topics
 from nlp_architect.data.cdc_resources.relations.relation_types_enums import RelationType
 from nlp_architect.models.cross_doc_coref.cdc_config import EventConfig, EntityConfig
 from nlp_architect.models.cross_doc_coref.cdc_resource import CDCResources
@@ -34,8 +37,8 @@ def run_example():
         (SieveType.RELAX, RelationType.SAME_HEAD_LEMMA_RELAX, 0.5),
     ]

-    event_config.gold_mentions_file = LIBRARY_ROOT + \
-        '/datasets/ecb/ecb_all_event_mentions.json'
+    event_config.gold_mentions = Topics(LIBRARY_ROOT
+                                        + '/datasets/ecb/ecb_all_event_mentions.json')

     entity_config = EntityConfig()

@@ -47,8 +50,8 @@ def run_example():
         (SieveType.VERY_RELAX, RelationType.REFERENT_DICT, 0.5)
     ]

-    entity_config.gold_mentions_file = LIBRARY_ROOT + \
-        '/datasets/ecb/ecb_all_entity_mentions.json'
+    entity_config.gold_mentions = Topics(LIBRARY_ROOT
+                                         + '/datasets/ecb/ecb_all_entity_mentions.json')

     # CDCResources hold default attribute values that might need to be change,
     # (using the defaults values in this example), use to configure attributes
@@ -68,19 +71,25 @@ def run_example():
     entity_clusters = run_entity_coref(resources)

     print('-=Cross Document Coref Results=-')
-    print('-=Event Clusters Mentions=-')
-    for event_cluster in event_clusters.clusters_list:
-        print(event_cluster.coref_chain)
-        for event_mention in event_cluster.mentions:
-            print(event_mention.mention_id)
-            print(event_mention.tokens_str)
-
-    print('-=Entity Clusters Mentions=-')
-    for entity_cluster in entity_clusters.clusters_list:
-        print(entity_cluster.coref_chain)
-        for entity_mention in entity_cluster.mentions:
-            print(entity_mention.mention_id)
-            print(entity_mention.tokens_str)
+    print_results(event_clusters, 'Event')
+    print('################################')
+    print_results(entity_clusters, 'Entity')
+
+
+def print_results(clusters: List[Clusters], type: str):
+    print('-=' + type + ' Clusters=-')
+    for topic_cluster in clusters:
+        print('\n\tCluster Topic=' + topic_cluster.topic_id)
+        for cluster in topic_cluster.clusters_list:
+            cluster_mentions = list()
+            for mention in cluster.mentions:
+                mentions_dict = dict()
+                mentions_dict['id'] = mention.mention_id
+                mentions_dict['text'] = mention.tokens_str
+                cluster_mentions.append(mentions_dict)
+
+            print('\t\tCluster(' + str(cluster.coref_chain) + ') Mentions='
+                  + str(cluster_mentions))


 if __name__ == '__main__':
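
For context, the reworked print_results walks the per-topic structure that run_event_coref and run_entity_coref now return. A minimal sketch of that traversal, assuming event_clusters is the List[Clusters] value produced by run_event_coref in the example above:

# Sketch only: 'event_clusters' is assumed to come from run_event_coref(resources).
for topic_clusters in event_clusters:            # one Clusters object per topic
    print(topic_clusters.topic_id)               # topic the clusters were built from
    for cluster in topic_clusters.clusters_list:
        # coref_chain is the shared chain id; tokens_str is the mention surface text
        print(cluster.coref_chain,
              [mention.tokens_str for mention in cluster.mentions])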

nlp_architect/common/cdc/cluster.py
Lines changed: 2 additions & 1 deletion

@@ -65,14 +65,15 @@ def get_cluster_id(self) -> str:
 class Clusters(object):
     cluster_coref_chain = 1000

-    def __init__(self, mentions: List[MentionData] = None) -> None:
+    def __init__(self, topic_id: str, mentions: List[MentionData] = None) -> None:
         """

         Args:
             mentions: ``list[MentionData]``, required
                 The initial mentions to create the clusters from
         """
         self.clusters_list = []
+        self.topic_id = topic_id
         self.set_initial_clusters(mentions)

     def set_initial_clusters(self, mentions: List[MentionData]) -> None:
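
Because topic_id is now the first positional argument, code that builds Clusters directly must pass it explicitly. A minimal sketch, assuming topic is one element of Topics(...).topics_list as used by run_sieve_system.py further down:

from nlp_architect.common.cdc.cluster import Clusters

# 'topic' is assumed to be a Topic carrying topic_id and mentions attributes.
clusters = Clusters(topic.topic_id, topic.mentions)
print(clusters.topic_id)   # the topic id is now kept on the Clusters instance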

nlp_architect/common/cdc/topics.py
Lines changed: 2 additions & 2 deletions

@@ -37,9 +37,9 @@ def __init__(self, mentions_file_path: str) -> None:
         Args:
             mentions_file_path: this topic mentions json file
         """
-        self.topics_list = self.load_gold_mentions(mentions_file_path)
+        self.topics_list = self.load_gold_mentions_from_file(mentions_file_path)

-    def load_gold_mentions(self, mentions_file_path: str) -> List[Topic]:
+    def load_gold_mentions_from_file(self, mentions_file_path: str) -> List[Topic]:
         start_data_load = time.time()
         logger.info('Loading mentions from-%s', mentions_file_path)
         mentions = load_json_file(mentions_file_path)
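
The public behaviour is unchanged here: the constructor still takes the mentions JSON path, and only the internal loader was renamed. A minimal usage sketch, assuming the ECB mentions file shipped with the library:

from nlp_architect import LIBRARY_ROOT
from nlp_architect.common.cdc.topics import Topics

topics = Topics(LIBRARY_ROOT + '/datasets/ecb/ecb_all_event_mentions.json')
for topic in topics.topics_list:
    # topic_id and mentions are the attributes the coref sieves rely on
    print(topic.topic_id, len(topic.mentions))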

nlp_architect/data/cdc_resources/wikipedia/wiki_online.py
Lines changed: 5 additions & 2 deletions

@@ -15,6 +15,7 @@
 # ******************************************************************************

 import os
+import logging

 from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page import WikipediaPage
 from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import \
@@ -27,6 +28,8 @@
 DISAMBIGUATE_PAGE = ['wikimedia disambiguation page', 'wikipedia disambiguation page']
 NAME_DESCRIPTIONS = ['given name', 'first name', 'family name']

+logger = logging.getLogger(__name__)
+

 class WikiOnline(object):
     def __init__(self):
@@ -52,7 +55,7 @@ def get_pages(self, phrase):
                 full_page = self.get_wiki_page_with_items(phrase, page_result)
                 ret_pages.add(WikipediaSearchPageResult(appr, full_page))
             except Exception as e:
-                print(e)
+                logger.error(e)

         self.cache[phrase] = ret_pages
         return ret_pages
@@ -73,7 +76,7 @@ def get_wiki_page_with_items(self, phrase, page):

         ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid, description, relations)

-        print('Page:' + str(ret_page) + ". Extracted successfully")
+        logger.debug('Page:' + str(ret_page) + ". Extracted successfully")

         return ret_page
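
Since the module now reports through the standard logging package instead of print, its messages are only visible once logging is configured; a minimal sketch using the standard library:

import logging

# DEBUG level is needed to see the per-page 'Extracted successfully' messages;
# failures inside get_pages are emitted at ERROR level.
logging.basicConfig(level=logging.DEBUG)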

nlp_architect/models/cross_doc_coref/cdc_config.py
Lines changed: 16 additions & 15 deletions

@@ -16,6 +16,7 @@
 from typing import List, Tuple

 from nlp_architect import LIBRARY_ROOT
+from nlp_architect.common.cdc.topics import Topics
 from nlp_architect.data.cdc_resources.relations.relation_types_enums import RelationType
 from nlp_architect.models.cross_doc_coref.system.sieves.sieves import SieveType

@@ -26,20 +27,20 @@ def __init__(self):

         self.__sieves_order = None
         self.__run_evaluation = False
-        self.__gold_mentions_file = None
+        self.__gold_mentions = None

     @property
     def sieves_order(self):
         """
         Sieve definition and Sieve running order

-        Tuple[SieveType, RelationType, Threshold(float)] - define sieves to run, were
+        Tuple[SieveType, RelationType, Threshold(float)] - define sieves to run, were

-        Strict- Merge clusters only in case all mentions has current relation between them,
-        Relax- Merge clusters in case (matched mentions) / len(cluster_1.mentions)) >= thresh,
-        Very_Relax- Merge clusters in case (matched mentions) / (all possible pairs) >= thresh
+        Strict- Merge clusters only in case all mentions has current relation between them,
+        Relax- Merge clusters in case (matched mentions) / len(cluster_1.mentions)) >= thresh,
+        Very_Relax- Merge clusters in case (matched mentions) / (all possible pairs) >= thresh

-        RelationType represent the type of sieve to run.
+        RelationType represent the type of sieve to run.

         """
         return self.__sieves_order
@@ -58,13 +59,13 @@ def run_evaluation(self, run_evaluation: bool):
         self.__run_evaluation = run_evaluation

     @property
-    def gold_mentions_file(self):
+    def gold_mentions(self) -> Topics:
         """Mentions file to run against"""
-        return self.__gold_mentions_file
+        return self.__gold_mentions

-    @gold_mentions_file.setter
-    def gold_mentions_file(self, gold_file):
-        self.__gold_mentions_file = gold_file
+    @gold_mentions.setter
+    def gold_mentions(self, gold_mentions_topics: Topics):
+        self.__gold_mentions = gold_mentions_topics


 class EventConfig(CDCConfig):
@@ -91,8 +92,8 @@ def __init__(self):
             (SieveType.STRICT, RelationType.WORDNET_DERIVATIONALLY, 0.0)
         ]

-        self.gold_mentions_file = LIBRARY_ROOT + \
-            '/datasets/ecb/ecb_all_event_mentions.json'
+        self.gold_mentions = Topics(LIBRARY_ROOT
+                                    + '/datasets/ecb/ecb_all_event_mentions.json')


 class EntityConfig(CDCConfig):
@@ -119,5 +120,5 @@ def __init__(self):
             (SieveType.VERY_RELAX, RelationType.REFERENT_DICT, 0.5)
         ]

-        self.gold_mentions_file = LIBRARY_ROOT + \
-            '/datasets/ecb/ecb_all_entity_mentions.json'
+        self.gold_mentions = (LIBRARY_ROOT
+                              + '/datasets/ecb/ecb_all_entity_mentions.json')
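
With the property retyped from a file path to a Topics object, configuration code now assigns a pre-loaded Topics instance; a minimal sketch mirroring the EventConfig default above:

from nlp_architect import LIBRARY_ROOT
from nlp_architect.common.cdc.topics import Topics
from nlp_architect.models.cross_doc_coref.cdc_config import EventConfig

event_config = EventConfig()
# gold_mentions now expects a Topics object rather than a path string.
event_config.gold_mentions = Topics(
    LIBRARY_ROOT + '/datasets/ecb/ecb_all_event_mentions.json')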

nlp_architect/models/cross_doc_coref/system/cdc_settings.py
Lines changed: 2 additions & 3 deletions

@@ -16,7 +16,6 @@

 import logging

-from nlp_architect.common.cdc.topics import Topics
 from nlp_architect.data.cdc_resources.relations.computed_relation_extraction import \
     ComputedRelationExtraction
 from nlp_architect.data.cdc_resources.relations.referent_dict_relation_extraction import \
@@ -53,9 +52,9 @@ def __init__(self, resources, event_coref_config, entity_coref_config):
         self.load_modules()

         if event_coref_config.run_evaluation:
-            self.events_topics = Topics(event_coref_config.gold_mentions_file)
+            self.events_topics = event_coref_config.gold_mentions
         if entity_coref_config.run_evaluation:
-            self.entity_topics = Topics(entity_coref_config.gold_mentions_file)
+            self.entity_topics = entity_coref_config.gold_mentions
         if not self.events_topics and not self.entity_topics:
             raise Exception('No entity or events Gold topics loaded!')

nlp_architect/models/cross_doc_coref/system/sieves/run_sieve_system.py
Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def __init__(self, topic):
         self.results_dict = dict()
         self.results_ordered = []
         logger.info('loading topic %s, total mentions: %d', topic.topic_id, len(topic.mentions))
-        self.clusters = Clusters(topic.mentions)
+        self.clusters = Clusters(topic.topic_id, topic.mentions)

     @staticmethod
     def set_sieves_from_config(config, get_rel_extraction):

nlp_architect/models/cross_doc_sieves.py
Lines changed: 9 additions & 7 deletions

@@ -15,6 +15,7 @@
 # ******************************************************************************
 import logging
 import os
+from typing import List

 from nlp_architect.common.cdc.cluster import Clusters
 from nlp_architect.models.cross_doc_coref.system.cdc_settings import CDCSettings
@@ -27,7 +28,7 @@
 logger = logging.getLogger(__name__)


-def run_event_coref(resources: CDCSettings) -> Clusters:
+def run_event_coref(resources: CDCSettings) -> List[Clusters]:
     """
     Running Cross Document Coref on event mentions
     Args:
@@ -37,12 +38,12 @@ def run_event_coref(resources: CDCSettings) -> Clusters:
         Clusters: List of clusters and mentions with predicted cross doc coref within each topic
     """
     io.create_folder(resources.cdc_resources.eval_output_dir)
+    event_clusters_list = list()
     for topic in resources.events_topics.topics_list:
         sieves_list_event = RunSystemsEvent(topic, resources)
         clusters = sieves_list_event.run_deterministic()
-
         clusters.set_coref_chain_to_mentions()
-
+        event_clusters_list.append(clusters)
     with open(os.path.join(
             resources.cdc_resources.eval_output_dir, 'event_clusters.txt'), 'w') \
             as event_clusters_file:
@@ -51,10 +52,10 @@ def run_event_coref(resources: CDCSettings) -> Clusters:
     logger.info('Write event coref results')
     write_event_coref_scorer_results(resources.events_topics.topics_list,
                                      resources.cdc_resources.eval_output_dir)
-    return clusters
+    return event_clusters_list


-def run_entity_coref(resources: CDCSettings) -> Clusters:
+def run_entity_coref(resources: CDCSettings) -> List[Clusters]:
     """
     Running Cross Document Coref on Entity mentions
     Args:
@@ -64,11 +65,12 @@ def run_entity_coref(resources: CDCSettings) -> Clusters:
         Clusters: List of topics and mentions with predicted cross doc coref within each topic
     """
     io.create_folder(resources.cdc_resources.eval_output_dir)
+    entity_clusters_list = list()
     for topic in resources.entity_topics.topics_list:
         sieves_list_entity = RunSystemsEntity(topic, resources)
         clusters = sieves_list_entity.run_deterministic()
-
         clusters.set_coref_chain_to_mentions()
+        entity_clusters_list.append(clusters)

     with open(os.path.join(
             resources.cdc_resources.eval_output_dir, 'entity_clusters.txt'), 'w') \
@@ -79,4 +81,4 @@ def run_entity_coref(resources: CDCSettings) -> Clusters:
     write_entity_coref_scorer_results(resources.entity_topics.topics_list,
                                       resources.cdc_resources.eval_output_dir)

-    return clusters
+    return entity_clusters_list
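
Callers that used the old single Clusters return value now receive one Clusters object per topic; a small sketch of how a caller might aggregate across topics (count_coref_clusters is a hypothetical helper, not part of the library):

from typing import List

from nlp_architect.common.cdc.cluster import Clusters


def count_coref_clusters(per_topic_clusters: List[Clusters]) -> int:
    """Hypothetical helper: total number of coref clusters across all topics."""
    return sum(len(topic_clusters.clusters_list)
               for topic_clusters in per_topic_clusters)

# e.g. count_coref_clusters(run_event_coref(resources))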

nlp_architect/utils/string_utils.py
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@
 class StringUtils:
     spacy_no_parser = SpacyInstance(disable=['parser'])
     spacy_parser = SpacyInstance()
-    stop_words = None
+    stop_words = list()
     pronouns = None
     preposition = None

Comments (0)