This repository was archived by the owner on Nov 8, 2022. It is now read-only.

Commit d510f81

Author: Alon Eirew (committed)
minor changes and fixes in Cross Doc Coref API
1 parent bd37204 commit d510f81

9 files changed (+64 / -49 lines)

examples/cross_doc_coref/cross_doc_coref_sieves.py
Lines changed: 26 additions & 17 deletions

@@ -15,8 +15,11 @@
 # ******************************************************************************

 import logging
+from typing import List

 from nlp_architect import LIBRARY_ROOT
+from nlp_architect.common.cdc.cluster import Clusters
+from nlp_architect.common.cdc.topics import Topics
 from nlp_architect.data.cdc_resources.relations.relation_types_enums import RelationType
 from nlp_architect.models.cross_doc_coref.cdc_config import EventConfig, EntityConfig
 from nlp_architect.models.cross_doc_coref.cdc_resource import CDCResources
@@ -34,8 +37,8 @@ def run_example():
         (SieveType.RELAX, RelationType.SAME_HEAD_LEMMA_RELAX, 0.5),
     ]

-    event_config.gold_mentions_file = LIBRARY_ROOT + \
-        '/datasets/ecb/ecb_all_event_mentions.json'
+    event_config.gold_mentions = Topics(LIBRARY_ROOT
+                                        + '/datasets/ecb/ecb_all_event_mentions.json')

     entity_config = EntityConfig()

@@ -47,8 +50,8 @@ def run_example():
         (SieveType.VERY_RELAX, RelationType.REFERENT_DICT, 0.5)
     ]

-    entity_config.gold_mentions_file = LIBRARY_ROOT + \
-        '/datasets/ecb/ecb_all_entity_mentions.json'
+    entity_config.gold_mentions = Topics(LIBRARY_ROOT
+                                         + '/datasets/ecb/ecb_all_entity_mentions.json')

     # CDCResources hold default attribute values that might need to be change,
     # (using the defaults values in this example), use to configure attributes
@@ -68,19 +71,25 @@ def run_example():
     entity_clusters = run_entity_coref(resources)

     print('-=Cross Document Coref Results=-')
-    print('-=Event Clusters Mentions=-')
-    for event_cluster in event_clusters.clusters_list:
-        print(event_cluster.coref_chain)
-        for event_mention in event_cluster.mentions:
-            print(event_mention.mention_id)
-            print(event_mention.tokens_str)
-
-    print('-=Entity Clusters Mentions=-')
-    for entity_cluster in entity_clusters.clusters_list:
-        print(entity_cluster.coref_chain)
-        for entity_mention in entity_cluster.mentions:
-            print(entity_mention.mention_id)
-            print(entity_mention.tokens_str)
+    print_results(event_clusters, 'Event')
+    print('################################')
+    print_results(entity_clusters, 'Entity')
+
+
+def print_results(clusters: List[Clusters], type: str):
+    print('-=' + type + ' Clusters=-')
+    for topic_cluster in clusters:
+        print('\n\tCluster Topic=' + topic_cluster.topic_id)
+        for cluster in topic_cluster.clusters_list:
+            cluster_mentions = list()
+            for mention in cluster.mentions:
+                mentions_dict = dict()
+                mentions_dict['id'] = mention.mention_id
+                mentions_dict['text'] = mention.tokens_str
+                cluster_mentions.append(mentions_dict)
+
+            print('\t\tCluster(' + str(cluster.coref_chain) + ') Mentions='
+                  + str(cluster_mentions))


 if __name__ == '__main__':
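
For context, the reworked print_results walks the per-topic structure that run_event_coref and run_entity_coref now return. A minimal sketch of that traversal, assuming event_clusters is the List[Clusters] value produced by run_event_coref in the example above:

# Sketch only: 'event_clusters' is assumed to come from run_event_coref(resources).
for topic_clusters in event_clusters:            # one Clusters object per topic
    print(topic_clusters.topic_id)               # topic the clusters were built from
    for cluster in topic_clusters.clusters_list:
        # coref_chain is the shared chain id; tokens_str is the mention surface text
        print(cluster.coref_chain,
              [mention.tokens_str for mention in cluster.mentions])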

nlp_architect/common/cdc/cluster.py
Lines changed: 2 additions & 1 deletion

@@ -65,14 +65,15 @@ def get_cluster_id(self) -> str:
 class Clusters(object):
     cluster_coref_chain = 1000

-    def __init__(self, mentions: List[MentionData] = None) -> None:
+    def __init__(self, topic_id: str, mentions: List[MentionData] = None) -> None:
         """

         Args:
             mentions: ``list[MentionData]``, required
                 The initial mentions to create the clusters from
         """
         self.clusters_list = []
+        self.topic_id = topic_id
         self.set_initial_clusters(mentions)

     def set_initial_clusters(self, mentions: List[MentionData]) -> None:
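
Because topic_id is now the first positional argument, code that builds Clusters directly must pass it explicitly. A minimal sketch, assuming topic is one element of Topics(...).topics_list as used by run_sieve_system.py further down:

from nlp_architect.common.cdc.cluster import Clusters

# 'topic' is assumed to be a Topic carrying topic_id and mentions attributes.
clusters = Clusters(topic.topic_id, topic.mentions)
print(clusters.topic_id)   # the topic id is now kept on the Clusters instance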

nlp_architect/common/cdc/topics.py
Lines changed: 2 additions & 2 deletions

@@ -37,9 +37,9 @@ def __init__(self, mentions_file_path: str) -> None:
         Args:
             mentions_file_path: this topic mentions json file
         """
-        self.topics_list = self.load_gold_mentions(mentions_file_path)
+        self.topics_list = self.load_gold_mentions_from_file(mentions_file_path)

-    def load_gold_mentions(self, mentions_file_path: str) -> List[Topic]:
+    def load_gold_mentions_from_file(self, mentions_file_path: str) -> List[Topic]:
         start_data_load = time.time()
         logger.info('Loading mentions from-%s', mentions_file_path)
         mentions = load_json_file(mentions_file_path)
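
The public behaviour is unchanged here: the constructor still takes the mentions JSON path, and only the internal loader was renamed. A minimal usage sketch, assuming the ECB mentions file shipped with the library:

from nlp_architect import LIBRARY_ROOT
from nlp_architect.common.cdc.topics import Topics

topics = Topics(LIBRARY_ROOT + '/datasets/ecb/ecb_all_event_mentions.json')
for topic in topics.topics_list:
    # topic_id and mentions are the attributes the coref sieves rely on
    print(topic.topic_id, len(topic.mentions))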

nlp_architect/data/cdc_resources/wikipedia/wiki_online.py
Lines changed: 5 additions & 2 deletions

@@ -15,6 +15,7 @@
 # ******************************************************************************

 import os
+import logging

 from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page import WikipediaPage
 from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import \
@@ -27,6 +28,8 @@
 DISAMBIGUATE_PAGE = ['wikimedia disambiguation page', 'wikipedia disambiguation page']
 NAME_DESCRIPTIONS = ['given name', 'first name', 'family name']

+logger = logging.getLogger(__name__)
+

 class WikiOnline(object):
     def __init__(self):
@@ -52,7 +55,7 @@ def get_pages(self, phrase):
                 full_page = self.get_wiki_page_with_items(phrase, page_result)
                 ret_pages.add(WikipediaSearchPageResult(appr, full_page))
             except Exception as e:
-                print(e)
+                logger.error(e)

         self.cache[phrase] = ret_pages
         return ret_pages
@@ -73,7 +76,7 @@ def get_wiki_page_with_items(self, phrase, page):

         ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid, description, relations)

-        print('Page:' + str(ret_page) + ". Extracted successfully")
+        logger.debug('Page:' + str(ret_page) + ". Extracted successfully")

         return ret_page
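
Since the module now reports through the standard logging package instead of print, its messages are only visible once logging is configured; a minimal sketch using the standard library:

import logging

# DEBUG level is needed to see the per-page 'Extracted successfully' messages;
# failures inside get_pages are emitted at ERROR level.
logging.basicConfig(level=logging.DEBUG)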

nlp_architect/models/cross_doc_coref/cdc_config.py
Lines changed: 16 additions & 15 deletions

@@ -16,6 +16,7 @@
 from typing import List, Tuple

 from nlp_architect import LIBRARY_ROOT
+from nlp_architect.common.cdc.topics import Topics
 from nlp_architect.data.cdc_resources.relations.relation_types_enums import RelationType
 from nlp_architect.models.cross_doc_coref.system.sieves.sieves import SieveType

@@ -26,20 +27,20 @@ def __init__(self):

         self.__sieves_order = None
         self.__run_evaluation = False
-        self.__gold_mentions_file = None
+        self.__gold_mentions = None

     @property
     def sieves_order(self):
         """
         Sieve definition and Sieve running order

-        Tuple[SieveType, RelationType, Threshold(float)] - define sieves to run, were
+        Tuple[SieveType, RelationType, Threshold(float)] - define sieves to run, were

-        Strict- Merge clusters only in case all mentions has current relation between them,
-        Relax- Merge clusters in case (matched mentions) / len(cluster_1.mentions)) >= thresh,
-        Very_Relax- Merge clusters in case (matched mentions) / (all possible pairs) >= thresh
+        Strict- Merge clusters only in case all mentions has current relation between them,
+        Relax- Merge clusters in case (matched mentions) / len(cluster_1.mentions)) >= thresh,
+        Very_Relax- Merge clusters in case (matched mentions) / (all possible pairs) >= thresh

-        RelationType represent the type of sieve to run.
+        RelationType represent the type of sieve to run.

         """
         return self.__sieves_order
@@ -58,13 +59,13 @@ def run_evaluation(self, run_evaluation: bool):
         self.__run_evaluation = run_evaluation

     @property
-    def gold_mentions_file(self):
+    def gold_mentions(self) -> Topics:
         """Mentions file to run against"""
-        return self.__gold_mentions_file
+        return self.__gold_mentions

-    @gold_mentions_file.setter
-    def gold_mentions_file(self, gold_file):
-        self.__gold_mentions_file = gold_file
+    @gold_mentions.setter
+    def gold_mentions(self, gold_mentions_topics: Topics):
+        self.__gold_mentions = gold_mentions_topics


 class EventConfig(CDCConfig):
@@ -91,8 +92,8 @@ def __init__(self):
             (SieveType.STRICT, RelationType.WORDNET_DERIVATIONALLY, 0.0)
         ]

-        self.gold_mentions_file = LIBRARY_ROOT + \
-            '/datasets/ecb/ecb_all_event_mentions.json'
+        self.gold_mentions = Topics(LIBRARY_ROOT
+                                    + '/datasets/ecb/ecb_all_event_mentions.json')


 class EntityConfig(CDCConfig):
@@ -119,5 +120,5 @@ def __init__(self):
             (SieveType.VERY_RELAX, RelationType.REFERENT_DICT, 0.5)
         ]

-        self.gold_mentions_file = LIBRARY_ROOT + \
-            '/datasets/ecb/ecb_all_entity_mentions.json'
+        self.gold_mentions = (LIBRARY_ROOT
+                              + '/datasets/ecb/ecb_all_entity_mentions.json')
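
With the property retyped from a file path to a Topics object, configuration code now assigns a pre-loaded Topics instance; a minimal sketch mirroring the EventConfig default above:

from nlp_architect import LIBRARY_ROOT
from nlp_architect.common.cdc.topics import Topics
from nlp_architect.models.cross_doc_coref.cdc_config import EventConfig

event_config = EventConfig()
# gold_mentions now expects a Topics object rather than a path string.
event_config.gold_mentions = Topics(
    LIBRARY_ROOT + '/datasets/ecb/ecb_all_event_mentions.json')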

nlp_architect/models/cross_doc_coref/system/cdc_settings.py
Lines changed: 2 additions & 3 deletions

@@ -16,7 +16,6 @@

 import logging

-from nlp_architect.common.cdc.topics import Topics
 from nlp_architect.data.cdc_resources.relations.computed_relation_extraction import \
     ComputedRelationExtraction
 from nlp_architect.data.cdc_resources.relations.referent_dict_relation_extraction import \
@@ -53,9 +52,9 @@ def __init__(self, resources, event_coref_config, entity_coref_config):
         self.load_modules()

         if event_coref_config.run_evaluation:
-            self.events_topics = Topics(event_coref_config.gold_mentions_file)
+            self.events_topics = event_coref_config.gold_mentions
         if entity_coref_config.run_evaluation:
-            self.entity_topics = Topics(entity_coref_config.gold_mentions_file)
+            self.entity_topics = entity_coref_config.gold_mentions
         if not self.events_topics and not self.entity_topics:
             raise Exception('No entity or events Gold topics loaded!')

nlp_architect/models/cross_doc_coref/system/sieves/run_sieve_system.py
Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def __init__(self, topic):
         self.results_dict = dict()
         self.results_ordered = []
         logger.info('loading topic %s, total mentions: %d', topic.topic_id, len(topic.mentions))
-        self.clusters = Clusters(topic.mentions)
+        self.clusters = Clusters(topic.topic_id, topic.mentions)

     @staticmethod
     def set_sieves_from_config(config, get_rel_extraction):

nlp_architect/models/cross_doc_sieves.py
Lines changed: 9 additions & 7 deletions

@@ -15,6 +15,7 @@
 # ******************************************************************************
 import logging
 import os
+from typing import List

 from nlp_architect.common.cdc.cluster import Clusters
 from nlp_architect.models.cross_doc_coref.system.cdc_settings import CDCSettings
@@ -27,7 +28,7 @@
 logger = logging.getLogger(__name__)


-def run_event_coref(resources: CDCSettings) -> Clusters:
+def run_event_coref(resources: CDCSettings) -> List[Clusters]:
     """
     Running Cross Document Coref on event mentions
     Args:
@@ -37,12 +38,12 @@ def run_event_coref(resources: CDCSettings) -> Clusters:
         Clusters: List of clusters and mentions with predicted cross doc coref within each topic
     """
     io.create_folder(resources.cdc_resources.eval_output_dir)
+    event_clusters_list = list()
     for topic in resources.events_topics.topics_list:
         sieves_list_event = RunSystemsEvent(topic, resources)
         clusters = sieves_list_event.run_deterministic()
-
         clusters.set_coref_chain_to_mentions()
-
+        event_clusters_list.append(clusters)
     with open(os.path.join(
             resources.cdc_resources.eval_output_dir, 'event_clusters.txt'), 'w') \
             as event_clusters_file:
@@ -51,10 +52,10 @@ def run_event_coref(resources: CDCSettings) -> Clusters:
     logger.info('Write event coref results')
     write_event_coref_scorer_results(resources.events_topics.topics_list,
                                      resources.cdc_resources.eval_output_dir)
-    return clusters
+    return event_clusters_list


-def run_entity_coref(resources: CDCSettings) -> Clusters:
+def run_entity_coref(resources: CDCSettings) -> List[Clusters]:
     """
     Running Cross Document Coref on Entity mentions
     Args:
@@ -64,11 +65,12 @@ def run_entity_coref(resources: CDCSettings) -> Clusters:
         Clusters: List of topics and mentions with predicted cross doc coref within each topic
     """
     io.create_folder(resources.cdc_resources.eval_output_dir)
+    entity_clusters_list = list()
     for topic in resources.entity_topics.topics_list:
         sieves_list_entity = RunSystemsEntity(topic, resources)
         clusters = sieves_list_entity.run_deterministic()
-
         clusters.set_coref_chain_to_mentions()
+        entity_clusters_list.append(clusters)

     with open(os.path.join(
             resources.cdc_resources.eval_output_dir, 'entity_clusters.txt'), 'w') \
@@ -79,4 +81,4 @@ def run_entity_coref(resources: CDCSettings) -> Clusters:
     write_entity_coref_scorer_results(resources.entity_topics.topics_list,
                                       resources.cdc_resources.eval_output_dir)

-    return clusters
+    return entity_clusters_list
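
Callers that used the old single Clusters return value now receive one Clusters object per topic; a small sketch of how a caller might aggregate across topics (count_coref_clusters is a hypothetical helper, not part of the library):

from typing import List

from nlp_architect.common.cdc.cluster import Clusters


def count_coref_clusters(per_topic_clusters: List[Clusters]) -> int:
    """Hypothetical helper: total number of coref clusters across all topics."""
    return sum(len(topic_clusters.clusters_list)
               for topic_clusters in per_topic_clusters)

# e.g. count_coref_clusters(run_event_coref(resources))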

nlp_architect/utils/string_utils.py
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@
 class StringUtils:
     spacy_no_parser = SpacyInstance(disable=['parser'])
     spacy_parser = SpacyInstance()
-    stop_words = None
+    stop_words = list()
     pronouns = None
     preposition = None

Comments (0)