diff --git a/src/named_entity_recognition/database_value_finder/database_value_finder.py b/src/named_entity_recognition/database_value_finder/database_value_finder.py index 993b12b..df75bda 100644 --- a/src/named_entity_recognition/database_value_finder/database_value_finder.py +++ b/src/named_entity_recognition/database_value_finder/database_value_finder.py @@ -28,11 +28,11 @@ def find_similar_values_in_database(self, potential_values): conn = sqlite3.connect(str(self.database_path.resolve())) cursor = conn.cursor() - + print("Potential Values before entering whole db scanning : " ,potential_values) for table, columns in table_text_column_mapping.items(): if columns: query = self._assemble_query(columns, table) - + print(query) data = self.fetch_data(query, cursor) # The overhead of parallelization only helps after a certain size of data. Example: a table with ~ 300k entries and 4 columns takes ~20s with a single core. @@ -58,8 +58,8 @@ def _find_matches_in_column(self, table, column, column_idx, data, potential_val def _find_matches_by_similarity(self, data, column_idx, potential_values): matching_value_in_database = [] + print("Potential Values before entering similarity matching: " ,potential_values) potential_values_found = [] - for row in data: cell_value = row[column_idx] # avoid comparing None values diff --git a/src/named_entity_recognition/pre_process_ner_values.py b/src/named_entity_recognition/pre_process_ner_values.py index 8e35b10..35eca21 100644 --- a/src/named_entity_recognition/pre_process_ner_values.py +++ b/src/named_entity_recognition/pre_process_ner_values.py @@ -76,16 +76,21 @@ def match_values_in_database(db_id: str, extracted_data: NerExtractionData): # Remember: 1.0 is looking for exact matches only. Also remember: we do lower-case only comparison, so 'Male' and 'male' will match with 1.0 candidates = [] # With values in quote we are a bit tolerant. Important: we keep this values anyway, as the are often used in fuzzy LIKE searches. + # print("Candidates before quote mentionings : ", candidates) _add_without_duplicates( [(quote, 0.9) for quote in extracted_data.heuristic_values_in_quote], candidates) # Gender values we only want exact matches. + # print("Candidates before gender mentionings : ", candidates) _add_without_duplicates( [(gender, 1.0) for gender in extracted_data.heuristics_genders], candidates) + # print("Candidates before common mentionings : ", candidates) _add_without_duplicates([(common_mentionings, 0.9) for common_mentionings in extracted_data.heuristics_variety_common_mentionings], candidates) # a special code should match exactly + # print("Candidates before special codes mentionings : ", candidates) _add_without_duplicates( [(special_code, 1.0) for special_code in extracted_data.heuristics_special_codes], candidates) + # print("Candidates before capitalized mentionings : ", candidates) _add_without_duplicates([(capitalized_word, 0.75) for capitalized_word in extracted_data.heuristics_capitalized_words], candidates) _add_without_duplicates( @@ -96,7 +101,8 @@ def match_values_in_database(db_id: str, extracted_data: NerExtractionData): [(ner_value, 0.75) for ner_value in extracted_data.ner_remaining], candidates) database_matches = _find_matches_in_database(db_value_finder, candidates) - + print(database_matches) + print("Final candidates: ",candidates) # Here we put all the values to one happy list together: the ones we matched via database and the ones we got directly out of the question. # The 'set' is to remove duplicates. return list(set(extracted_data.heuristic_values_in_quote + # we put in values in quote a second time as those values are often fuzzy strings. @@ -120,6 +126,7 @@ def _find_matches_in_database(db_value_finder, potential_values): try: matching_db_values = db_value_finder.find_similar_values_in_database( potential_values) + print(matching_db_values) matches = list(map(lambda v: v[0], matching_db_values)) except Exception as e: print(