Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ def find_similar_values_in_database(self, potential_values):

conn = sqlite3.connect(str(self.database_path.resolve()))
cursor = conn.cursor()

print("Potential Values before entering whole db scanning : " ,potential_values)
for table, columns in table_text_column_mapping.items():
if columns:
query = self._assemble_query(columns, table)

print(query)
data = self.fetch_data(query, cursor)

# The overhead of parallelization only helps after a certain size of data. Example: a table with ~ 300k entries and 4 columns takes ~20s with a single core.
Expand All @@ -58,8 +58,8 @@ def _find_matches_in_column(self, table, column, column_idx, data, potential_val

def _find_matches_by_similarity(self, data, column_idx, potential_values):
matching_value_in_database = []
print("Potential Values before entering similarity matching: " ,potential_values)
potential_values_found = []

for row in data:
cell_value = row[column_idx]
# avoid comparing None values
Expand Down
9 changes: 8 additions & 1 deletion src/named_entity_recognition/pre_process_ner_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,21 @@ def match_values_in_database(db_id: str, extracted_data: NerExtractionData):
# Remember: 1.0 is looking for exact matches only. Also remember: we do lower-case only comparison, so 'Male' and 'male' will match with 1.0
candidates = []
# With values in quotes we are a bit tolerant. Important: we keep these values anyway, as they are often used in fuzzy LIKE searches.
# print("Candidates before quote mentionings : ", candidates)
_add_without_duplicates(
[(quote, 0.9) for quote in extracted_data.heuristic_values_in_quote], candidates)
# Gender values we only want exact matches.
# print("Candidates before gender mentionings : ", candidates)
_add_without_duplicates(
[(gender, 1.0) for gender in extracted_data.heuristics_genders], candidates)
# print("Candidates before common mentionings : ", candidates)
_add_without_duplicates([(common_mentionings, 0.9)
for common_mentionings in extracted_data.heuristics_variety_common_mentionings], candidates)
# a special code should match exactly
# print("Candidates before special codes mentionings : ", candidates)
_add_without_duplicates(
[(special_code, 1.0) for special_code in extracted_data.heuristics_special_codes], candidates)
# print("Candidates before capitalized mentionings : ", candidates)
_add_without_duplicates([(capitalized_word, 0.75)
for capitalized_word in extracted_data.heuristics_capitalized_words], candidates)
_add_without_duplicates(
Expand All @@ -96,7 +101,8 @@ def match_values_in_database(db_id: str, extracted_data: NerExtractionData):
[(ner_value, 0.75) for ner_value in extracted_data.ner_remaining], candidates)

database_matches = _find_matches_in_database(db_value_finder, candidates)

print(database_matches)
print("Final candidates: ",candidates)
# Here we put all the values together in one happy list: the ones we matched via the database and the ones we got directly out of the question.
# The 'set' is to remove duplicates.
return list(set(extracted_data.heuristic_values_in_quote + # we put in values in quote a second time as those values are often fuzzy strings.
Expand All @@ -120,6 +126,7 @@ def _find_matches_in_database(db_value_finder, potential_values):
try:
matching_db_values = db_value_finder.find_similar_values_in_database(
potential_values)
print(matching_db_values)
matches = list(map(lambda v: v[0], matching_db_values))
except Exception as e:
print(
Expand Down