Skip to content

Warning: [W017] Alias 'Barack Obama' already exists in the Knowledge Base. #13473

@ghdunn

Description

@ghdunn

I am creating a kb by combining elements of my local graphdb with wikidata. It cannot add a duplicate, but I think it should be able to handle synonyms? so that Paris (France) and Paris(Texas) are two different things? Or in this case, Barack Obama (president) and Barack Obama (a fictional horse) could be two different things..

How to reproduce the behaviour

import numpy as np
import spacy
from SPARQLWrapper import SPARQLWrapper, JSON
from spacy.kb import InMemoryLookupKB
from spacy.language import Language
from spacy.tokens import Doc
from spacy.training import Example

def query_endpoint(endpoint, query):
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    try:
        results = endpoint.query().convert()
        return results
    except Exception as e:
        print(f"Failed to query endpoint: {str(e)}")
        return None

def create_kb_from_sparql(vocab):
    # Local SPARQL endpoint
    local_sparql = SPARQLWrapper("http://DESKTOP-N288CLP:7200/repositories/mykb")
    local_query = """
        PREFIX : <http://example.org/schema#>
        SELECT ?entity ?alias ?description WHERE {
          ?entity :hasAlias ?alias;
                   :description ?description .
        }
    """
    local_results = query_endpoint(local_sparql, local_query)
    
    # Wikidata SPARQL endpoint for Barack Obama
    wikidata_sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    wikidata_query = """
        SELECT ?entity ?alias ?description WHERE {
          VALUES ?entity { wd:Q76 }  # Barack Obama's Wikidata ID
          ?entity rdfs:label ?alias;
                  schema:description ?description .
          FILTER (LANG(?alias) = "en" && LANG(?description) = "en")
        }
    """
    wikidata_results = query_endpoint(wikidata_sparql, wikidata_query)

    # Initialize the InMemoryLookupKB
    kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=300)
    
    # Function to populate KB with SPARQL results
    def populate_kb(results):
        if results:
            for result in results["results"]["bindings"]:
                entity_id = result["entity"]["value"]
                alias = result["alias"]["value"]
                description = result["description"]["value"]

                # Generate a dummy vector of appropriate length
                entity_vector = np.zeros((300,), dtype=np.float32)

                # Add entity with a frequency and a numpy vector
                kb.add_entity(entity=entity_id, freq=1, entity_vector=entity_vector)
                kb.add_alias(alias=alias, entities=[entity_id], probabilities=[1])

    # Populate KB with data from both endpoints
    populate_kb(local_results)
    populate_kb(wikidata_results)
    
    # Print the contents of the KB
    print_kb_contents(kb)
    
    return kb

Your Environment

  • Operating System:
  • Python Version Used:
  • spaCy Version Used:
  • Environment Information:

Metadata

Metadata

Assignees

No one assigned

    Labels

    feat / nelFeature: Named Entity linkingusageGeneral spaCy usage

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions