-
-
Notifications
You must be signed in to change notification settings - Fork 4.6k
Closed
Labels
feat / nel (Feature: Named Entity linking), usage (General spaCy usage)
Description
I am creating a knowledge base (KB) by combining entities from my local GraphDB with Wikidata. It cannot add a duplicate, but I think it should be able to handle synonyms — that is, the same name referring to different entities — so that Paris (France) and Paris (Texas) are two different things. Likewise, in this case, Barack Obama (the president) and Barack Obama (a fictional horse) could be two different things.
How to reproduce the behaviour
import numpy as np
import spacy
from SPARQLWrapper import SPARQLWrapper, JSON
from spacy.kb import InMemoryLookupKB
from spacy.language import Language
from spacy.tokens import Doc
from spacy.training import Example
def query_endpoint(endpoint, query):
    """Run ``query`` against ``endpoint`` and return the converted result.

    The endpoint is configured with the query text and the ``JSON`` return
    format before the request is issued. If querying or converting raises,
    the error is reported with ``print`` and ``None`` is returned instead,
    so callers must be prepared for a ``None`` result.
    """
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    try:
        response = endpoint.query()
        converted = response.convert()
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on.
        print(f"Failed to query endpoint: {str(e)}")
        return None
    else:
        return converted
def create_kb_from_sparql(vocab, entity_vector_length=300):
    """Build an ``InMemoryLookupKB`` from a local SPARQL endpoint and Wikidata.

    Rows from both endpoints are gathered first and only then written to the
    knowledge base. Each entity is added exactly once, and each alias is added
    exactly once with *all* entities that share it as candidates. This is what
    lets one surface form (e.g. "Paris" or "Barack Obama") resolve to several
    distinct entities: the KB keeps a single record per alias, so registering
    the same alias again with a different entity does not extend it.

    Args:
        vocab: The spaCy vocab the knowledge base is bound to.
        entity_vector_length: Length of the entity vectors stored in the KB.
            Placeholder zero vectors of this length are used for every entity.

    Returns:
        The populated ``InMemoryLookupKB``. An endpoint whose query failed
        (``query_endpoint`` returned ``None``) simply contributes no rows.
    """
    # Local SPARQL endpoint
    local_sparql = SPARQLWrapper("http://DESKTOP-N288CLP:7200/repositories/mykb")
    local_query = """
PREFIX : <http://example.org/schema#>
SELECT ?entity ?alias ?description WHERE {
?entity :hasAlias ?alias;
:description ?description .
}
"""
    local_results = query_endpoint(local_sparql, local_query)

    # Wikidata SPARQL endpoint for Barack Obama
    wikidata_sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    wikidata_query = """
SELECT ?entity ?alias ?description WHERE {
VALUES ?entity { wd:Q76 } # Barack Obama's Wikidata ID
?entity rdfs:label ?alias;
schema:description ?description .
FILTER (LANG(?alias) = "en" && LANG(?description) = "en")
}
"""
    wikidata_results = query_endpoint(wikidata_sparql, wikidata_query)

    # Initialize the InMemoryLookupKB
    kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    # Gather rows from both endpoints before touching the KB, so that an
    # alias shared by several entities (possibly across endpoints) ends up
    # as one alias record with several candidates. Dicts keep first-seen
    # order, which makes the resulting KB deterministic.
    entity_ids = {}
    alias_to_entities = {}
    for results in (local_results, wikidata_results):
        if not results:
            continue
        for binding in results["results"]["bindings"]:
            entity_id = binding["entity"]["value"]
            alias = binding["alias"]["value"]
            entity_ids[entity_id] = None
            candidates = alias_to_entities.setdefault(alias, [])
            if entity_id not in candidates:
                candidates.append(entity_id)

    # Entities must exist in the KB before any alias can refer to them.
    for entity_id in entity_ids:
        # Dummy vector of the configured length; replace with real
        # embeddings when they are available.
        entity_vector = np.zeros((entity_vector_length,), dtype=np.float32)
        kb.add_entity(entity=entity_id, freq=1, entity_vector=entity_vector)

    # One add_alias call per alias, listing every candidate entity. Without
    # better evidence the prior is split uniformly; the priors for an alias
    # must not sum to more than 1.
    for alias, candidates in alias_to_entities.items():
        prior = 1.0 / len(candidates)
        kb.add_alias(
            alias=alias,
            entities=candidates,
            probabilities=[prior] * len(candidates),
        )

    # Print the contents of the KB
    # NOTE(review): print_kb_contents is not defined in the visible snippet;
    # it must be provided elsewhere in the file.
    print_kb_contents(kb)
    return kb
Your Environment
- Operating System:
- Python Version Used:
- spaCy Version Used:
- Environment Information:
Metadata
Metadata
Assignees
Labels
feat / nel (Feature: Named Entity linking), usage (General spaCy usage)