Skip to content

Commit 1ad4567

Browse files
committed
exclude obsolete entities from text tagger (#1196)
1 parent 049fce3 commit 1ad4567

File tree

36 files changed

+941
-39
lines changed

36 files changed

+941
-39
lines changed

apitester4/src/main/java/uk/ac/ebi/ols/apitester/Ols4ApiTester.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,14 @@ public boolean testTagText() throws IOException {
149149
JsonElement tagSourceFiltered = post(url + "/api/v2/tag_text?source=test-curations", "{\"text\": \"genomic data sharing for clinical trial data\"}");
150150
write(outDir + "/v2/tag_text_source_filtered.json", tagSourceFiltered);
151151

152+
// Test includeObsoleteEntities=true (the obsolete entity should be returned, carrying "is_obsolete": true)
153+
JsonElement tagWithObsolete = post(url + "/api/v2/tag_text?includeObsoleteEntities=true", "{\"text\": \"testing the obsolete xmpl entity label\"}");
154+
write(outDir + "/v2/tag_text_with_obsolete.json", tagWithObsolete);
155+
156+
// Test includeObsoleteEntities=false (explicit, same as the parameter's default; the obsolete entity should be excluded)
157+
JsonElement tagNoObsolete = post(url + "/api/v2/tag_text?includeObsoleteEntities=false", "{\"text\": \"testing the obsolete xmpl entity label\"}");
158+
write(outDir + "/v2/tag_text_no_obsolete.json", tagNoObsolete);
159+
152160
return true;
153161
}
154162

backend/src/main/java/uk/ac/ebi/spot/ols/controller/api/v2/V2TextTaggerController.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ public HttpEntity<Map<String, Object>> tagText(
4141
@RequestParam(value = "source", required = false) List<String> sources,
4242
@RequestParam(value = "delimiters", required = false) String delimiters,
4343
@RequestParam(value = "minLength", required = false, defaultValue = "3") int minLength,
44-
@RequestParam(value = "includeSubstrings", required = false, defaultValue = "true") boolean includeSubstrings
44+
@RequestParam(value = "includeSubstrings", required = false, defaultValue = "true") boolean includeSubstrings,
45+
@RequestParam(value = "includeObsoleteEntities", required = false, defaultValue = "false") boolean includeObsoleteEntities
4546
) {
4647

4748
if (!textTaggerService.isAvailable()) {
@@ -62,6 +63,10 @@ public HttpEntity<Map<String, Object>> tagText(
6263

6364
List<TaggedEntity> entities = textTaggerService.tagText(text, ontologyIds, sources, delimiters, minLength, includeSubstrings);
6465

66+
if (!includeObsoleteEntities) {
67+
entities.removeIf(e -> e.isObsolete);
68+
}
69+
6570
List<Map<String, Object>> entityMaps = new ArrayList<>(entities.size());
6671
for (TaggedEntity e : entities) {
6772
Map<String, Object> m = new LinkedHashMap<>();
@@ -73,6 +78,7 @@ public HttpEntity<Map<String, Object>> tagText(
7378
if (e.stringType != null) m.put("string_type", e.stringType);
7479
if (e.source != null) m.put("source", e.source);
7580
if (e.subjectCategories != null) m.put("subject_categories", e.subjectCategories);
81+
if (e.isObsolete) m.put("is_obsolete", true);
7682
entityMaps.add(m);
7783
}
7884

backend/src/main/java/uk/ac/ebi/spot/ols/service/TextTaggerService.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,11 @@ public static class TaggedEntity {
4444
public final String stringType;
4545
public final String source;
4646
public final List<String> subjectCategories;
47+
public final boolean isObsolete;
4748

4849
public TaggedEntity(int start, int end, String termLabel, String termIri, String ontologyId,
4950
String stringType, String source,
50-
List<String> subjectCategories) {
51+
List<String> subjectCategories, boolean isObsolete) {
5152
this.start = start;
5253
this.end = end;
5354
this.termLabel = termLabel;
@@ -56,6 +57,7 @@ public TaggedEntity(int start, int end, String termLabel, String termIri, String
5657
this.stringType = stringType;
5758
this.source = source;
5859
this.subjectCategories = subjectCategories;
60+
this.isObsolete = isObsolete;
5961
}
6062
}
6163

@@ -251,6 +253,7 @@ private List<TaggedEntity> parseResponse(String json) {
251253
String stringType = e.has("string_type") ? e.get("string_type").getAsString() : null;
252254
String source = e.has("source") ? e.get("source").getAsString() : null;
253255
List<String> subjectCategories = jsonArrayToStringList(e, "subject_categories");
256+
boolean isObsolete = e.has("is_obsolete") && e.get("is_obsolete").getAsBoolean();
254257
results.add(new TaggedEntity(
255258
e.get("start").getAsInt(),
256259
e.get("end").getAsInt(),
@@ -259,7 +262,8 @@ private List<TaggedEntity> parseResponse(String json) {
259262
e.has("ontology_id") ? e.get("ontology_id").getAsString() : "",
260263
stringType,
261264
source,
262-
subjectCategories
265+
subjectCategories,
266+
isObsolete
263267
));
264268
}
265269
return results;

dataload/extract_strings_from_terms/src/main.rs

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ fn main() -> Result<(), Box<dyn Error>> {
6464
let mut total: u64 = 0;
6565
let mut embedded: u64 = 0;
6666

67-
writeln!(&mut writer, "pk\tontology_id\tentity_type\tiri\tlabel\thash\ttext_to_embed\tstring_type\tcurated_from_source\tcurated_from_subject_categories").unwrap();
67+
writeln!(&mut writer, "pk\tontology_id\tentity_type\tiri\tlabel\thash\ttext_to_embed\tstring_type\tcurated_from_source\tcurated_from_subject_categories\tis_obsolete").unwrap();
6868

6969
for input_file in &args.input_files {
7070
eprintln!("Processing file: {}", input_file);
@@ -167,6 +167,7 @@ fn process_entity(
167167
let mut labels:Vec<String> = Vec::new();
168168
let mut synonyms:Vec<String> = Vec::new();
169169
let mut is_defining_ontology = false;
170+
let mut is_obsolete = false;
170171
let mut curated_from_entries: Vec<CuratedFromJson> = Vec::new();
171172

172173
json.begin_object().unwrap();
@@ -184,6 +185,12 @@ fn process_entity(
184185
} else {
185186
json.skip_value().unwrap();
186187
}
188+
} else if key == "isObsolete" {
189+
if json.peek().unwrap() == ValueType::Boolean {
190+
is_obsolete = json.next_bool().unwrap();
191+
} else {
192+
json.skip_value().unwrap();
193+
}
187194
} else if key == "curatedFrom" {
188195
curated_from_entries = parse_curated_from(json);
189196
} else {
@@ -210,12 +217,15 @@ fn process_entity(
210217
let pk = format!("{}:{}:{}", ontology_id, entity_type, &iri_value);
211218
let mut written: u64 = 0;
212219

220+
let is_obsolete_str = if is_obsolete { "true" } else { "" };
221+
213222
// Emit LABEL rows (for labels and synonyms)
214223
for text in &texts_to_embed {
215224
if let Some(row) = make_row(text, tokenizer) {
216-
writeln!(writer, "{}\t{}\t{}\t{}\t{}\t{}\t{}\tLABEL\t\t",
225+
writeln!(writer, "{}\t{}\t{}\t{}\t{}\t{}\t{}\tLABEL\t\t\t{}",
217226
pk, ontology_id, entity_type, &iri_value, &label_str,
218-
row.hash, row.document
227+
row.hash, row.document,
228+
is_obsolete_str
219229
).unwrap();
220230
written += 1;
221231
}
@@ -224,11 +234,12 @@ fn process_entity(
224234
// Emit CURATION rows (for curated text-to-term mappings)
225235
for entry in &curated_from_entries {
226236
if let Some(row) = make_row(&entry.text, tokenizer) {
227-
writeln!(writer, "{}\t{}\t{}\t{}\t{}\t{}\t{}\tCURATION\t{}\t{}",
237+
writeln!(writer, "{}\t{}\t{}\t{}\t{}\t{}\t{}\tCURATION\t{}\t{}\t{}",
228238
pk, ontology_id, entity_type, &iri_value, &label_str,
229239
row.hash, row.document,
230240
entry.source,
231-
entry.subject_categories
241+
entry.subject_categories,
242+
is_obsolete_str
232243
).unwrap();
233244
written += 1;
234245
}

testcases/iri-labels/user-defined-pref-label.owl

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,15 @@
6666
<rdfs:label xml:lang="en">XMPL000003</rdfs:label>
6767
</owl:Class>
6868

69+
<!-- http://exmpl.org/xmpl/XMPL000004 -->
70+
71+
<owl:Class rdf:about="http://exmpl.org/xmpl/XMPL000004">
72+
<rdfs:subClassOf rdf:resource="http://exmpl.org/xmpl/XMPL000001"/>
73+
<owl:deprecated rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">true</owl:deprecated>
74+
<xmpl:Preferred_name xml:lang="en">obsolete xmpl entity</xmpl:Preferred_name>
75+
<rdfs:label xml:lang="en">XMPL000004</rdfs:label>
76+
</owl:Class>
77+
6978
</rdf:RDF>
7079

7180

testcases_expected_output/iri-labels/use-user-defined-pref-label/autocomplete.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
{"ontologyId":"xmpl","id":"xmpl+class+http://exmpl.org/xmpl/XMPL000002","label":"label of the first children class"}
55
{"ontologyId":"xmpl","id":"xmpl+class+http://exmpl.org/xmpl/XMPL000003","label":"2nd children class pref label"}
66
{"ontologyId":"xmpl","id":"xmpl+class+http://exmpl.org/xmpl/XMPL000003","label":"label of the second children class"}
7+
{"ontologyId":"xmpl","id":"xmpl+class+http://exmpl.org/xmpl/XMPL000004","label":"obsolete xmpl entity"}
78
{"ontologyId":"xmpl","id":"xmpl+property+http://exmpl.org/xmpl/Preferred_name","label":"XMPL_Preferred_name"}
89
{"ontologyId":"xmpl","id":"xmpl+property+http://exmpl.org/xmpl/Synonym","label":"XMPL_Synonym"}

0 commit comments

Comments
 (0)