Skip to content

Commit 8c513ae

Browse files
authored
Merge pull request #26 from monarch-initiative/develop
Develop
2 parents 50e31d5 + ce6633d commit 8c513ae

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

46 files changed

+3401
-215
lines changed

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ nav:
3535
- "Template": 'languages.md'
3636
- "English": "english.md"
3737
- Setup: "setup.md"
38+
- Batch: "batch.md"
3839

3940
plugins:
4041
- search

pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>org.monarchinitiative</groupId>
88
<artifactId>phenopacket2prompt</artifactId>
9-
<version>0.3.14</version>
9+
<version>0.4.0</version>
1010

1111
<name>phenopacket2prompt</name>
1212
<url>https://github.com/monarch-initiative/phenopacket2prompt</url>
@@ -186,8 +186,8 @@
186186
<artifactId>maven-compiler-plugin</artifactId>
187187
<version>3.8.1</version>
188188
<configuration>
189-
<source>${java.version}</source>
190-
<target>${java.version}</target>
189+
<source>21</source>
190+
<target>21</target>
191191
<!-- <release>1.8</release> -->
192192
</configuration>
193193
</plugin>

src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java

Lines changed: 67 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease;
1010
import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual;
1111
import org.monarchinitiative.phenopacket2prompt.output.CorrectResult;
12+
import org.monarchinitiative.phenopacket2prompt.output.PpktCopy;
1213
import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator;
1314
import org.slf4j.Logger;
1415
import org.slf4j.LoggerFactory;
@@ -23,6 +24,7 @@
2324
import java.util.ArrayList;
2425
import java.util.List;
2526
import java.util.Map;
27+
import java.util.Set;
2628
import java.util.concurrent.Callable;
2729

2830
@CommandLine.Command(name = "batch", aliases = {"B"},
@@ -40,9 +42,16 @@ public class GbtTranslateBatchCommand implements Callable<Integer> {
4042
description = "path to translations file")
4143
private String translationsPath = "data/hp-international.obo";
4244

45+
@CommandLine.Option(names = {"-o", "--outdir"},
46+
description = "path to outdir")
47+
private String outdirname = "prompts";
48+
4349
@CommandLine.Option(names = {"-d", "--dir"}, description = "Path to directory with JSON phenopacket files", required = true)
4450
private String ppktDir;
4551

52+
private String currentLanguageCode = null;
53+
private int currentCount;
54+
4655
@Override
4756
public Integer call() throws Exception {
4857
File hpJsonFile = new File(hpoJsonPath);
@@ -57,19 +66,51 @@ public Integer call() throws Exception {
5766
return 1;
5867
}
5968
HpInternationalOboParser oboParser = new HpInternationalOboParser(translationsFile);
69+
6070
Map<String, HpInternational> internationalMap = oboParser.getLanguageToInternationalMap();
6171
LOGGER.info("Got {} translations", internationalMap.size());
6272
List<File> ppktFiles = getAllPhenopacketJsonFiles();
63-
createDir("prompts");
73+
createDir(outdirname);
6474
List<CorrectResult> correctResultList = outputPromptsEnglish(ppktFiles, hpo);
6575
// output all non-English languages here
66-
PromptGenerator spanish = PromptGenerator.spanish(hpo, internationalMap.get("es"));
76+
77+
// SPANISH
78+
PromptGenerator spanish = PromptGenerator.spanish(internationalMap.get("es"));
79+
resetOutput("es");
6780
outputPromptsInternational(ppktFiles, hpo, "es", spanish);
81+
82+
resetOutput("nl");
83+
PromptGenerator dutch = PromptGenerator.dutch(internationalMap.get("nl"));
84+
outputPromptsInternational(ppktFiles, hpo, "nl", dutch);
85+
// GERMAN
86+
resetOutput("de");
87+
PromptGenerator german = PromptGenerator.german(internationalMap.get("de"));
88+
outputPromptsInternational(ppktFiles, hpo, "de", german);
89+
90+
// ITALIAN
91+
resetOutput("it");
92+
PromptGenerator italian = PromptGenerator.italian(internationalMap.get("it"));
93+
outputPromptsInternational(ppktFiles, hpo, "it", italian);
94+
resetOutput("finished");
95+
// output original phenopackets
96+
PpktCopy pcopy = new PpktCopy(new File(outdirname));
97+
for (var file : ppktFiles) {
98+
pcopy.copyFile(file);
99+
}
100+
68101
// output file with correct diagnosis list
69102
outputCorrectResults(correctResultList);
70103
return 0;
71104
}
72105

106+
private void resetOutput(String es) {
107+
if (currentLanguageCode != null) {
108+
System.out.printf("Finished writing %d phenopackets in %s\n", currentCount, currentLanguageCode);
109+
}
110+
currentLanguageCode = es;
111+
currentCount = 0;
112+
}
113+
73114
private void outputCorrectResults(List<CorrectResult> correctResultList) {
74115
File outfile = new File("prompts" + File.separator + "correct_results.tsv");
75116
try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile))) {
@@ -79,12 +120,12 @@ private void outputCorrectResults(List<CorrectResult> correctResultList) {
79120
} catch (IOException e) {
80121
e.printStackTrace();
81122
}
82-
System.out.printf("[INFO] Output a total of %d prompts in en and es.\n", correctResultList.size());
123+
System.out.printf("[INFO] Output a total of %d prompts in en, es, nl, de, and it.\n", correctResultList.size());
83124
}
84125

85126

86-
private String getFileName(String phenopacketID) {
87-
return phenopacketID.replaceAll("[^\\w]", phenopacketID).replaceAll("/","_") + "-prompt.txt";
127+
private String getFileName(String phenopacketID, String languageCode) {
128+
return phenopacketID.replaceAll("[^\\w]","_") + "_" + languageCode + "-prompt.txt";
88129
}
89130

90131

@@ -94,21 +135,28 @@ private void outputPromptsInternational(List<File> ppktFiles, Ontology hpo, Stri
94135
createDir(dirpath);
95136
List<String> diagnosisList = new ArrayList<>();
96137
for (var f: ppktFiles) {
97-
PpktIndividual individual = new PpktIndividual(f);
138+
PpktIndividual individual = PpktIndividual.fromFile(f);
98139
List<PhenopacketDisease> diseaseList = individual.getDiseases();
99140
if (diseaseList.size() != 1) {
100-
System.err.println(String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()));
101-
continue;
141+
String errmsg = String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId());
142+
throw new PhenolRuntimeException(errmsg);
102143
}
103144
PhenopacketDisease pdisease = diseaseList.get(0);
104-
String promptFileName = getFileName( individual.getPhenopacketId());
145+
String promptFileName = getFileName( individual.getPhenopacketId(), languageCode);
105146
String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath());
106147
try {
107148
diagnosisList.add(diagnosisLine);
108149
String prompt = generator.createPrompt(individual);
109150
outputPrompt(prompt, promptFileName, dirpath);
110151
} catch (Exception e) {
111-
e.printStackTrace();
152+
System.err.printf("[ERROR] Could not process %s: %s\n", promptFileName, e.getMessage());
153+
//e.printStackTrace();
154+
}
155+
}
156+
Set<String> missing = generator.getMissingTranslations();
157+
if (! missing.isEmpty()) {
158+
for (var m : missing) {
159+
System.out.printf("[%s] Missing: %s\n", languageCode, m);
112160
}
113161
}
114162
}
@@ -117,17 +165,17 @@ private void outputPromptsInternational(List<File> ppktFiles, Ontology hpo, Stri
117165
private List<CorrectResult> outputPromptsEnglish(List<File> ppktFiles, Ontology hpo) {
118166
createDir("prompts/en");
119167
List<CorrectResult> correctResultList = new ArrayList<>();
120-
PromptGenerator generator = PromptGenerator.english(hpo);
168+
PromptGenerator generator = PromptGenerator.english();
121169

122170
for (var f: ppktFiles) {
123-
PpktIndividual individual = new PpktIndividual(f);
171+
PpktIndividual individual = PpktIndividual.fromFile(f);
124172
List<PhenopacketDisease> diseaseList = individual.getDiseases();
125173
if (diseaseList.size() != 1) {
126-
System.err.println(String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()));
174+
System.err.printf("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId());
127175
continue;
128176
}
129177
PhenopacketDisease pdisease = diseaseList.get(0);
130-
String promptFileName = getFileName( individual.getPhenopacketId());
178+
String promptFileName = getFileName( individual.getPhenopacketId(), "en");
131179
String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath());
132180
try {
133181
String prompt = generator.createPrompt(individual);
@@ -150,7 +198,8 @@ private void outputPrompt(String prompt, String promptFileName, String dir) {
150198
} catch (IOException e) {
151199
e.printStackTrace();
152200
}
153-
System.out.print(".");
201+
System.out.printf("%s %d.\r", currentLanguageCode, currentCount);
202+
currentCount++;
154203
}
155204

156205

@@ -177,6 +226,9 @@ private List<File> getAllPhenopacketJsonFiles() {
177226
for (File item : items) {
178227
if (item.isDirectory())
179228
ppktDirectories.add(ppktDir+item.getName());
229+
else if (item.isFile() && item.getName().endsWith(".json")) {
230+
ppktFiles.add(item);
231+
}
180232
}
181233
for (var f: ppktDirectories) {
182234
File subdir = new File(f);

src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ public class GptTranslateCommand implements Callable<Integer> {
3434
@CommandLine.Option(names = {"-p", "--ppkt"}, description = "Path to JSON phenopacket file", required = true)
3535
private String ppkt;
3636

37+
@CommandLine.Option(names = {"-l", "--language"}, description = "Language code", defaultValue = "de")
38+
private String languageCode;
39+
3740

3841
@Override
3942
public Integer call() throws Exception {
@@ -54,13 +57,31 @@ public Integer call() throws Exception {
5457

5558

5659
System.out.println(hpo.version().orElse("n/a"));
57-
PromptGenerator generator = PromptGenerator.english(hpo);
58-
PpktIndividual individual = new PpktIndividual(new File(ppkt));
60+
PromptGenerator generator = PromptGenerator.english();
61+
PpktIndividual individual = PpktIndividual.fromFile(new File(ppkt));
5962
String prompt = generator.createPrompt(individual);
6063
System.out.println(prompt);
61-
System.out.println("SPANISH");
62-
PromptGenerator spanish = PromptGenerator.spanish(hpo, internationalMap.get("es"));
63-
prompt = spanish.createPrompt(individual);
64+
switch (languageCode) {
65+
case "de" -> {
66+
PromptGenerator german = PromptGenerator.german(internationalMap.get("de"));
67+
prompt = german.createPrompt(individual);
68+
}
69+
case "es" -> {
70+
PromptGenerator spanish = PromptGenerator.spanish(internationalMap.get("es"));
71+
prompt = spanish.createPrompt(individual);
72+
}
73+
case "nl" -> {
74+
PromptGenerator dutch = PromptGenerator.dutch(internationalMap.get("nl"));
75+
prompt = dutch.createPrompt(individual);
76+
}
77+
case "it" -> {
78+
PromptGenerator italian = PromptGenerator.italian(internationalMap.get("it"));
79+
prompt = italian.createPrompt(individual);
80+
}
81+
default -> prompt = "did not recognize language code " + languageCode;
82+
}
83+
84+
6485
System.out.println(prompt);
6586

6687
return 0;

src/main/java/org/monarchinitiative/phenopacket2prompt/international/HpInternationalOboParser.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public class HpInternationalOboParser {
2727
* @return in this case "tr"
2828
*/
2929
public Optional<String> getLanguage(String annots) {
30-
final String translation = "translation:language=\"(\\w{2,2})\"";
30+
final String translation = "translation:language=\"(\\w{2,3})\"";
3131
final Pattern pattern = Pattern.compile(translation);
3232
Matcher matcher = pattern.matcher(annots);
3333
if (matcher.find()) {
@@ -53,7 +53,7 @@ public Optional<String> getTranslation(String annots) {
5353
public HpInternationalOboParser(File file) {
5454
languageToInternationalMap = new HashMap<>();
5555
String pattern = "id: (HP:\\d{7,7})";
56-
Set<String> acronyms = Set.of("cs", "en", "es", "fr", "ja", "nl", "nna", "tr", "tw", "zh");
56+
Set<String> acronyms = Set.of("cs", "en", "de", "dtp", "it", "es", "fr", "ja", "nl", "nna", "tr", "tw", "zh");
5757
for (String acronym : acronyms) {
5858
languageToInternationalMap.put(acronym, new HpInternational(acronym));
5959
}
@@ -84,9 +84,13 @@ public HpInternationalOboParser(File file) {
8484
Optional<String> opt = getLanguage(annots);
8585
if (opt.isPresent()) {
8686
String language = opt.get();
87+
if (! languageToInternationalMap.containsKey(language)) {
88+
System.err.println("[ERROR] Could not find language \"" + language + "\"");
89+
continue;
90+
}
8791
languageToInternationalMap.get(language).addTerm(currentHpoTermId, hpoLabel);
8892
} else {
89-
System.err.printf("[ERROR] Could not extract language for %s.", line);
93+
System.err.printf("[ERROR] Could not extract language for %s.\n", line);
9094
}
9195
}
9296

@@ -98,13 +102,13 @@ public HpInternationalOboParser(File file) {
98102
} catch (IOException e) {
99103
e.printStackTrace();
100104
}
101-
for (String language : languageToInternationalMap.keySet()) {
105+
/*for (String language : languageToInternationalMap.keySet()) {
102106
System.out.println(language);
103107
HpInternational international = languageToInternationalMap.get(language);
104108
for (var entry : international.getTermIdToLabelMap().entrySet()) {
105109
System.out.printf("\t%s: %s\n", entry.getKey().getValue(), entry.getValue());
106110
}
107-
}
111+
}*/
108112
}
109113

110114
public Map<String, HpInternational> getLanguageToInternationalMap() {

src/main/java/org/monarchinitiative/phenopacket2prompt/model/AgeNotSpecified.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package org.monarchinitiative.phenopacket2prompt.model;
22

3-
public class AgeNotSpecified implements PhenopacketAge {
3+
import java.util.Objects;
4+
5+
public final class AgeNotSpecified implements PhenopacketAge {
46
@Override
57
public String age() {
68
return "";
@@ -43,4 +45,18 @@ public int totalDays() {
4345

4446
@Override
4547
public boolean specified() {return false; }
48+
49+
@Override
50+
public int hashCode() {
51+
return Objects.hashCode(totalDays());
52+
}
53+
54+
@Override
55+
public boolean equals(Object obj) {
56+
if (! (obj instanceof PhenopacketAge)) return false;
57+
PhenopacketAge iso = (PhenopacketAge) obj;
58+
return iso.totalDays() == totalDays();
59+
}
60+
61+
4662
}

0 commit comments

Comments
 (0)