|
44 | 44 | "http://dl-learner.org/carcinogenesis#hasAtom", |
45 | 45 | "http://dl-learner.org/carcinogenesis#charge", |
46 | 46 | ], |
| 47 | + ["http://dl-learner.org/carcinogenesis#salmonella"], |
| 48 | + ["http://dl-learner.org/carcinogenesis#cytogen_sce"], |
| 49 | + ["http://dl-learner.org/carcinogenesis#cytogen_ca"], |
| 50 | + ["http://dl-learner.org/carcinogenesis#mouse_lymph"], |
| 51 | + ["http://dl-learner.org/carcinogenesis#amesTestPositive"], |
47 | 52 | ], |
48 | 53 | ), |
49 | 54 | entities, |
|
71 | 76 |
|
72 | 77 | print("\nUsing literals:") |
73 | 78 | features = [] |
74 | | -for charges in literals: |
75 | | - charges = list(map(float, *charges)) # type: ignore |
| 79 | + |
| 80 | +for literal in literals: |
| 81 | + charges, salmonella, sce, ca, lymph, pos_test = literal |
| 82 | + |
| 83 | + charges = list(charges) # type: ignore |
| 84 | + |
| 85 | + salmonella_feat = int(salmonella == "true") |
| 86 | + salmonella_missing = int(salmonella == np.NaN) |
| 87 | + |
| 88 | + sce_feat = int(sce == "true") |
| 89 | + sce_missing = int(sce == np.NaN) |
| 90 | + |
| 91 | + ca_feat = int(ca == "true") |
| 92 | + ca_missing = int(ca == np.NaN) |
| 93 | + |
| 94 | + lymph_feat = int(lymph == "true") |
| 95 | + lymph_missing = int(lymph == np.NaN) |
| 96 | + |
| 97 | + pos_test_feat = int(pos_test == "true") |
| 98 | + pos_test_missing = int(pos_test == np.NaN) |
| 99 | + |
76 | 100 | features.append( |
77 | 101 | [ |
78 | 102 | np.max(charges), |
|
81 | 105 | np.std(charges), # type: ignore |
82 | 106 | len(charges), # type: ignore |
83 | 107 | np.sum(charges), # type: ignore |
| 108 | + salmonella_feat, |
| 109 | + salmonella_missing, |
| 110 | + sce_feat, |
| 111 | + sce_missing, |
| 112 | + ca_feat, |
| 113 | + ca_missing, |
| 114 | + lymph_feat, |
| 115 | + lymph_missing, |
| 116 | + pos_test_feat, |
| 117 | + pos_test_missing, |
84 | 118 | ] |
85 | 119 | ) |
86 | 120 | features = np.array(features) # type: ignore |
87 | | -train_embeddings = np.hstack( |
| 121 | + |
| 122 | +train_embeddings2 = np.hstack( |
88 | 123 | (train_embeddings, features[: len(train_entities)]) # type: ignore |
89 | 124 | ) |
90 | | -test_embeddings = np.hstack( |
| 125 | +test_embeddings2 = np.hstack( |
91 | 126 | (test_embeddings, features[len(train_entities) :]) # type: ignore |
92 | 127 | ) |
93 | 128 |
|
| 129 | +train_features = features[: len(train_entities)] |
| 130 | +test_features = features[len(train_entities) :] |
| 131 | + |
| 132 | +# Fit a Support Vector Machine on the train embeddings combined with the literal features. |
94 | 133 | clf = GridSearchCV( |
95 | 134 | SVC(random_state=RANDOM_STATE), {"C": [10 ** i for i in range(-3, 4)]} |
96 | 135 | ) |
97 | | -clf.fit(train_embeddings, train_labels) |
| 136 | +clf.fit(train_embeddings2, train_labels) |
98 | 137 |
|
99 | | -predictions = clf.predict(test_embeddings) |
| 138 | +# Evaluate the Support Vector Machine on test embeddings. |
| 139 | +predictions2 = clf.predict(test_embeddings2) |
100 | 140 | print( |
101 | 141 | f"Predicted {len(test_entities)} entities with an accuracy of " |
102 | | - + f"{accuracy_score(test_labels, predictions) * 100 :.4f}%" |
| 142 | + + f"{accuracy_score(test_labels, predictions2) * 100 :.4f}%" |
103 | 143 | ) |
104 | 144 | print(f"Confusion Matrix ([[TN, FP], [FN, TP]]):") |
105 | | -print(confusion_matrix(test_labels, predictions)) |
| 145 | +print(confusion_matrix(test_labels, predictions2)) |
106 | 146 |
|
107 | | -# Reduce the dimensions of entity embeddings to represent them in a 2D plane. |
108 | | -X_tsne = TSNE(random_state=RANDOM_STATE).fit_transform( |
109 | | - np.vstack((train_embeddings, test_embeddings)) |
110 | | -) |
| 147 | +f, ax = plt.subplots(1, 2, figsize=(15, 6)) |
111 | 148 |
|
112 | 149 | # Define the color map. |
113 | 150 | colors = ["r", "g"] |
114 | 151 | color_map = {} |
115 | 152 | for i, label in enumerate(set(labels)): |
116 | 153 | color_map[label] = colors[i] |
117 | 154 |
|
118 | | -# Set the graph with a certain size. |
119 | | -plt.figure(figsize=(10, 4)) |
| 155 | +ax[0].set_title( |
| 156 | + f"Without Literals ({accuracy_score(test_labels, predictions) * 100:.2f}%)" |
| 157 | +) |
| 158 | + |
| 159 | +# Reduce the dimensions of entity embeddings without literals to represent them in a 2D plane. |
| 160 | +X_tsne = TSNE(random_state=RANDOM_STATE).fit_transform( |
| 161 | + np.vstack((train_embeddings, test_embeddings)) |
| 162 | +) |
120 | 163 |
|
121 | | -# Plot the train embeddings. |
122 | | -plt.scatter( |
| 164 | +# Plot the train embeddings without literals. |
| 165 | +ax[0].scatter( |
123 | 166 | X_tsne[: len(train_entities), 0], |
124 | 167 | X_tsne[: len(train_entities), 1], |
125 | 168 | edgecolors=[color_map[i] for i in labels[: len(train_entities)]], |
126 | 169 | facecolors=[color_map[i] for i in labels[: len(train_entities)]], |
127 | 170 | ) |
128 | 171 |
|
129 | | -# Plot the test embeddings. |
130 | | -plt.scatter( |
| 172 | +# Plot the test embeddings without literals. |
| 173 | +ax[0].scatter( |
131 | 174 | X_tsne[len(train_entities) :, 0], |
132 | 175 | X_tsne[len(train_entities) :, 1], |
133 | 176 | edgecolors=[color_map[i] for i in labels[len(train_entities) :]], |
134 | 177 | facecolors="none", |
135 | 178 | ) |
136 | 179 |
|
137 | | -# Annotate few points. |
138 | | -plt.annotate( |
139 | | - entities[25].split("/")[-1], |
140 | | - xy=(X_tsne[25, 0], X_tsne[25, 1]), |
141 | | - xycoords="data", |
142 | | - xytext=(0.01, 0.0), |
143 | | - fontsize=8, |
144 | | - textcoords="axes fraction", |
145 | | - arrowprops=dict(arrowstyle="->", facecolor="black"), |
| 180 | +# Create a legend. |
| 181 | +ax[0].scatter([], [], edgecolors="r", facecolors="r", label="train -") |
| 182 | +ax[0].scatter([], [], edgecolors="g", facecolors="g", label="train +") |
| 183 | +ax[0].scatter([], [], edgecolors="r", facecolors="none", label="test -") |
| 184 | +ax[0].scatter([], [], edgecolors="g", facecolors="none", label="test +") |
| 185 | +ax[0].legend(loc="upper right", ncol=2) |
| 186 | + |
| 187 | +ax[1].set_title( |
| 188 | + f"With Literals ({accuracy_score(test_labels, predictions2) * 100 :.2f}%)" |
146 | 189 | ) |
147 | | -plt.annotate( |
148 | | - entities[35].split("/")[-1], |
149 | | - xy=(X_tsne[35, 0], X_tsne[35, 1]), |
150 | | - xycoords="data", |
151 | | - xytext=(0.4, 0.0), |
152 | | - fontsize=8, |
153 | | - textcoords="axes fraction", |
154 | | - arrowprops=dict(arrowstyle="->", facecolor="black"), |
| 190 | + |
| 191 | +# Reduce the dimensions of entity embeddings with literals to represent them in a 2D plane. |
| 192 | +X_tsne = TSNE(random_state=RANDOM_STATE).fit_transform( |
| 193 | + np.vstack((train_embeddings2, test_embeddings2)) |
155 | 194 | ) |
156 | 195 |
|
157 | | -# Create a legend. |
158 | | -plt.scatter([], [], edgecolors="r", facecolors="r", label="train -") |
159 | | -plt.scatter([], [], edgecolors="g", facecolors="g", label="train +") |
160 | | -plt.scatter([], [], edgecolors="r", facecolors="none", label="test -") |
161 | | -plt.scatter([], [], edgecolors="g", facecolors="none", label="test +") |
162 | | -plt.legend(loc="upper right", ncol=2) |
163 | | - |
164 | | -# Plot the test embeddings. |
165 | | -plt.scatter( |
| 196 | +# Plot the train embeddings with literals. |
| 197 | +ax[1].scatter( |
| 198 | + X_tsne[: len(train_entities), 0], |
| 199 | + X_tsne[: len(train_entities), 1], |
| 200 | + edgecolors=[color_map[i] for i in labels[: len(train_entities)]], |
| 201 | + facecolors=[color_map[i] for i in labels[: len(train_entities)]], |
| 202 | +) |
| 203 | + |
| 204 | +# Plot the test embeddings with literals. |
| 205 | +ax[1].scatter( |
166 | 206 | X_tsne[len(train_entities) :, 0], |
167 | 207 | X_tsne[len(train_entities) :, 1], |
168 | 208 | edgecolors=[color_map[i] for i in labels[len(train_entities) :]], |
169 | 209 | facecolors="none", |
170 | 210 | ) |
171 | 211 |
|
172 | | -# Display the graph with a title, removing the axes for |
173 | | -# better readability. |
174 | | -plt.title("pyRDF2Vec", fontsize=32) |
175 | | -plt.axis("off") |
| 212 | +# Create a legend. |
| 213 | +ax[1].scatter([], [], edgecolors="r", facecolors="r", label="train -") |
| 214 | +ax[1].scatter([], [], edgecolors="g", facecolors="g", label="train +") |
| 215 | +ax[1].scatter([], [], edgecolors="r", facecolors="none", label="test -") |
| 216 | +ax[1].scatter([], [], edgecolors="g", facecolors="none", label="test +") |
| 217 | +ax[1].legend(loc="upper right", ncol=2) |
| 218 | + |
176 | 219 | plt.show() |
0 commit comments