|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +import pytest |
| 5 | +import spacy |
| 6 | +from deep_reference_parser.prodigy.numbered_reference_annotator import NumberedReferenceAnnotator |
| 7 | + |
@pytest.fixture(scope="function")
def nra():
    """Build a new ``NumberedReferenceAnnotator`` for every test function."""
    annotator = NumberedReferenceAnnotator()
    return annotator
| 11 | + |
| 12 | + |
def test_numbered_reference_splitter(nra):
    """Check that three numbered references are split into three spans.

    The fixture document is a pre-tokenised reference section in which each
    reference is introduced by an index ("1.", "2.", "3.") on its own line.
    The annotator's spans are mapped back onto the raw text and compared
    with the expected reference strings.
    """

    # NOTE(review): the last three tokens below all reuse id 92, and their
    # text/offsets ("\n" 470-471, "3" 471-472, "." 472-473) do not line up
    # with the trailing " \n 4." in "text". Left untouched because the
    # expected spans may depend on them -- confirm and regenerate the fixture.
    numbered_reference = {
        "text": "References\n 1. \n Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168. \n 2. \n WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126. \n 3. \n Consolidated guidelines on the use of antiretroviral drugs for treating and preventing HIV infection: \n recommendations for a public health approach. Geneva: World Health Organization; 2013:272. \n 4.",
        "tokens": [
            {"text": "References", "start": 0, "end": 10, "id": 0},
            {"text": "\n ", "start": 10, "end": 12, "id": 1},
            {"text": "1", "start": 12, "end": 13, "id": 2},
            {"text": ".", "start": 13, "end": 14, "id": 3},
            {"text": "\n ", "start": 15, "end": 17, "id": 4},
            {"text": "Global", "start": 17, "end": 23, "id": 5},
            {"text": "update", "start": 24, "end": 30, "id": 6},
            {"text": "on", "start": 31, "end": 33, "id": 7},
            {"text": "the", "start": 34, "end": 37, "id": 8},
            {"text": "health", "start": 38, "end": 44, "id": 9},
            {"text": "sector", "start": 45, "end": 51, "id": 10},
            {"text": "response", "start": 52, "end": 60, "id": 11},
            {"text": "to", "start": 61, "end": 63, "id": 12},
            {"text": "HIV", "start": 64, "end": 67, "id": 13},
            {"text": ",", "start": 67, "end": 68, "id": 14},
            {"text": "2014", "start": 69, "end": 73, "id": 15},
            {"text": ".", "start": 73, "end": 74, "id": 16},
            {"text": "Geneva", "start": 75, "end": 81, "id": 17},
            {"text": ":", "start": 81, "end": 82, "id": 18},
            {"text": "World", "start": 83, "end": 88, "id": 19},
            {"text": "Health", "start": 89, "end": 95, "id": 20},
            {"text": "Organization", "start": 96, "end": 108, "id": 21},
            {"text": ";", "start": 108, "end": 109, "id": 22},
            {"text": "\n ", "start": 110, "end": 112, "id": 23},
            {"text": "2014:168", "start": 112, "end": 120, "id": 24},
            {"text": ".", "start": 120, "end": 121, "id": 25},
            {"text": "\n ", "start": 122, "end": 124, "id": 26},
            {"text": "2", "start": 124, "end": 125, "id": 27},
            {"text": ".", "start": 125, "end": 126, "id": 28},
            {"text": "\n ", "start": 127, "end": 129, "id": 29},
            {"text": "WHO", "start": 129, "end": 132, "id": 30},
            {"text": ",", "start": 132, "end": 133, "id": 31},
            {"text": "UNICEF", "start": 134, "end": 140, "id": 32},
            {"text": ",", "start": 140, "end": 141, "id": 33},
            {"text": "UNAIDS", "start": 142, "end": 148, "id": 34},
            {"text": ".", "start": 148, "end": 149, "id": 35},
            {"text": "Global", "start": 150, "end": 156, "id": 36},
            {"text": "update", "start": 157, "end": 163, "id": 37},
            {"text": "on", "start": 164, "end": 166, "id": 38},
            {"text": "HIV", "start": 167, "end": 170, "id": 39},
            {"text": "treatment", "start": 171, "end": 180, "id": 40},
            {"text": "2013", "start": 181, "end": 185, "id": 41},
            {"text": ":", "start": 185, "end": 186, "id": 42},
            {"text": "results", "start": 187, "end": 194, "id": 43},
            {"text": ",", "start": 194, "end": 195, "id": 44},
            {"text": "impact", "start": 196, "end": 202, "id": 45},
            {"text": "and", "start": 203, "end": 206, "id": 46},
            {"text": "\n ", "start": 207, "end": 209, "id": 47},
            {"text": "opportunities", "start": 209, "end": 222, "id": 48},
            {"text": ".", "start": 222, "end": 223, "id": 49},
            {"text": "Geneva", "start": 224, "end": 230, "id": 50},
            {"text": ":", "start": 230, "end": 231, "id": 51},
            {"text": "World", "start": 232, "end": 237, "id": 52},
            {"text": "Health", "start": 238, "end": 244, "id": 53},
            {"text": "Organization", "start": 245, "end": 257, "id": 54},
            {"text": ";", "start": 257, "end": 258, "id": 55},
            {"text": "2013:126", "start": 259, "end": 267, "id": 56},
            {"text": ".", "start": 267, "end": 268, "id": 57},
            {"text": "\n ", "start": 269, "end": 271, "id": 58},
            {"text": "3", "start": 271, "end": 272, "id": 59},
            {"text": ".", "start": 272, "end": 273, "id": 60},
            {"text": "\n ", "start": 274, "end": 276, "id": 61},
            {"text": "Consolidated", "start": 276, "end": 288, "id": 62},
            {"text": "guidelines", "start": 289, "end": 299, "id": 63},
            {"text": "on", "start": 300, "end": 302, "id": 64},
            {"text": "the", "start": 303, "end": 306, "id": 65},
            {"text": "use", "start": 307, "end": 310, "id": 66},
            {"text": "of", "start": 311, "end": 313, "id": 67},
            {"text": "antiretroviral", "start": 314, "end": 328, "id": 68},
            {"text": "drugs", "start": 329, "end": 334, "id": 69},
            {"text": "for", "start": 335, "end": 338, "id": 70},
            {"text": "treating", "start": 339, "end": 347, "id": 71},
            {"text": "and", "start": 348, "end": 351, "id": 72},
            {"text": "preventing", "start": 352, "end": 362, "id": 73},
            {"text": "HIV", "start": 363, "end": 366, "id": 74},
            {"text": "infection", "start": 367, "end": 376, "id": 75},
            {"text": ":", "start": 376, "end": 377, "id": 76},
            {"text": "\n ", "start": 378, "end": 380, "id": 77},
            {"text": "recommendations", "start": 380, "end": 395, "id": 78},
            {"text": "for", "start": 396, "end": 399, "id": 79},
            {"text": "a", "start": 400, "end": 401, "id": 80},
            {"text": "public", "start": 402, "end": 408, "id": 81},
            {"text": "health", "start": 409, "end": 415, "id": 82},
            {"text": "approach", "start": 416, "end": 424, "id": 83},
            {"text": ".", "start": 424, "end": 425, "id": 84},
            {"text": "Geneva", "start": 426, "end": 432, "id": 85},
            {"text": ":", "start": 432, "end": 433, "id": 86},
            {"text": "World", "start": 434, "end": 439, "id": 87},
            {"text": "Health", "start": 440, "end": 446, "id": 88},
            {"text": "Organization", "start": 447, "end": 459, "id": 89},
            {"text": ";", "start": 459, "end": 460, "id": 90},
            {"text": "2013:272", "start": 461, "end": 469, "id": 91},
            {"text": ".", "start": 469, "end": 470, "id": 92},
            {"text": "\n", "start": 470, "end": 471, "id": 92},
            {"text": "3", "start": 471, "end": 472, "id": 92},
            {"text": ".", "start": 472, "end": 473, "id": 92},
        ]
    }

    docs = list(nra.run([numbered_reference]))
    text = docs[0]["text"]
    spans = docs[0]["spans"]

    # Check the span count before indexing, so a wrong count is reported as
    # an assertion failure rather than an IndexError.
    assert len(spans) == 3

    # Map each span's character offsets back onto the raw text.
    ref_1, ref_2, ref_3 = (text[span["start"]:span["end"]] for span in spans)

    # ref_1 is compared without stripping, unlike ref_2 and ref_3.
    assert ref_1 == "Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168."
    assert ref_2.strip() == "WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126."
    assert ref_3.strip() == "Consolidated guidelines on the use of antiretroviral drugs for treating and preventing HIV infection: \n recommendations for a public health approach. Geneva: World Health Organization; 2013:272."
| 128 | + |
def test_numbered_reference_splitter_line_endings(nra):
    """
    Test the case where there are two line endings immediately preceding
    the reference index.

    The fixture document holds two complete references followed by a bare
    "3." index; the annotator's spans are mapped back onto the raw text and
    compared with the expected reference strings.
    """

    # NOTE(review): the last four tokens below have overlapping offsets
    # ("." 260-261, "\n\n" 261-263, "3" 262-264, "." 263-265) that do not
    # match "text", where the final "." of reference 2 sits at 267-268.
    # Left untouched because the expected spans may depend on them --
    # confirm and regenerate the fixture.
    numbered_reference = {
        "text": "References\n\n1. \n Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168. \n\n2. \n WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126.\n\n3.",
        "tokens": [
            {"text": "References", "start": 0, "end": 10, "id": 0},
            {"text": "\n\n", "start": 10, "end": 12, "id": 1},
            {"text": "1", "start": 12, "end": 13, "id": 2},
            {"text": ".", "start": 13, "end": 14, "id": 3},
            {"text": "\n ", "start": 15, "end": 17, "id": 4},
            {"text": "Global", "start": 17, "end": 23, "id": 5},
            {"text": "update", "start": 24, "end": 30, "id": 6},
            {"text": "on", "start": 31, "end": 33, "id": 7},
            {"text": "the", "start": 34, "end": 37, "id": 8},
            {"text": "health", "start": 38, "end": 44, "id": 9},
            {"text": "sector", "start": 45, "end": 51, "id": 10},
            {"text": "response", "start": 52, "end": 60, "id": 11},
            {"text": "to", "start": 61, "end": 63, "id": 12},
            {"text": "HIV", "start": 64, "end": 67, "id": 13},
            {"text": ",", "start": 67, "end": 68, "id": 14},
            {"text": "2014", "start": 69, "end": 73, "id": 15},
            {"text": ".", "start": 73, "end": 74, "id": 16},
            {"text": "Geneva", "start": 75, "end": 81, "id": 17},
            {"text": ":", "start": 81, "end": 82, "id": 18},
            {"text": "World", "start": 83, "end": 88, "id": 19},
            {"text": "Health", "start": 89, "end": 95, "id": 20},
            {"text": "Organization", "start": 96, "end": 108, "id": 21},
            {"text": ";", "start": 108, "end": 109, "id": 22},
            {"text": "\n ", "start": 110, "end": 112, "id": 23},
            {"text": "2014:168", "start": 112, "end": 120, "id": 24},
            {"text": ".", "start": 120, "end": 121, "id": 25},
            {"text": "\n\n", "start": 122, "end": 124, "id": 26},
            {"text": "2", "start": 124, "end": 125, "id": 27},
            {"text": ".", "start": 125, "end": 126, "id": 28},
            {"text": "\n ", "start": 127, "end": 129, "id": 29},
            {"text": "WHO", "start": 129, "end": 132, "id": 30},
            {"text": ",", "start": 132, "end": 133, "id": 31},
            {"text": "UNICEF", "start": 134, "end": 140, "id": 32},
            {"text": ",", "start": 140, "end": 141, "id": 33},
            {"text": "UNAIDS", "start": 142, "end": 148, "id": 34},
            {"text": ".", "start": 148, "end": 149, "id": 35},
            {"text": "Global", "start": 150, "end": 156, "id": 36},
            {"text": "update", "start": 157, "end": 163, "id": 37},
            {"text": "on", "start": 164, "end": 166, "id": 38},
            {"text": "HIV", "start": 167, "end": 170, "id": 39},
            {"text": "treatment", "start": 171, "end": 180, "id": 40},
            {"text": "2013", "start": 181, "end": 185, "id": 41},
            {"text": ":", "start": 185, "end": 186, "id": 42},
            {"text": "results", "start": 187, "end": 194, "id": 43},
            {"text": ",", "start": 194, "end": 195, "id": 44},
            {"text": "impact", "start": 196, "end": 202, "id": 45},
            {"text": "and", "start": 203, "end": 206, "id": 46},
            {"text": "\n ", "start": 207, "end": 209, "id": 47},
            {"text": "opportunities", "start": 209, "end": 222, "id": 48},
            {"text": ".", "start": 222, "end": 223, "id": 49},
            {"text": "Geneva", "start": 224, "end": 230, "id": 50},
            {"text": ":", "start": 230, "end": 231, "id": 51},
            {"text": "World", "start": 232, "end": 237, "id": 52},
            {"text": "Health", "start": 238, "end": 244, "id": 53},
            {"text": "Organization", "start": 245, "end": 257, "id": 54},
            {"text": ";", "start": 257, "end": 258, "id": 55},
            {"text": "2013:126", "start": 259, "end": 267, "id": 56},
            {"text": ".", "start": 260, "end": 261, "id": 57},
            {"text": "\n\n", "start": 261, "end": 263, "id": 58},
            {"text": "3", "start": 262, "end": 264, "id": 59},
            {"text": ".", "start": 263, "end": 265, "id": 60},
        ]
    }

    docs = list(nra.run([numbered_reference]))
    text = docs[0]["text"]
    spans = docs[0]["spans"]

    # Check the span count before indexing, so a wrong count is reported as
    # an assertion failure rather than an IndexError.
    assert len(spans) == 2

    # Map each span's character offsets back onto the raw text.
    ref_1, ref_2 = (text[span["start"]:span["end"]] for span in spans)

    assert ref_1.strip() == "Global update on the health sector response to HIV, 2014. Geneva: World Health Organization; \n 2014:168."
    # NOTE(review): the expected text for ref_2 has no trailing "." although
    # the raw text does -- confirm this is intended and not a side effect of
    # the inconsistent token offsets flagged above.
    assert ref_2.strip() == "WHO, UNICEF, UNAIDS. Global update on HIV treatment 2013: results, impact and \n opportunities. Geneva: World Health Organization; 2013:126"
0 commit comments