diff --git a/Makefile b/Makefile index a159af5..e86113c 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,8 @@ install: - pipenv install --dev -e . + pip install -e . download_models: - pipenv run python -m spacy download en - pipenv run python -m nltk.downloader averaged_perceptron_tagger + python -m spacy download en clean: -rm -rf build @@ -17,11 +16,11 @@ clean_test: dist: make clean make download_models - pipenv run python setup.py bdist_wheel --dist-dir target + python setup.py bdist_wheel --dist-dir target test: make clean_test - pipenv run nosetests --with-coverage --cover-html -s -v --cover-package=nerds + nosetests --with-coverage --cover-html -s --verbosity=2 --cover-package=nerds nerds/test/ lint: - pipenv run flake8 nerds --verbose + flake8 --ignore=W605,W504 --verbose nerds diff --git a/Makefile.pipenv b/Makefile.pipenv new file mode 100644 index 0000000..a159af5 --- /dev/null +++ b/Makefile.pipenv @@ -0,0 +1,27 @@ +install: + pipenv install --dev -e . + +download_models: + pipenv run python -m spacy download en + pipenv run python -m nltk.downloader averaged_perceptron_tagger + +clean: + -rm -rf build + -rm -rf target + -find . -name "__pycache__" -type d -depth -exec rm -rf {} \; + +clean_test: + -rm -rf cover + -rm .coverage + +dist: + make clean + make download_models + pipenv run python setup.py bdist_wheel --dist-dir target + +test: + make clean_test + pipenv run nosetests --with-coverage --cover-html -s -v --cover-package=nerds + +lint: + pipenv run flake8 nerds --verbose diff --git a/Pipfile b/Pipfile index 7de5051..66a9531 100644 --- a/Pipfile +++ b/Pipfile @@ -20,7 +20,7 @@ scipy = "*" sklearn = "*" sklearn-crfsuite = "*" spacy = "==2.0.11" -tensorflow = "*" +tensorflow >= "1.15.2" nose = "*" coverage = "*" "flake8" = "*" diff --git a/Pipfile.lock b/Pipfile.lock deleted file mode 100644 index 171961b..0000000 --- a/Pipfile.lock +++ /dev/null @@ -1,1335 +0,0 @@ -{ - "_meta": { - "hash": { - "sha256": "19fca82c6035492ccb511569ec10d7677d93da24ed0a12958dfd02a00636b8dd" - }, - "pipfile-spec": 6, - "requires": { - "python_version": "3.6" - }, - "sources": [ - { - "name": "pypi", - "url": "https://pypi.org/simple", - "verify_ssl": true - } - ] - }, - "default": { - "absl-py": { - "hashes": [ - "sha256:e0eb8358b549552b1cc5972350bc3e41dd0a926c15b3ff95ce60f3c78c80824c" - ], - "version": "==0.2.2" - }, - "anago": { - "hashes": [ - "sha256:a4bd7b0d6109408fbdd9cdd2d6bfb60221bd7293c0645a75e6fddddce40abcc1" - ], - "index": "pypi", - "version": "==1.0.6" - }, - "astor": { - "hashes": [ - "sha256:64c805f1ad6fbc505633416b6174fc23796eb164f371a7dc1f3951ea30560fb5", - "sha256:ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d" - ], - "version": "==0.6.2" - }, - "bleach": { - "hashes": [ - "sha256:978e758599b54cd3caa2e160d74102879b230ea8dc93871d0783721eef58bc65", - "sha256:e67f46adcec78dbc3c04462f3aba3213a673d5652eba2609ed1ef15492a44b8d" - ], - "version": "==1.5.0" - }, - "certifi": { - "hashes": [ - "sha256:13e698f54293db9f89122b0581843a782ad0934a4fe0172d2a980ba77fc61bb7", - "sha256:9fa520c1bacfb634fa7af20a76bcbd3d5fb390481724c597da32c719a7dca4b0" - ], - "version": "==2018.4.16" - }, - "chardet": { - "hashes": [ - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" - ], - "version": "==3.0.4" - }, - "coverage": { - "hashes": [ - "sha256:03481e81d558d30d230bc12999e3edffe392d244349a90f4ef9b88425fac74ba", - "sha256:0b136648de27201056c1869a6c0d4e23f464750fd9a9ba9750b8336a244429ed", 
- "sha256:198626739a79b09fa0a2f06e083ffd12eb55449b5f8bfdbeed1df4910b2ca640", - "sha256:28b2191e7283f4f3568962e373b47ef7f0392993bb6660d079c62bd50fe9d162", - "sha256:2eb564bbf7816a9d68dd3369a510be3327f1c618d2357fa6b1216994c2e3d508", - "sha256:337ded681dd2ef9ca04ef5d93cfc87e52e09db2594c296b4a0a3662cb1b41249", - "sha256:3a2184c6d797a125dca8367878d3b9a178b6fdd05fdc2d35d758c3006a1cd694", - "sha256:3c79a6f7b95751cdebcd9037e4d06f8d5a9b60e4ed0cd231342aa8ad7124882a", - "sha256:3d72c20bd105022d29b14a7d628462ebdc61de2f303322c0212a054352f3b287", - "sha256:3eb42bf89a6be7deb64116dd1cc4b08171734d721e7a7e57ad64cc4ef29ed2f1", - "sha256:4635a184d0bbe537aa185a34193898eee409332a8ccb27eea36f262566585000", - "sha256:56e448f051a201c5ebbaa86a5efd0ca90d327204d8b059ab25ad0f35fbfd79f1", - "sha256:5a13ea7911ff5e1796b6d5e4fbbf6952381a611209b736d48e675c2756f3f74e", - "sha256:69bf008a06b76619d3c3f3b1983f5145c75a305a0fea513aca094cae5c40a8f5", - "sha256:6bc583dc18d5979dc0f6cec26a8603129de0304d5ae1f17e57a12834e7235062", - "sha256:701cd6093d63e6b8ad7009d8a92425428bc4d6e7ab8d75efbb665c806c1d79ba", - "sha256:7608a3dd5d73cb06c531b8925e0ef8d3de31fed2544a7de6c63960a1e73ea4bc", - "sha256:76ecd006d1d8f739430ec50cc872889af1f9c1b6b8f48e29941814b09b0fd3cc", - "sha256:7aa36d2b844a3e4a4b356708d79fd2c260281a7390d678a10b91ca595ddc9e99", - "sha256:7d3f553904b0c5c016d1dad058a7554c7ac4c91a789fca496e7d8347ad040653", - "sha256:7e1fe19bd6dce69d9fd159d8e4a80a8f52101380d5d3a4d374b6d3eae0e5de9c", - "sha256:8c3cb8c35ec4d9506979b4cf90ee9918bc2e49f84189d9bf5c36c0c1119c6558", - "sha256:9d6dd10d49e01571bf6e147d3b505141ffc093a06756c60b053a859cb2128b1f", - "sha256:be6cfcd8053d13f5f5eeb284aa8a814220c3da1b0078fa859011c7fffd86dab9", - "sha256:c1bb572fab8208c400adaf06a8133ac0712179a334c09224fb11393e920abcdd", - "sha256:de4418dadaa1c01d497e539210cb6baa015965526ff5afc078c57ca69160108d", - "sha256:e05cb4d9aad6233d67e0541caa7e511fa4047ed7750ec2510d466e806e0255d6", - "sha256:f3f501f345f24383c0000395b26b726e46758b71393267aeae0bd36f8b3ade80" - ], - "index": "pypi", - "version": "==4.5.1" - }, - "cymem": { - "hashes": [ - "sha256:00bb3645dfb9a020d735ba3d6f822b04656388180588d8b2cebde967ee678bcc", - "sha256:0dd61d05977839a922c0d797c355b98949210575918b1743b41e38ae9fb2c3a7", - "sha256:4bc1056b52d959fcbb1e0f32ec84fa131754d6be1e36b65782c6ac86419f4bf3", - "sha256:4c5d9ca6ec706792b8d9b1faf6db77b95545c388c768b21d940f197aa7efbb7e", - "sha256:50292f4dd0d950a8698bae27d71efe59da7ff08e591b735e08b658aae42c4745", - "sha256:616d06333f46dd03c128d97912d361183fc02249e6420a7b7907b41214c51562", - "sha256:944af97d4d34a2470b5199f1c31d2dfc79cdec7bd7a41354d839a8ab87fdfaa6", - "sha256:b38056efb99078b06c504adb5f03a8d9e822a5543451737b746028a71c4b1ac3", - "sha256:b6513b2926c60d641f159e79e6fb16460dfb50ebcce31a5af0370c51837c7efc", - "sha256:daa6003fcc199752ab703142021cff74774872a932303b240dc0ea177adf295d", - "sha256:f06d9b50da0474d7405674d8101c319d89a17d33792d6d429fe3d5c64f0d9df1" - ], - "version": "==1.31.2" - }, - "cytoolz": { - "hashes": [ - "sha256:476a2ad176de5eaef80499b7b43d4f72ba6d23df33d349088dae315e9b31c552" - ], - "version": "==0.8.2" - }, - "decorator": { - "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" - ], - "version": "==4.3.0" - }, - "dill": { - "hashes": [ - "sha256:624dc244b94371bb2d6e7f40084228a2edfff02373fe20e018bef1ee92fdd5b3" - ], - "version": "==0.2.8.2" - }, - "flake8": { - "hashes": [ - 
"sha256:7253265f7abd8b313e3892944044a365e3f4ac3fcdcfb4298f55ee9ddf188ba0", - "sha256:c7841163e2b576d435799169b78703ad6ac1bbb0f199994fc05f700b2a90ea37" - ], - "index": "pypi", - "version": "==3.5.0" - }, - "future": { - "hashes": [ - "sha256:e39ced1ab767b5936646cedba8bcce582398233d6a627067d4c6a454c90cfedb" - ], - "index": "pypi", - "version": "==0.16.0" - }, - "gast": { - "hashes": [ - "sha256:7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930" - ], - "version": "==0.2.0" - }, - "grpcio": { - "hashes": [ - "sha256:0feade5de967be3c9ee041662d1347fc537ad05ccbcf05bcf1efa05072bef926", - "sha256:1ae02a9787cf2c5f25add0806f6271283b6074ab8619077d2b5c9037950c890b", - "sha256:2d7215dca11ba4aa49cc6c05b37e4b0a0f99727c8604e8ccd5ef1f6e06332200", - "sha256:35a4f6ffae88ce6a461e503ae91b62dc5c96013cafc717f2d7139686b5c39969", - "sha256:3be7635b4308e06449b2275a5e96a030bbf82ba6797ae8947f14667491924d81", - "sha256:3d1b3e7042a41b167334f718842f13deb80287886c9160efe31252602b13a128", - "sha256:468d4ce007cb859d5f9440cf4a7461cc172fd07d690300f4db88afaa78f01003", - "sha256:59a2fb52d286a38b9cbc7434eb473026fde0b20c223a10a99f5c3d4e395c2c2b", - "sha256:59c7670c902acce952ba709d9126cda87a45d7fed6bd568868e74171e4acd7f7", - "sha256:5b03fd3941c5e1a5deb01026bae025d319b38d3facb3e5fc491bca73e908d69e", - "sha256:65842e698776f4e49f62346c0f80fc31b34907e0df4247650c643113ef167122", - "sha256:68dbe71f890475e2824afbc5dc72714d1fca668bc15df0954bda4a8a5a53d0c7", - "sha256:868973b64b7e2464e5297cc660da588c542c175e85f6d2f7490d86c0dd5dbb4c", - "sha256:86f0c2062fde76789f7cdbf67d4ede116e7e1ceaf4c327fff7b9d17eb5852403", - "sha256:9444863aaba55b662719e22680f11134182604619f241cc607020e5b3786f4cd", - "sha256:9ac704e25d271af62c1ea72f1cb42ec7938f26f00314a8f324999ac5e1bf55eb", - "sha256:a02ef0354fb455a9ce2ad869a40f28f20a64147d46557c59b7269a15832c36d2", - "sha256:aabcdc960633231f9575252c061b480fc56a1ff6dcc7999fa5d4968f574d894f", - "sha256:b47a19a3be2f9608b4296bd16374c9a922d3206cf0a917792801a5cef5a2fa23", - "sha256:bf7bfe162057e6f1e3f4613b2a5f1157c8e286bddeaa40f7b8ce5054cb4b1413", - "sha256:c3cf3f431b41c39aa1501458d0e46086e699836536af873fe028dda1dfc6bcbd", - "sha256:cc7bd47eca988831d58a618908c825204d6ee8e90cdb9854a09b52a3b76ac168", - "sha256:cde83440fb4691d1bd8620ea919a9bd3199e6725e72d2c0d94898a2774c255ee", - "sha256:e1a03666852b956f7949c2a7f187dd54406cae2874c2ce26c1a0dafddf812cb2", - "sha256:e322eb5dc533cfbf21a9e964ebab80da391a26234a82288bfce505058913dfac", - "sha256:f0169d98670ef1db52e4f6930fd470c34731948350cabbe93087a8462b1f1da4", - "sha256:f14faadfd09aa8526536cd2149e274563f45b767fca1736ccc53803a6af3f90e" - ], - "version": "==1.12.1" - }, - "h5py": { - "hashes": [ - "sha256:0f8cd2acbacf3177b4427ed42639c911667b1f24d923388ab1f8ad466a12be5e", - "sha256:11277e3879098f921ee9e29105b20591e1dfdd44963357399f2abaa1a280c560", - "sha256:1241dec0c94ac32f3285cac1d6f44beabf80423e422ab03bd2686d731a8a9294", - "sha256:17b8187de0b3a945d8e8d031e7eb6ece2fce90791f9c5fde36f4396bf38fdde1", - "sha256:308e0758587ee16d4e73e7f2f8aae8351091e343bf0a43d2f697f9535465c816", - "sha256:37cacddf0e8209905f52537a8cf71da0dd9a4de62bd79247274c97b24a408997", - "sha256:38a23bb599748adf23d77f74885c0de6f4a7d9baa42f74e476bbf90fba2b47dd", - "sha256:47ab18b7b7bbc36fd2b606289b703b6f0ee915b923d6ad94dd17ac80ebffc280", - "sha256:486c78330af0bf33f5077b51d1888c0739c3cd1a03d5aade0d48572b3b5690ca", - "sha256:4e2183458d6ef1ae87dfb5d6acd0786359336cd9ac0ece6396c09b59fdaa3bd6", - "sha256:51d0595c3e58814c831f6cd2b664a5bf9590e26262c1d541b380d041e4fcb3c0", - 
"sha256:56d259d56822b70881760b243957f04a0cf133f0ec65eae6a33f562826aee899", - "sha256:5e6e777653169a3cc24ea56bb3d8c845ea391f8914c35bb6f350b0753a52891c", - "sha256:62bfb0ebb0f59e5dccc0b0dbbc0fc40dd1d1e09d04c0dc71f89790231531d4a2", - "sha256:67d89b64debfa021b54aa6f24bbf008403bd144748a0148596b518bce80d2fc4", - "sha256:9214ca445c18a37bfe9c165982c0e317e2f21f035c8d635d1c6d9fcbaf35b7a8", - "sha256:ab0c52850428d2e86029935389379c2c97f752e76b616da851deec8a4484f8ec", - "sha256:b2eff336697d8dfd712c5d93fef9f4e4d3e97d9d8c258801836b8664a239e07a", - "sha256:bb33fabc0b8f3fe3bb0f8d6821b2fad5b2a64c27a0808e8d1c5c1e3362062064", - "sha256:bd5353ab342bae1262b04745934cc1565df4cbc8d6a979a0c98f42209bd5c265", - "sha256:c45650de228ace7731e4280e14fb687f6d5c29cd666c5b22b42492b035e994d6", - "sha256:d5c0c01da45f901a3d429e7ef9e7e22baa869e1affb8715f1bf94e6a30020740", - "sha256:d75035db5bde802a29f4f29f18bb7548863d29ac90ccbf2c04c11799bbbba2c3", - "sha256:dda88206dc9464923f27f601000bc5b152ac0bd6d0122f098d4f239150a70076", - "sha256:e1c2ac5d0aa232c0f60fecc6bd1122346885086a176f939b91058c4c980cc226", - "sha256:e626c65a8587921ebc7fb8d31a49addfdd0b9a9aa96315ea484c09803337b955" - ], - "index": "pypi", - "version": "==2.8.0" - }, - "html5lib": { - "hashes": [ - "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868" - ], - "version": "==0.9999999" - }, - "hyperopt": { - "hashes": [ - "sha256:4f6e903f7640165ea3e4c622050b41ffab0bee7811ede23c7825a5884976d72f" - ], - "index": "pypi", - "version": "==0.1" - }, - "idna": { - "hashes": [ - "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", - "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" - ], - "version": "==2.7" - }, - "joblib": { - "hashes": [ - "sha256:9de5fe8bc953f871f862d27e77f153c31d545b84f2aa31a63b5165e912ad6dfa", - "sha256:aba9f97aa3e0548be6fc458b5d708be863eb4be35830caeb3faa7bd3d9afb7bb" - ], - "index": "pypi", - "version": "==0.12.0" - }, - "keras": { - "hashes": [ - "sha256:5b8499d157af217f1a5ee33589e774127ebc3e266c833c22cb5afbb0ed1734bf", - "sha256:fa71a1f576dbd643532b872b8952afb65cc3ff7ed20d172e6b49657b710b43d0" - ], - "index": "pypi", - "version": "==2.2.0" - }, - "keras-applications": { - "hashes": [ - "sha256:7dceb9820b39c01459ea5e8922add86eb99a9e14354c33dc9981d5f5077fa0ac", - "sha256:9924be748e5d180806d133c714d22895b997ed722757491dd99538851145d3bf" - ], - "version": "==1.0.2" - }, - "keras-preprocessing": { - "hashes": [ - "sha256:5283236f0b22a57b30bda766fc819b2ed2483c52f3e1f8b39fcc528f51f772e7", - "sha256:8649ba6377ecc06ea10e0a8a954df5600d115b4b626861e33c79b41ec03c5194" - ], - "version": "==1.0.1" - }, - "markdown": { - "hashes": [ - "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f", - "sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81" - ], - "version": "==2.6.11" - }, - "mccabe": { - "hashes": [ - "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", - "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" - ], - "version": "==0.6.1" - }, - "msgpack-numpy": { - "hashes": [ - "sha256:6947df61826a2917e38dbe07957a0c70dc82dce93ec38153dae850fdd21a4583", - "sha256:afc603c7cf8497fb125a8c8c713518a004e9662101f088e3d4fcf7688b08eeb3" - ], - "version": "==0.4.1" - }, - "msgpack-python": { - "hashes": [ - "sha256:378cc8a6d3545b532dfd149da715abae4fda2a3adb6d74e525d0d5e51f46909b" - ], - "version": "==0.5.6" - }, - "murmurhash": { - "hashes": [ - "sha256:651137ed3e1169342c9edade454f3beb7fcdf28d4ad1ac232725237eaf442d9a" - ], - "version": 
"==0.28.0" - }, - "networkx": { - "hashes": [ - "sha256:0d0e70e10dfb47601cbb3425a00e03e2a2e97477be6f80638fef91d54dd1e4b8", - "sha256:1b229b54fe9ccb009cee4de02a88552191497a542a7d5d34adab216b9f15c1ff", - "sha256:b3e0144d5fe6b7479b694e1b598a5545a38f3fc6f1e3c09173eb30f0c7a5770e" - ], - "index": "pypi", - "version": "==1.11" - }, - "nltk": { - "hashes": [ - "sha256:fe0eda251be65843be86d7de9abfbf7161732256f742e623b21243ec47bdb718" - ], - "index": "pypi", - "version": "==3.3.0" - }, - "nose": { - "hashes": [ - "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", - "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", - "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" - ], - "index": "pypi", - "version": "==1.3.7" - }, - "numpy": { - "hashes": [ - "sha256:07379fe0b450f6fd6e5934a9bc015025bb4ce1c8fbed3ca8bef29328b1bc9570", - "sha256:085afac75bbc97a096744fcfc97a4b321c5a87220286811e85089ae04885acdd", - "sha256:2d6481c6bdab1c75affc0fc71eb1bd4b3ecef620d06f2f60c3f00521d54be04f", - "sha256:2df854df882d322d5c23087a4959e145b953dfff2abe1774fec4f639ac2f3160", - "sha256:381ad13c30cd1d0b2f3da8a0c1a4aa697487e8bb0e9e0cbeb7439776bcb645f8", - "sha256:385f1ce46e08676505b692bfde918c1e0b350963a15ef52d77691c2cf0f5dbf6", - "sha256:4d278c2261be6423c5e63d8f0ceb1b0c6db3ff83f2906f4b860db6ae99ca1bb5", - "sha256:51c5dcb51cf88b34b7d04c15f600b07c6ccbb73a089a38af2ab83c02862318da", - "sha256:589336ba5199c8061239cf446ee2f2f1fcc0c68e8531ee1382b6fc0c66b2d388", - "sha256:5edf1acc827ed139086af95ce4449b7b664f57a8c29eb755411a634be280d9f2", - "sha256:6b82b81c6b3b70ed40bc6d0b71222ebfcd6b6c04a6e7945a936e514b9113d5a3", - "sha256:6c57f973218b776195d0356e556ec932698f3a563e2f640cfca7020086383f50", - "sha256:758d1091a501fd2d75034e55e7e98bfd1370dc089160845c242db1c760d944d9", - "sha256:8622db292b766719810e0cb0f62ef6141e15fe32b04e4eb2959888319e59336b", - "sha256:8b8dcfcd630f1981f0f1e3846fae883376762a0c1b472baa35b145b911683b7b", - "sha256:97fa8f1dceffab782069b291e38c4c2227f255cdac5f1e3346666931df87373e", - "sha256:9d69967673ab7b028c2df09cae05ba56bf4e39e3cb04ebe452b6035c3b49848e", - "sha256:9e1f53afae865cc32459ad211493cf9e2a3651a7295b7a38654ef3d123808996", - "sha256:a4a433b3a264dbc9aa9c7c241e87c0358a503ea6394f8737df1683c7c9a102ac", - "sha256:baadc5f770917ada556afb7651a68176559f4dca5f4b2d0947cd15b9fb84fb51", - "sha256:c725d11990a9243e6ceffe0ab25a07c46c1cc2c5dc55e305717b5afe856c9608", - "sha256:d696a8c87315a83983fc59dd27efe034292b9e8ad667aeae51a68b4be14690d9", - "sha256:e1864a4e9f93ddb2dc6b62ccc2ec1f8250ff4ac0d3d7a15c8985dd4e1fbd6418" - ], - "index": "pypi", - "version": "==1.14.5" - }, - "pathlib": { - "hashes": [ - "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" - ], - "version": "==1.0.1" - }, - "plac": { - "hashes": [ - "sha256:854693ad90367e8267112ffbb8955f57d6fdeac3191791dc9ffce80f87fd2370", - "sha256:ba3f719a018175f0a15a6b04e6cc79c25fd563d348aacd320c3644d2a9baf89b" - ], - "version": "==0.9.6" - }, - "preshed": { - "hashes": [ - "sha256:a6b3a9e34634600e3e410ec25e0debed4b65a47eb37514a063d189d1c425b4dd" - ], - "version": "==1.0.0" - }, - "protobuf": { - "hashes": [ - "sha256:12985d9f40c104da2f44ec089449214876809b40fdc5d9e43b93b512b9e74056", - "sha256:12c97fe27af12fc5d66b23f905ab09dd4fb0c68d5a74a419d914580e6d2e71e3", - "sha256:327fb9d8a8247bc780b9ea7ed03c0643bc0d22c139b761c9ec1efc7cc3f0923e", - "sha256:3895319db04c0b3baed74fb66be7ba9f4cd8e88a432b8e71032cdf08b2dfee23", - "sha256:695072063e256d32335d48b9484451f7c7948edc3dbd419469d6a778602682fc", - 
"sha256:7d786f3ef5b33a04e6538089674f244a3b0f588155016559d950989010af97d0", - "sha256:8bf82bb7a466a54be7272dcb492f71d55a2453a58d862fb74c3f2083f2768543", - "sha256:9bbc1ae1c33c1bd3a2fc05a3aec328544d2b039ff0ce6f000063628a32fad777", - "sha256:9f1087abb67b34e55108bc610936b34363a7aac692023bcbb17e065c253a1f80", - "sha256:9fefcb92a3784b446abf3641d9a14dad815bee88e0edd10b9a9e0e144d01a991", - "sha256:a37836aa47d1b81c2db1a6b7a5e79926062b5d76bd962115a0e615551be2b48d", - "sha256:cca22955443c55cf86f963a4ad7057bca95e4dcde84d6a493066d380cfab3bb0", - "sha256:d7ac50bc06d31deb07ace6de85556c1d7330e5c0958f3b2af85037d6d1182abf", - "sha256:dfe6899304b898538f4dc94fa0b281b56b70e40f58afa4c6f807805261cbe2e8" - ], - "version": "==3.6.0" - }, - "pyahocorasick": { - "hashes": [ - "sha256:3d584e7836ca7b066f99d7fdb384dc6ef7af211b2b139baedbd960c7c279bb7f" - ], - "index": "pypi", - "version": "==1.1.8" - }, - "pycodestyle": { - "hashes": [ - "sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766", - "sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9" - ], - "version": "==2.3.1" - }, - "pyflakes": { - "hashes": [ - "sha256:08bd6a50edf8cffa9fa09a463063c425ecaaf10d1eb0335a7e8b1401aef89e6f", - "sha256:8d616a382f243dbf19b54743f280b80198be0bca3a5396f1d2e1fca6223e8805" - ], - "version": "==1.6.0" - }, - "pymongo": { - "hashes": [ - "sha256:061085dfe4fbf1d9d6ed2f2e52fe6ab72559e48b4294370b433751638160d10b", - "sha256:07fdee1c5567f237796a8550233e04853785d8dcf95929f96ab519ed91543109", - "sha256:0d98731aaea8cb32b535c376f6785927e4e3d9459ffe1440b8a639827a849350", - "sha256:10f683950f70626ccedf4a662d1c0b3244e8e013c2067872af5633830abd1bfd", - "sha256:2954b99cfeb76776879e9f8a4cae9c5e19d5eff92d0b7b663ceddcf192adb66b", - "sha256:419ed5d5b76ef304815f354d9df7f2085acfd6ff7cc1b714ca702e2239b341c2", - "sha256:42ec201fd9a26e7c1e611e3db19324dead51dd4646391492eb238b41749340e8", - "sha256:4400fa92af310bf66b76c313c7ded3bb63f3d63b4f43c3bfbff552cf294dc9fa", - "sha256:4807dfbb5cdcfe0224329992dc48b897c780d0ad7553c3799d34f84ba5cab446", - "sha256:54daf67e1e7e7e5a5160c86123bdd39b1d3b25876c2ab38230dc2a764cb3d98f", - "sha256:5fd6ce5ed3c6c92d2c94756e6bf041304e5c7c5a5dbea31b8957d52a78bdf01d", - "sha256:601e00fe7fb283f04c95f5dafb787c0862f48ca015a6f1f81b460c74e4303873", - "sha256:7fbd9233e8b6741b047c5857e2ad5efb74091f167d7fa8a2a3379217165058f9", - "sha256:7ffac35362c07c103b024b89875e8d7f0625129b65c56fa8a3ecebbd56110405", - "sha256:833bc6cb2ec7058dea9f5840a9314ac74738d2117486a044e88f3976e37ea7a0", - "sha256:92cb26a2a9b38e8df5215803f950b20a6c847d5e00d1dd125eaa84f05f9472d7", - "sha256:9e5f0e8967d95a256038817460844a8aab588b9bc9ba6296507a1863960a0e44", - "sha256:abf83b908e535b1386a7732825994e6e36eff6394c1829f3e7a23888136484fa", - "sha256:adb2dba52c8a2a2d7bcd3b267f7bbf7c822850cf6a7cd15211b9f386c3a670ef", - "sha256:ae7b3479822a03f6f651913de84ba67101f23e051ae88034085e974f472dcfff", - "sha256:cc15b30f0ac518e6cbd4b6e6e6162f8aa14edfe255d0841146f146151bd58865", - "sha256:d23498d62063b715078947bef48fa4d34dc354f3b268ed15dc6b46fc809a88e9", - "sha256:dd29bb5bc9068ccc248c8c145efd839421f04363b468b47cfa2d4902ca369afe", - "sha256:e53ad0cc6c489f83e7f6bb6121aa73bb6f6488410024a3bd77c16af1aa3a1000", - "sha256:ecb11113407d919f8714cc7d0841985044633d0b561ef3d797e1b494a3e73537", - "sha256:ece2c2add66d3ec2720a963bf073ca11fc3b0b58159767fc3bc5ddaad791d481", - "sha256:ef25c8675f5c8c19832f69cd97d728d99bb4ab9c3b200e28a5c8416631afaf3c", - "sha256:f62a818d643776873713c5676f17bd95ac4176220b13dd12c14edd3a450d1ac9", - 
"sha256:f7ebcb846962ee40374db2d9014a89bea9c983ae63c1877957c3a0a756974796" - ], - "version": "==3.6.1" - }, - "python-crfsuite": { - "hashes": [ - "sha256:10d84507d96d2870fba053d6659170113675762745e715befe0d9671d8988098", - "sha256:19882b03d26abe075c280f3450829f520a36d17a050621e48109094ea94f4965", - "sha256:2e1cdceca173cf73360220737648acf87244c1ae98eebf9f41d7a86035d4eaac", - "sha256:3b36da634400dd1557bb947ddf009e6328e7ef76332054f6484e8cec6993b86e", - "sha256:470220d0f4be28769505c5cd8e2854ee25d748bc38d70f42811832031f245273", - "sha256:47f10949a003439cdb7922b43177c96c2c1eea5c56ed2d17270c850bc86ccab3", - "sha256:5ad7395dcce74fd07fc031f1e6c1160ff7b43ea343129caa77fb8d8afe7e2ecb", - "sha256:607f4276025b2217e326e6c9161308fb2ad7f881665bfff072dbedae6cdc9cf8", - "sha256:6c03fe809957eaf9333816f52dd497678a013ee918c502a7a529b191ef19c694", - "sha256:73b57919711499d2ff32843a28b2a6c17487630ba255a6aa4a093ee0a7f3b1fa", - "sha256:7861c9c7635e868c67914e77ccf757e096387edf676f78425aa05b2f8b32ced7", - "sha256:8b656eee39d0fdea760285b574c5517d309f943be98d87e42425d332cdf687ca", - "sha256:942aa72793dad1f9b0d65991b4386e385c4bf9248d2072e0e058f6cbb41e15dc", - "sha256:968f8597f5df1ec54c5e72ce50775118a09f3c5737962e058f94a83da30d2a68", - "sha256:96d0041722bfd9649b4f31971c25941cd9c01d4cbd4f774ba886ac6f77092f0b", - "sha256:9daca96ada19b79353a022d77f38225a1dedfb632d6107a4846f8c1b5ace88a2", - "sha256:9e18d89a1996306bf44341b55dd5cda2afa5a6aeeb4ebb99240b26bce1e89bd2", - "sha256:a8d191eac2c7a395a74ceade0e58eea0ad5de823200b7b995ecab29295cf027b", - "sha256:af63e6ed7635e5180bf278d0bd967097dcb295e0a58ba815a1c29d179479bfae", - "sha256:b5c8b6a7bc194ca189db8cae6991d7fba0e1612c5c44297b989364e096a12a03", - "sha256:b7643090e8e068ca25b5525d81a61d001e6594f6fa62452a1cb536e238b23c99", - "sha256:bd3b5d73ca86928410473be4d3246730263a36fb0afd355f4ddec450ec5f881b", - "sha256:c52d38b7180e19b6f5430eb76ae6fb160a5be2f5630834ea71d592bc5bff3671", - "sha256:c94030f625e8139a41528d546c4b41cc223ee28f301af5d780eda10dcc96364c", - "sha256:d4ce2830fd0d8ec037deefe662633935fa4a4cccc8647ac0515181c939c2184a", - "sha256:dba43d4b1ccf1b1eb67a8adbfae788f3498e21997124010ad0f819dad3cf8b1d", - "sha256:e419b441ba39716f1b9095dc3ed7adf3115611504d8933e973a7036ccef4168f", - "sha256:e7b73325a5b345d10b78018d6ca123c6348825c82f3670a169143050d087ee65" - ], - "version": "==0.9.5" - }, - "pyyaml": { - "hashes": [ - "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736", - "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f", - "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab", - "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7", - "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1", - "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8", - "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4", - "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269" - ], - "index": "pypi", - "version": "==3.12" - }, - "regex": { - "hashes": [ - "sha256:19c4b0f68dd97b7116e590f47d60d97ab9e76966acc321b1d20dd87c2b64dff2", - "sha256:1af6b820bec5ca82af87447af5a6dcc23b3ddc96b0184fd71666be0c24fb2a4f", - "sha256:232dbc28a2562d92d713c3c1eb2b9276f3ebcbdb6d3e96ff68d0417a71926784", - "sha256:3d26ce7e605a501509b68c343fc9d9e09f76c2e9e261df8183027bdc750c97ce", - "sha256:52b590a41b9677314d02d9055edc33992db758b3d5167aa1365229a6a0c26a6d", - "sha256:565f9aac9cd43b2351f7fcbc0d6056f8aebf4f6d049a17982085019ab9acdf28", - 
"sha256:656984899644d3fe2e40533724f513a21127f77162a15dd5244af3c965152c63", - "sha256:689c9d17c3ba02f52e8481a5c584c8c11ba27d6cc5f939efdd838ae0d0d1af41", - "sha256:8a9d9db8ef1621ae51ea12acb5e503204b4586e05c6cfd418aecb9466a71bd87", - "sha256:ad2beea450d551b11b47512ce920127d7c8645e528cc56dc9502c5973e8732f3", - "sha256:b39867f577bc59b2fec9209facc513c761978e4ac63f4b73b9750a2c1501729e", - "sha256:b6a7725a069be8f9dd09e1e500e5b57556b301942e21c8c712627f73ec048286", - "sha256:b9e9b97696e75e826adac1920b13e7bac3a6a2128c085783abd208d73a278d70", - "sha256:bf4896ed1ca2017153fc6b341bc8a0da8ca5480f85eebd7bfe58bbafceb4e728", - "sha256:c3c2fe1e0d90f4c93be5b588480f05defd44f64c65767a657de69c4db4429a39", - "sha256:d811874ed669165fe1059a54f860db5c6ab5f48100bf4945d915fd2f877b2531", - "sha256:db616380b04e29e5709bc3ec0674e827dfed3d18e7d686c09537ab01506127c9", - "sha256:efa66273b49dbd7a9f6a4d02d1a7d5bf353d568a89f7cd8927812daa9f83bb84", - "sha256:f8feab5b517cdc65a61a50549e7dcfa0f61ab872a0034da1f6b8d61775178b6a" - ], - "index": "pypi", - "version": "==2017.4.5" - }, - "requests": { - "hashes": [ - "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", - "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" - ], - "version": "==2.19.1" - }, - "scikit-learn": { - "hashes": [ - "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17", - "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0", - "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72", - "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2", - "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3", - "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1", - "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0", - "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049", - "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21", - "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab", - "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10", - "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2", - "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f", - "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f", - "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f", - "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2", - "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241", - "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63", - "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74", - "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26", - "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0", - "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5", - "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0" - ], - "version": "==0.19.1" - }, - "scipy": { - "hashes": [ - "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", - "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", - "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", - "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", - "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", - 
"sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", - "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", - "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", - "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", - "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", - "sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", - "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", - "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", - "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", - "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", - "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", - "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", - "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", - "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", - "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", - "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", - "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", - "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40" - ], - "index": "pypi", - "version": "==1.1.0" - }, - "seqeval": { - "hashes": [ - "sha256:6dc7f9ddf5246b909adb0c349575daedbe7828c2bc02df4c81fd4bd80ad8adaa" - ], - "version": "==0.0.3" - }, - "six": { - "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" - ], - "version": "==1.11.0" - }, - "sklearn": { - "hashes": [ - "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" - ], - "index": "pypi", - "version": "==0.0" - }, - "sklearn-crfsuite": { - "hashes": [ - "sha256:2f59aad3055e01a778a79a6352891cac04788e8b52688aa5bc8b11be7717861e", - "sha256:6e9a42bc3de96941d5f7262335130955b8c380b1356147622368f385075705d9" - ], - "index": "pypi", - "version": "==0.3.6" - }, - "spacy": { - "hashes": [ - "sha256:cddb06e7965222e4339eb59d2258db8dadab19ef8b0a1a44a2d33f94935ba421" - ], - "index": "pypi", - "version": "==2.0.11" - }, - "tabulate": { - "hashes": [ - "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" - ], - "version": "==0.8.2" - }, - "tensorboard": { - "hashes": [ - "sha256:2651a4d9261a6593cb2c3514576e4bf25e273837c79e98f68a3cf51759f68725", - "sha256:7776cc8bcfd0d07cd106e9e86c011ab8eca38c97b57f5433b9f0fd6bbf31a36e" - ], - "version": "==1.8.0" - }, - "tensorflow": { - "hashes": [ - "sha256:1fc4eb267bc973f2a8466778ad0c6f110f8356373e534c8463a7bb9b6d86e5f0", - "sha256:24c3d3f87dc6108f4de3bbb7f647bcd6f303f874721b176185f260adea88fe40", - "sha256:3087797a85610b756066df294f98e666e49f4fa60c6e1dbcf4b190b21397af71", - "sha256:36a6671d39a3aec89cb4cce125e90305c3f8a69da16d2fb72eff40fe478e1de9", - "sha256:373c637f9c4f06346b1b02e280b30517d262a1651b252ff889f7c22716ef4548", - "sha256:397c0766e166fb768613b1498a2a6f67eeea077425d234c0138d55e85408473d", - "sha256:68bda870c355aa64dc3dc377d5b9d13c829281467a7c3b95cb8529031d8f68b2", - "sha256:899e53e01efcc8e906b3d1f53fe6a6264edf5dfc275b32cadbc6f4e33dca78f7", - "sha256:92801b9aebcc195d3b5a492d1d46330c47c11c6f0c0f7ab6b605da489482c64e", - "sha256:d345d296aeb05eeb50d9de43a1dcb66ceaba6a2bd603f58aeefaa07b2c1bfac1", - 
"sha256:dccc205e84cd33a240a601046e88eacefe12d677a1bbdf17f0ebafd1a7c84c70", - "sha256:de51f60021ea8160ea6d0340e827a26331cd549f6e7c470fba7ee83aeab4f818" - ], - "index": "pypi", - "version": "==1.8.0" - }, - "termcolor": { - "hashes": [ - "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" - ], - "version": "==1.1.0" - }, - "thinc": { - "hashes": [ - "sha256:9a1deb850285f76efaf0ae38b605a137a3978826282cc57dcc1e66b779402a76" - ], - "version": "==6.10.2" - }, - "toolz": { - "hashes": [ - "sha256:929f0a7ea7f61c178bd951bdae93920515d3fbdbafc8e6caf82d752b9b3b31c9" - ], - "version": "==0.9.0" - }, - "tqdm": { - "hashes": [ - "sha256:224291ee0d8c52d91b037fd90806f48c79bcd9994d3b0abc9e44b946a908fccd", - "sha256:77b8424d41b31e68f437c6dd9cd567aebc9a860507cb42fbd880a5f822d966fe" - ], - "version": "==4.23.4" - }, - "ujson": { - "hashes": [ - "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86" - ], - "version": "==1.35" - }, - "urllib3": { - "hashes": [ - "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", - "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" - ], - "version": "==1.23" - }, - "werkzeug": { - "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" - ], - "version": "==0.14.1" - }, - "wheel": { - "hashes": [ - "sha256:0a2e54558a0628f2145d2fc822137e322412115173e8a2ddbe1c9024338ae83c", - "sha256:80044e51ec5bbf6c894ba0bc48d26a8c20a9ba629f4ca19ea26ecfcf87685f5f" - ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.0.*'", - "version": "==0.31.1" - }, - "wrapt": { - "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" - ], - "version": "==1.10.11" - } - }, - "develop": { - "absl-py": { - "hashes": [ - "sha256:e0eb8358b549552b1cc5972350bc3e41dd0a926c15b3ff95ce60f3c78c80824c" - ], - "version": "==0.2.2" - }, - "anago": { - "hashes": [ - "sha256:a4bd7b0d6109408fbdd9cdd2d6bfb60221bd7293c0645a75e6fddddce40abcc1" - ], - "index": "pypi", - "version": "==1.0.6" - }, - "astor": { - "hashes": [ - "sha256:64c805f1ad6fbc505633416b6174fc23796eb164f371a7dc1f3951ea30560fb5", - "sha256:ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d" - ], - "version": "==0.6.2" - }, - "bleach": { - "hashes": [ - "sha256:978e758599b54cd3caa2e160d74102879b230ea8dc93871d0783721eef58bc65", - "sha256:e67f46adcec78dbc3c04462f3aba3213a673d5652eba2609ed1ef15492a44b8d" - ], - "version": "==1.5.0" - }, - "certifi": { - "hashes": [ - "sha256:13e698f54293db9f89122b0581843a782ad0934a4fe0172d2a980ba77fc61bb7", - "sha256:9fa520c1bacfb634fa7af20a76bcbd3d5fb390481724c597da32c719a7dca4b0" - ], - "version": "==2018.4.16" - }, - "chardet": { - "hashes": [ - "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", - "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" - ], - "version": "==3.0.4" - }, - "cymem": { - "hashes": [ - "sha256:00bb3645dfb9a020d735ba3d6f822b04656388180588d8b2cebde967ee678bcc", - "sha256:0dd61d05977839a922c0d797c355b98949210575918b1743b41e38ae9fb2c3a7", - "sha256:4bc1056b52d959fcbb1e0f32ec84fa131754d6be1e36b65782c6ac86419f4bf3", - "sha256:4c5d9ca6ec706792b8d9b1faf6db77b95545c388c768b21d940f197aa7efbb7e", - "sha256:50292f4dd0d950a8698bae27d71efe59da7ff08e591b735e08b658aae42c4745", - 
"sha256:616d06333f46dd03c128d97912d361183fc02249e6420a7b7907b41214c51562", - "sha256:944af97d4d34a2470b5199f1c31d2dfc79cdec7bd7a41354d839a8ab87fdfaa6", - "sha256:b38056efb99078b06c504adb5f03a8d9e822a5543451737b746028a71c4b1ac3", - "sha256:b6513b2926c60d641f159e79e6fb16460dfb50ebcce31a5af0370c51837c7efc", - "sha256:daa6003fcc199752ab703142021cff74774872a932303b240dc0ea177adf295d", - "sha256:f06d9b50da0474d7405674d8101c319d89a17d33792d6d429fe3d5c64f0d9df1" - ], - "version": "==1.31.2" - }, - "cytoolz": { - "hashes": [ - "sha256:476a2ad176de5eaef80499b7b43d4f72ba6d23df33d349088dae315e9b31c552" - ], - "version": "==0.8.2" - }, - "decorator": { - "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" - ], - "version": "==4.3.0" - }, - "dill": { - "hashes": [ - "sha256:624dc244b94371bb2d6e7f40084228a2edfff02373fe20e018bef1ee92fdd5b3" - ], - "version": "==0.2.8.2" - }, - "future": { - "hashes": [ - "sha256:e39ced1ab767b5936646cedba8bcce582398233d6a627067d4c6a454c90cfedb" - ], - "index": "pypi", - "version": "==0.16.0" - }, - "gast": { - "hashes": [ - "sha256:7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930" - ], - "version": "==0.2.0" - }, - "grpcio": { - "hashes": [ - "sha256:0feade5de967be3c9ee041662d1347fc537ad05ccbcf05bcf1efa05072bef926", - "sha256:1ae02a9787cf2c5f25add0806f6271283b6074ab8619077d2b5c9037950c890b", - "sha256:2d7215dca11ba4aa49cc6c05b37e4b0a0f99727c8604e8ccd5ef1f6e06332200", - "sha256:35a4f6ffae88ce6a461e503ae91b62dc5c96013cafc717f2d7139686b5c39969", - "sha256:3be7635b4308e06449b2275a5e96a030bbf82ba6797ae8947f14667491924d81", - "sha256:3d1b3e7042a41b167334f718842f13deb80287886c9160efe31252602b13a128", - "sha256:468d4ce007cb859d5f9440cf4a7461cc172fd07d690300f4db88afaa78f01003", - "sha256:59a2fb52d286a38b9cbc7434eb473026fde0b20c223a10a99f5c3d4e395c2c2b", - "sha256:59c7670c902acce952ba709d9126cda87a45d7fed6bd568868e74171e4acd7f7", - "sha256:5b03fd3941c5e1a5deb01026bae025d319b38d3facb3e5fc491bca73e908d69e", - "sha256:65842e698776f4e49f62346c0f80fc31b34907e0df4247650c643113ef167122", - "sha256:68dbe71f890475e2824afbc5dc72714d1fca668bc15df0954bda4a8a5a53d0c7", - "sha256:868973b64b7e2464e5297cc660da588c542c175e85f6d2f7490d86c0dd5dbb4c", - "sha256:86f0c2062fde76789f7cdbf67d4ede116e7e1ceaf4c327fff7b9d17eb5852403", - "sha256:9444863aaba55b662719e22680f11134182604619f241cc607020e5b3786f4cd", - "sha256:9ac704e25d271af62c1ea72f1cb42ec7938f26f00314a8f324999ac5e1bf55eb", - "sha256:a02ef0354fb455a9ce2ad869a40f28f20a64147d46557c59b7269a15832c36d2", - "sha256:aabcdc960633231f9575252c061b480fc56a1ff6dcc7999fa5d4968f574d894f", - "sha256:b47a19a3be2f9608b4296bd16374c9a922d3206cf0a917792801a5cef5a2fa23", - "sha256:bf7bfe162057e6f1e3f4613b2a5f1157c8e286bddeaa40f7b8ce5054cb4b1413", - "sha256:c3cf3f431b41c39aa1501458d0e46086e699836536af873fe028dda1dfc6bcbd", - "sha256:cc7bd47eca988831d58a618908c825204d6ee8e90cdb9854a09b52a3b76ac168", - "sha256:cde83440fb4691d1bd8620ea919a9bd3199e6725e72d2c0d94898a2774c255ee", - "sha256:e1a03666852b956f7949c2a7f187dd54406cae2874c2ce26c1a0dafddf812cb2", - "sha256:e322eb5dc533cfbf21a9e964ebab80da391a26234a82288bfce505058913dfac", - "sha256:f0169d98670ef1db52e4f6930fd470c34731948350cabbe93087a8462b1f1da4", - "sha256:f14faadfd09aa8526536cd2149e274563f45b767fca1736ccc53803a6af3f90e" - ], - "version": "==1.12.1" - }, - "h5py": { - "hashes": [ - "sha256:0f8cd2acbacf3177b4427ed42639c911667b1f24d923388ab1f8ad466a12be5e", - 
"sha256:11277e3879098f921ee9e29105b20591e1dfdd44963357399f2abaa1a280c560", - "sha256:1241dec0c94ac32f3285cac1d6f44beabf80423e422ab03bd2686d731a8a9294", - "sha256:17b8187de0b3a945d8e8d031e7eb6ece2fce90791f9c5fde36f4396bf38fdde1", - "sha256:308e0758587ee16d4e73e7f2f8aae8351091e343bf0a43d2f697f9535465c816", - "sha256:37cacddf0e8209905f52537a8cf71da0dd9a4de62bd79247274c97b24a408997", - "sha256:38a23bb599748adf23d77f74885c0de6f4a7d9baa42f74e476bbf90fba2b47dd", - "sha256:47ab18b7b7bbc36fd2b606289b703b6f0ee915b923d6ad94dd17ac80ebffc280", - "sha256:486c78330af0bf33f5077b51d1888c0739c3cd1a03d5aade0d48572b3b5690ca", - "sha256:4e2183458d6ef1ae87dfb5d6acd0786359336cd9ac0ece6396c09b59fdaa3bd6", - "sha256:51d0595c3e58814c831f6cd2b664a5bf9590e26262c1d541b380d041e4fcb3c0", - "sha256:56d259d56822b70881760b243957f04a0cf133f0ec65eae6a33f562826aee899", - "sha256:5e6e777653169a3cc24ea56bb3d8c845ea391f8914c35bb6f350b0753a52891c", - "sha256:62bfb0ebb0f59e5dccc0b0dbbc0fc40dd1d1e09d04c0dc71f89790231531d4a2", - "sha256:67d89b64debfa021b54aa6f24bbf008403bd144748a0148596b518bce80d2fc4", - "sha256:9214ca445c18a37bfe9c165982c0e317e2f21f035c8d635d1c6d9fcbaf35b7a8", - "sha256:ab0c52850428d2e86029935389379c2c97f752e76b616da851deec8a4484f8ec", - "sha256:b2eff336697d8dfd712c5d93fef9f4e4d3e97d9d8c258801836b8664a239e07a", - "sha256:bb33fabc0b8f3fe3bb0f8d6821b2fad5b2a64c27a0808e8d1c5c1e3362062064", - "sha256:bd5353ab342bae1262b04745934cc1565df4cbc8d6a979a0c98f42209bd5c265", - "sha256:c45650de228ace7731e4280e14fb687f6d5c29cd666c5b22b42492b035e994d6", - "sha256:d5c0c01da45f901a3d429e7ef9e7e22baa869e1affb8715f1bf94e6a30020740", - "sha256:d75035db5bde802a29f4f29f18bb7548863d29ac90ccbf2c04c11799bbbba2c3", - "sha256:dda88206dc9464923f27f601000bc5b152ac0bd6d0122f098d4f239150a70076", - "sha256:e1c2ac5d0aa232c0f60fecc6bd1122346885086a176f939b91058c4c980cc226", - "sha256:e626c65a8587921ebc7fb8d31a49addfdd0b9a9aa96315ea484c09803337b955" - ], - "index": "pypi", - "version": "==2.8.0" - }, - "html5lib": { - "hashes": [ - "sha256:2612a191a8d5842bfa057e41ba50bbb9dcb722419d2408c78cff4758d0754868" - ], - "version": "==0.9999999" - }, - "hyperopt": { - "hashes": [ - "sha256:4f6e903f7640165ea3e4c622050b41ffab0bee7811ede23c7825a5884976d72f" - ], - "index": "pypi", - "version": "==0.1" - }, - "idna": { - "hashes": [ - "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", - "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" - ], - "version": "==2.7" - }, - "joblib": { - "hashes": [ - "sha256:9de5fe8bc953f871f862d27e77f153c31d545b84f2aa31a63b5165e912ad6dfa", - "sha256:aba9f97aa3e0548be6fc458b5d708be863eb4be35830caeb3faa7bd3d9afb7bb" - ], - "index": "pypi", - "version": "==0.12.0" - }, - "keras": { - "hashes": [ - "sha256:5b8499d157af217f1a5ee33589e774127ebc3e266c833c22cb5afbb0ed1734bf", - "sha256:fa71a1f576dbd643532b872b8952afb65cc3ff7ed20d172e6b49657b710b43d0" - ], - "index": "pypi", - "version": "==2.2.0" - }, - "keras-applications": { - "hashes": [ - "sha256:7dceb9820b39c01459ea5e8922add86eb99a9e14354c33dc9981d5f5077fa0ac", - "sha256:9924be748e5d180806d133c714d22895b997ed722757491dd99538851145d3bf" - ], - "version": "==1.0.2" - }, - "keras-preprocessing": { - "hashes": [ - "sha256:5283236f0b22a57b30bda766fc819b2ed2483c52f3e1f8b39fcc528f51f772e7", - "sha256:8649ba6377ecc06ea10e0a8a954df5600d115b4b626861e33c79b41ec03c5194" - ], - "version": "==1.0.1" - }, - "markdown": { - "hashes": [ - "sha256:9ba587db9daee7ec761cfc656272be6aabe2ed300fece21208e4aab2e457bc8f", - 
"sha256:a856869c7ff079ad84a3e19cd87a64998350c2b94e9e08e44270faef33400f81" - ], - "version": "==2.6.11" - }, - "msgpack-numpy": { - "hashes": [ - "sha256:6947df61826a2917e38dbe07957a0c70dc82dce93ec38153dae850fdd21a4583", - "sha256:afc603c7cf8497fb125a8c8c713518a004e9662101f088e3d4fcf7688b08eeb3" - ], - "version": "==0.4.1" - }, - "msgpack-python": { - "hashes": [ - "sha256:378cc8a6d3545b532dfd149da715abae4fda2a3adb6d74e525d0d5e51f46909b" - ], - "version": "==0.5.6" - }, - "murmurhash": { - "hashes": [ - "sha256:651137ed3e1169342c9edade454f3beb7fcdf28d4ad1ac232725237eaf442d9a" - ], - "version": "==0.28.0" - }, - "nerds": { - "editable": true, - "path": "." - }, - "networkx": { - "hashes": [ - "sha256:0d0e70e10dfb47601cbb3425a00e03e2a2e97477be6f80638fef91d54dd1e4b8", - "sha256:1b229b54fe9ccb009cee4de02a88552191497a542a7d5d34adab216b9f15c1ff", - "sha256:b3e0144d5fe6b7479b694e1b598a5545a38f3fc6f1e3c09173eb30f0c7a5770e" - ], - "index": "pypi", - "version": "==1.11" - }, - "nltk": { - "hashes": [ - "sha256:fe0eda251be65843be86d7de9abfbf7161732256f742e623b21243ec47bdb718" - ], - "index": "pypi", - "version": "==3.3.0" - }, - "nose": { - "hashes": [ - "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", - "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a", - "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98" - ], - "index": "pypi", - "version": "==1.3.7" - }, - "numpy": { - "hashes": [ - "sha256:07379fe0b450f6fd6e5934a9bc015025bb4ce1c8fbed3ca8bef29328b1bc9570", - "sha256:085afac75bbc97a096744fcfc97a4b321c5a87220286811e85089ae04885acdd", - "sha256:2d6481c6bdab1c75affc0fc71eb1bd4b3ecef620d06f2f60c3f00521d54be04f", - "sha256:2df854df882d322d5c23087a4959e145b953dfff2abe1774fec4f639ac2f3160", - "sha256:381ad13c30cd1d0b2f3da8a0c1a4aa697487e8bb0e9e0cbeb7439776bcb645f8", - "sha256:385f1ce46e08676505b692bfde918c1e0b350963a15ef52d77691c2cf0f5dbf6", - "sha256:4d278c2261be6423c5e63d8f0ceb1b0c6db3ff83f2906f4b860db6ae99ca1bb5", - "sha256:51c5dcb51cf88b34b7d04c15f600b07c6ccbb73a089a38af2ab83c02862318da", - "sha256:589336ba5199c8061239cf446ee2f2f1fcc0c68e8531ee1382b6fc0c66b2d388", - "sha256:5edf1acc827ed139086af95ce4449b7b664f57a8c29eb755411a634be280d9f2", - "sha256:6b82b81c6b3b70ed40bc6d0b71222ebfcd6b6c04a6e7945a936e514b9113d5a3", - "sha256:6c57f973218b776195d0356e556ec932698f3a563e2f640cfca7020086383f50", - "sha256:758d1091a501fd2d75034e55e7e98bfd1370dc089160845c242db1c760d944d9", - "sha256:8622db292b766719810e0cb0f62ef6141e15fe32b04e4eb2959888319e59336b", - "sha256:8b8dcfcd630f1981f0f1e3846fae883376762a0c1b472baa35b145b911683b7b", - "sha256:97fa8f1dceffab782069b291e38c4c2227f255cdac5f1e3346666931df87373e", - "sha256:9d69967673ab7b028c2df09cae05ba56bf4e39e3cb04ebe452b6035c3b49848e", - "sha256:9e1f53afae865cc32459ad211493cf9e2a3651a7295b7a38654ef3d123808996", - "sha256:a4a433b3a264dbc9aa9c7c241e87c0358a503ea6394f8737df1683c7c9a102ac", - "sha256:baadc5f770917ada556afb7651a68176559f4dca5f4b2d0947cd15b9fb84fb51", - "sha256:c725d11990a9243e6ceffe0ab25a07c46c1cc2c5dc55e305717b5afe856c9608", - "sha256:d696a8c87315a83983fc59dd27efe034292b9e8ad667aeae51a68b4be14690d9", - "sha256:e1864a4e9f93ddb2dc6b62ccc2ec1f8250ff4ac0d3d7a15c8985dd4e1fbd6418" - ], - "index": "pypi", - "version": "==1.14.5" - }, - "pathlib": { - "hashes": [ - "sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f" - ], - "version": "==1.0.1" - }, - "plac": { - "hashes": [ - "sha256:854693ad90367e8267112ffbb8955f57d6fdeac3191791dc9ffce80f87fd2370", - 
"sha256:ba3f719a018175f0a15a6b04e6cc79c25fd563d348aacd320c3644d2a9baf89b" - ], - "version": "==0.9.6" - }, - "preshed": { - "hashes": [ - "sha256:a6b3a9e34634600e3e410ec25e0debed4b65a47eb37514a063d189d1c425b4dd" - ], - "version": "==1.0.0" - }, - "protobuf": { - "hashes": [ - "sha256:12985d9f40c104da2f44ec089449214876809b40fdc5d9e43b93b512b9e74056", - "sha256:12c97fe27af12fc5d66b23f905ab09dd4fb0c68d5a74a419d914580e6d2e71e3", - "sha256:327fb9d8a8247bc780b9ea7ed03c0643bc0d22c139b761c9ec1efc7cc3f0923e", - "sha256:3895319db04c0b3baed74fb66be7ba9f4cd8e88a432b8e71032cdf08b2dfee23", - "sha256:695072063e256d32335d48b9484451f7c7948edc3dbd419469d6a778602682fc", - "sha256:7d786f3ef5b33a04e6538089674f244a3b0f588155016559d950989010af97d0", - "sha256:8bf82bb7a466a54be7272dcb492f71d55a2453a58d862fb74c3f2083f2768543", - "sha256:9bbc1ae1c33c1bd3a2fc05a3aec328544d2b039ff0ce6f000063628a32fad777", - "sha256:9f1087abb67b34e55108bc610936b34363a7aac692023bcbb17e065c253a1f80", - "sha256:9fefcb92a3784b446abf3641d9a14dad815bee88e0edd10b9a9e0e144d01a991", - "sha256:a37836aa47d1b81c2db1a6b7a5e79926062b5d76bd962115a0e615551be2b48d", - "sha256:cca22955443c55cf86f963a4ad7057bca95e4dcde84d6a493066d380cfab3bb0", - "sha256:d7ac50bc06d31deb07ace6de85556c1d7330e5c0958f3b2af85037d6d1182abf", - "sha256:dfe6899304b898538f4dc94fa0b281b56b70e40f58afa4c6f807805261cbe2e8" - ], - "version": "==3.6.0" - }, - "pyahocorasick": { - "hashes": [ - "sha256:3d584e7836ca7b066f99d7fdb384dc6ef7af211b2b139baedbd960c7c279bb7f" - ], - "index": "pypi", - "version": "==1.1.8" - }, - "pymongo": { - "hashes": [ - "sha256:061085dfe4fbf1d9d6ed2f2e52fe6ab72559e48b4294370b433751638160d10b", - "sha256:07fdee1c5567f237796a8550233e04853785d8dcf95929f96ab519ed91543109", - "sha256:0d98731aaea8cb32b535c376f6785927e4e3d9459ffe1440b8a639827a849350", - "sha256:10f683950f70626ccedf4a662d1c0b3244e8e013c2067872af5633830abd1bfd", - "sha256:2954b99cfeb76776879e9f8a4cae9c5e19d5eff92d0b7b663ceddcf192adb66b", - "sha256:419ed5d5b76ef304815f354d9df7f2085acfd6ff7cc1b714ca702e2239b341c2", - "sha256:42ec201fd9a26e7c1e611e3db19324dead51dd4646391492eb238b41749340e8", - "sha256:4400fa92af310bf66b76c313c7ded3bb63f3d63b4f43c3bfbff552cf294dc9fa", - "sha256:4807dfbb5cdcfe0224329992dc48b897c780d0ad7553c3799d34f84ba5cab446", - "sha256:54daf67e1e7e7e5a5160c86123bdd39b1d3b25876c2ab38230dc2a764cb3d98f", - "sha256:5fd6ce5ed3c6c92d2c94756e6bf041304e5c7c5a5dbea31b8957d52a78bdf01d", - "sha256:601e00fe7fb283f04c95f5dafb787c0862f48ca015a6f1f81b460c74e4303873", - "sha256:7fbd9233e8b6741b047c5857e2ad5efb74091f167d7fa8a2a3379217165058f9", - "sha256:7ffac35362c07c103b024b89875e8d7f0625129b65c56fa8a3ecebbd56110405", - "sha256:833bc6cb2ec7058dea9f5840a9314ac74738d2117486a044e88f3976e37ea7a0", - "sha256:92cb26a2a9b38e8df5215803f950b20a6c847d5e00d1dd125eaa84f05f9472d7", - "sha256:9e5f0e8967d95a256038817460844a8aab588b9bc9ba6296507a1863960a0e44", - "sha256:abf83b908e535b1386a7732825994e6e36eff6394c1829f3e7a23888136484fa", - "sha256:adb2dba52c8a2a2d7bcd3b267f7bbf7c822850cf6a7cd15211b9f386c3a670ef", - "sha256:ae7b3479822a03f6f651913de84ba67101f23e051ae88034085e974f472dcfff", - "sha256:cc15b30f0ac518e6cbd4b6e6e6162f8aa14edfe255d0841146f146151bd58865", - "sha256:d23498d62063b715078947bef48fa4d34dc354f3b268ed15dc6b46fc809a88e9", - "sha256:dd29bb5bc9068ccc248c8c145efd839421f04363b468b47cfa2d4902ca369afe", - "sha256:e53ad0cc6c489f83e7f6bb6121aa73bb6f6488410024a3bd77c16af1aa3a1000", - "sha256:ecb11113407d919f8714cc7d0841985044633d0b561ef3d797e1b494a3e73537", - 
"sha256:ece2c2add66d3ec2720a963bf073ca11fc3b0b58159767fc3bc5ddaad791d481", - "sha256:ef25c8675f5c8c19832f69cd97d728d99bb4ab9c3b200e28a5c8416631afaf3c", - "sha256:f62a818d643776873713c5676f17bd95ac4176220b13dd12c14edd3a450d1ac9", - "sha256:f7ebcb846962ee40374db2d9014a89bea9c983ae63c1877957c3a0a756974796" - ], - "version": "==3.6.1" - }, - "python-crfsuite": { - "hashes": [ - "sha256:10d84507d96d2870fba053d6659170113675762745e715befe0d9671d8988098", - "sha256:19882b03d26abe075c280f3450829f520a36d17a050621e48109094ea94f4965", - "sha256:2e1cdceca173cf73360220737648acf87244c1ae98eebf9f41d7a86035d4eaac", - "sha256:3b36da634400dd1557bb947ddf009e6328e7ef76332054f6484e8cec6993b86e", - "sha256:470220d0f4be28769505c5cd8e2854ee25d748bc38d70f42811832031f245273", - "sha256:47f10949a003439cdb7922b43177c96c2c1eea5c56ed2d17270c850bc86ccab3", - "sha256:5ad7395dcce74fd07fc031f1e6c1160ff7b43ea343129caa77fb8d8afe7e2ecb", - "sha256:607f4276025b2217e326e6c9161308fb2ad7f881665bfff072dbedae6cdc9cf8", - "sha256:6c03fe809957eaf9333816f52dd497678a013ee918c502a7a529b191ef19c694", - "sha256:73b57919711499d2ff32843a28b2a6c17487630ba255a6aa4a093ee0a7f3b1fa", - "sha256:7861c9c7635e868c67914e77ccf757e096387edf676f78425aa05b2f8b32ced7", - "sha256:8b656eee39d0fdea760285b574c5517d309f943be98d87e42425d332cdf687ca", - "sha256:942aa72793dad1f9b0d65991b4386e385c4bf9248d2072e0e058f6cbb41e15dc", - "sha256:968f8597f5df1ec54c5e72ce50775118a09f3c5737962e058f94a83da30d2a68", - "sha256:96d0041722bfd9649b4f31971c25941cd9c01d4cbd4f774ba886ac6f77092f0b", - "sha256:9daca96ada19b79353a022d77f38225a1dedfb632d6107a4846f8c1b5ace88a2", - "sha256:9e18d89a1996306bf44341b55dd5cda2afa5a6aeeb4ebb99240b26bce1e89bd2", - "sha256:a8d191eac2c7a395a74ceade0e58eea0ad5de823200b7b995ecab29295cf027b", - "sha256:af63e6ed7635e5180bf278d0bd967097dcb295e0a58ba815a1c29d179479bfae", - "sha256:b5c8b6a7bc194ca189db8cae6991d7fba0e1612c5c44297b989364e096a12a03", - "sha256:b7643090e8e068ca25b5525d81a61d001e6594f6fa62452a1cb536e238b23c99", - "sha256:bd3b5d73ca86928410473be4d3246730263a36fb0afd355f4ddec450ec5f881b", - "sha256:c52d38b7180e19b6f5430eb76ae6fb160a5be2f5630834ea71d592bc5bff3671", - "sha256:c94030f625e8139a41528d546c4b41cc223ee28f301af5d780eda10dcc96364c", - "sha256:d4ce2830fd0d8ec037deefe662633935fa4a4cccc8647ac0515181c939c2184a", - "sha256:dba43d4b1ccf1b1eb67a8adbfae788f3498e21997124010ad0f819dad3cf8b1d", - "sha256:e419b441ba39716f1b9095dc3ed7adf3115611504d8933e973a7036ccef4168f", - "sha256:e7b73325a5b345d10b78018d6ca123c6348825c82f3670a169143050d087ee65" - ], - "version": "==0.9.5" - }, - "pyyaml": { - "hashes": [ - "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736", - "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f", - "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab", - "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7", - "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1", - "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8", - "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4", - "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269" - ], - "index": "pypi", - "version": "==3.12" - }, - "regex": { - "hashes": [ - "sha256:19c4b0f68dd97b7116e590f47d60d97ab9e76966acc321b1d20dd87c2b64dff2", - "sha256:1af6b820bec5ca82af87447af5a6dcc23b3ddc96b0184fd71666be0c24fb2a4f", - "sha256:232dbc28a2562d92d713c3c1eb2b9276f3ebcbdb6d3e96ff68d0417a71926784", - 
"sha256:3d26ce7e605a501509b68c343fc9d9e09f76c2e9e261df8183027bdc750c97ce", - "sha256:52b590a41b9677314d02d9055edc33992db758b3d5167aa1365229a6a0c26a6d", - "sha256:565f9aac9cd43b2351f7fcbc0d6056f8aebf4f6d049a17982085019ab9acdf28", - "sha256:656984899644d3fe2e40533724f513a21127f77162a15dd5244af3c965152c63", - "sha256:689c9d17c3ba02f52e8481a5c584c8c11ba27d6cc5f939efdd838ae0d0d1af41", - "sha256:8a9d9db8ef1621ae51ea12acb5e503204b4586e05c6cfd418aecb9466a71bd87", - "sha256:ad2beea450d551b11b47512ce920127d7c8645e528cc56dc9502c5973e8732f3", - "sha256:b39867f577bc59b2fec9209facc513c761978e4ac63f4b73b9750a2c1501729e", - "sha256:b6a7725a069be8f9dd09e1e500e5b57556b301942e21c8c712627f73ec048286", - "sha256:b9e9b97696e75e826adac1920b13e7bac3a6a2128c085783abd208d73a278d70", - "sha256:bf4896ed1ca2017153fc6b341bc8a0da8ca5480f85eebd7bfe58bbafceb4e728", - "sha256:c3c2fe1e0d90f4c93be5b588480f05defd44f64c65767a657de69c4db4429a39", - "sha256:d811874ed669165fe1059a54f860db5c6ab5f48100bf4945d915fd2f877b2531", - "sha256:db616380b04e29e5709bc3ec0674e827dfed3d18e7d686c09537ab01506127c9", - "sha256:efa66273b49dbd7a9f6a4d02d1a7d5bf353d568a89f7cd8927812daa9f83bb84", - "sha256:f8feab5b517cdc65a61a50549e7dcfa0f61ab872a0034da1f6b8d61775178b6a" - ], - "index": "pypi", - "version": "==2017.4.5" - }, - "requests": { - "hashes": [ - "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", - "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" - ], - "version": "==2.19.1" - }, - "scikit-learn": { - "hashes": [ - "sha256:13136c6e4f6b808569f7f59299d439b2cd718f85d72ea14b5b6077d44ebc7d17", - "sha256:370919e3148253fd6552496c33a1e3d78290a336fc8d1b9349d9e9770fae6ec0", - "sha256:3775cca4ce3f94508bb7c8a6b113044b78c16b0a30a5c169ddeb6b9fe57a8a72", - "sha256:42f3c5bd893ed73bf47ccccf04dfb98fae743f397d688bb58c2238c0e6ec15d2", - "sha256:56cfa19c31edf62e6414da0a337efee37a4af488b135640e67238786b9be6ab3", - "sha256:5c9ff456d67ef9094e5ea272fff2be05d399a47fc30c6c8ed653b94bdf787bd1", - "sha256:5ca0ad32ee04abe0d4ba02c8d89d501b4e5e0304bdf4d45c2e9875a735b323a0", - "sha256:5db9e68a384ce80a17fc449d4d5d9b45025fe17cf468429599bf404eccb51049", - "sha256:72c194c5092e921d6107a8de8a5adae58c35bbc54e030ba624b6f02fd823bb21", - "sha256:871669cdb5b3481650fe3adff46eb97c455e30ecdc307eaf382ef90d4e2570ab", - "sha256:873245b03361710f47c5410a050dc56ee8ae97b9f8dcc6e3a81521ca2b64ad10", - "sha256:8b17fc29554c5c98d88142f895516a5bec2b6b61daa815e1193a64c868ad53d2", - "sha256:95b155ef6bf829ddfba6026f100ba8e4218b7171ecab97b2163bc9e8d206848f", - "sha256:a21cf8217e31a9e8e32c559246e05e6909981816152406945ae2e3e244dfcc1f", - "sha256:ba3fd442ae1a46830789b3578867daaf2c8409dcca6bf192e30e85beeabbfc2f", - "sha256:ce78bf4d10bd7e28807c36c6d2ab25a9934aaf80906ad987622a5e45627d91a2", - "sha256:d384e6f9a055b7a43492f9d27779adb717eb5dcf78b0603b01d0f070a608d241", - "sha256:d4da369614e55540c7e830ccdd17ab4fe5412ff8e803a4906d3ece393e2e3a63", - "sha256:ddc1eb10138ae93c136cc4b5945d3977f302b5d693592a4731b2805a7d7f2a74", - "sha256:e54a3dd1fe1f8124de90b93c48d120e6da2ea8df29b6895325df01ddc1bd8e26", - "sha256:ee8c3b1898c728b6e5b5659c233f547700a1fea13ce876b6fe7d3434c70cc0e0", - "sha256:f528c4b2bba652cf116f5cccf36f4db95a7f9cbfcd1ee549c4e8d0f8628783b5", - "sha256:f9abae483f4d52acd6f660addb1b67e35dc5748655250af479de2ea6aefc6df0" - ], - "version": "==0.19.1" - }, - "scipy": { - "hashes": [ - "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", - "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", - 
"sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", - "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", - "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", - "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", - "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", - "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", - "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", - "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", - "sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", - "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", - "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", - "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", - "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", - "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", - "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", - "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", - "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", - "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", - "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", - "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", - "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40" - ], - "index": "pypi", - "version": "==1.1.0" - }, - "seqeval": { - "hashes": [ - "sha256:6dc7f9ddf5246b909adb0c349575daedbe7828c2bc02df4c81fd4bd80ad8adaa" - ], - "version": "==0.0.3" - }, - "six": { - "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" - ], - "version": "==1.11.0" - }, - "sklearn": { - "hashes": [ - "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" - ], - "index": "pypi", - "version": "==0.0" - }, - "sklearn-crfsuite": { - "hashes": [ - "sha256:2f59aad3055e01a778a79a6352891cac04788e8b52688aa5bc8b11be7717861e", - "sha256:6e9a42bc3de96941d5f7262335130955b8c380b1356147622368f385075705d9" - ], - "index": "pypi", - "version": "==0.3.6" - }, - "spacy": { - "hashes": [ - "sha256:cddb06e7965222e4339eb59d2258db8dadab19ef8b0a1a44a2d33f94935ba421" - ], - "index": "pypi", - "version": "==2.0.11" - }, - "tabulate": { - "hashes": [ - "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" - ], - "version": "==0.8.2" - }, - "tensorboard": { - "hashes": [ - "sha256:2651a4d9261a6593cb2c3514576e4bf25e273837c79e98f68a3cf51759f68725", - "sha256:7776cc8bcfd0d07cd106e9e86c011ab8eca38c97b57f5433b9f0fd6bbf31a36e" - ], - "version": "==1.8.0" - }, - "tensorflow": { - "hashes": [ - "sha256:1fc4eb267bc973f2a8466778ad0c6f110f8356373e534c8463a7bb9b6d86e5f0", - "sha256:24c3d3f87dc6108f4de3bbb7f647bcd6f303f874721b176185f260adea88fe40", - "sha256:3087797a85610b756066df294f98e666e49f4fa60c6e1dbcf4b190b21397af71", - "sha256:36a6671d39a3aec89cb4cce125e90305c3f8a69da16d2fb72eff40fe478e1de9", - "sha256:373c637f9c4f06346b1b02e280b30517d262a1651b252ff889f7c22716ef4548", - "sha256:397c0766e166fb768613b1498a2a6f67eeea077425d234c0138d55e85408473d", - "sha256:68bda870c355aa64dc3dc377d5b9d13c829281467a7c3b95cb8529031d8f68b2", - 
"sha256:899e53e01efcc8e906b3d1f53fe6a6264edf5dfc275b32cadbc6f4e33dca78f7", - "sha256:92801b9aebcc195d3b5a492d1d46330c47c11c6f0c0f7ab6b605da489482c64e", - "sha256:d345d296aeb05eeb50d9de43a1dcb66ceaba6a2bd603f58aeefaa07b2c1bfac1", - "sha256:dccc205e84cd33a240a601046e88eacefe12d677a1bbdf17f0ebafd1a7c84c70", - "sha256:de51f60021ea8160ea6d0340e827a26331cd549f6e7c470fba7ee83aeab4f818" - ], - "index": "pypi", - "version": "==1.8.0" - }, - "termcolor": { - "hashes": [ - "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" - ], - "version": "==1.1.0" - }, - "thinc": { - "hashes": [ - "sha256:9a1deb850285f76efaf0ae38b605a137a3978826282cc57dcc1e66b779402a76" - ], - "version": "==6.10.2" - }, - "toolz": { - "hashes": [ - "sha256:929f0a7ea7f61c178bd951bdae93920515d3fbdbafc8e6caf82d752b9b3b31c9" - ], - "version": "==0.9.0" - }, - "tqdm": { - "hashes": [ - "sha256:224291ee0d8c52d91b037fd90806f48c79bcd9994d3b0abc9e44b946a908fccd", - "sha256:77b8424d41b31e68f437c6dd9cd567aebc9a860507cb42fbd880a5f822d966fe" - ], - "version": "==4.23.4" - }, - "ujson": { - "hashes": [ - "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86" - ], - "version": "==1.35" - }, - "urllib3": { - "hashes": [ - "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", - "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" - ], - "version": "==1.23" - }, - "werkzeug": { - "hashes": [ - "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", - "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b" - ], - "version": "==0.14.1" - }, - "wheel": { - "hashes": [ - "sha256:0a2e54558a0628f2145d2fc822137e322412115173e8a2ddbe1c9024338ae83c", - "sha256:80044e51ec5bbf6c894ba0bc48d26a8c20a9ba629f4ca19ea26ecfcf87685f5f" - ], - "markers": "python_version != '3.1.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.0.*'", - "version": "==0.31.1" - }, - "wrapt": { - "hashes": [ - "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" - ], - "version": "==1.10.11" - } - } -} diff --git a/README.md b/README.md index 7b0dd81..befd880 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # nerds -![nerds logo](nerds.png) +![nerds logo](docs/nerds.png) # How to set up a DEV environment @@ -15,9 +15,29 @@ pip3 install pipenv This will make sure that `pipenv` uses your latest version of Python3, which is hopefully 3.6 or higher. Please refer to the [official website](https://docs.pipenv.org/) for more information on `pipenv`. -A Makefile has been created for convenience, so that you can install the project dependencies, download the required models, test and build the tool easily. +A Makefile has been created for convenience, so that you can install the project dependencies, download the required models, test and build the tool easily. Note that this is the preferred environment setup approach, the `Pipfile` and `Pipfile.lock` files ensure that you automatically have access to the installed packages in `requirements.txt` after you do a `make install` (see below). -### Makefile specifications +## Setting up the environment using `conda` + +Alternatively, if you are using the [Anaconda distribution of Python](https://www.anaconda.com/), you can also use `conda` to create an environment using the following command: + +``` +conda create -n nerds python=3.6 anaconda +``` + +You can then enter the newly created conda environment using the following command. 
After you run the various `make ...` commands, the packages listed in `requirements.txt` and the downloaded models will only be visible inside the `nerds` environment. This approach is usually preferred since it can help prevent version collisions between different environments, at the cost of more disk space. + +``` +conda activate nerds +``` + +and exit the environment using the following command. + +``` +conda deactivate +``` + +## Makefile specifications To install all of the required packages for development and testing run: @@ -55,37 +75,48 @@ NERDS is a framework that provides some NER capabilities - among which the optio ## Understanding the main data exchange classes -There are 3 main classes in the `nerds.core.model.input.*` package that are used in our NER models: `Document`, `Annotation` and `AnnotatedDocument`. - -A `Document` class is the abstract representation of a raw document. It should always implement the `plain_text_` attribute, that returns the plain text representation of the object, as it's the one where we are going to perform NER. Therefore, whenever we want to process any new type of document format - XML, PDF, JSON, brat, etc. - the only requirement is to write an adapter that reads the file(s) from an input directory and transforms them to `Document` objects. The default `Document` object works seamlessly with `.txt` files. +The NERDS master project on [elsevierlabs-os/nerds](https://github.com/elsevierlabs-os/nerds) project uses a set of custom data exchange classes `Document`, `Annotation`, and `AnnotatedDocument`. The project provided a set of conversion utilities which could be used to convert provided datasets to this format, and convert instances of these classes back to whatever format the underlying wrapped NER model needed. However, this NERDS fork on [sujitpal/nerds](https://github.com/sujitpal/nerds) eliminates this requirement -- the internal format is just a list of list of tokens (words in sentence) or BIO tags. The utility function `nerds.utils.load_data_and_labels` can read a file in CoNLL BIO format and convert to this internal format. This decision was made because 3 of the 5 provided models consume the list of list format natively, and the result is fewer lines of extra code and less potential for error. -The `Annotation` class contains the data for a single annotation. This is the text (e.g. "fox"), the label (e.g. "ANIMAL") and the offsets that correspond to offsets in the `plain_text_` representation of a `Document` (e.g. 40-42). +In general, when given an input format that is not in CoNLL BIO format, the main effort in using NERDS would be to convert it to CoNLL BIO format. Once that is done, it is relatively easy to ingest it into a data and label structure, as shown below. -> **Important to note**: The offsets is a 2-tuple of integers that represent the position of the first and the last character of the annotation. Be careful, because some libraries end the offset one character **after** the final character i.e. at `start_offset + len(word)`. This is not the case with us, we currently end the offsets at **exactly** the final character i.e. at `start_offset + len(word) - 1`. +```python +from nerds.utils import load_data_and_labels -Finally, the `AnnotatedDocument` class is a combination of `Document` and a list of `Annotation`, and it can represent two things: +data, labels = load_data_and_labels("nerds/test/data/example.iob") +print("data:", data) +print("labels:", labels) +``` -* Ground truth data (e.g. brat annotation files). 
-* Predictions on documents after they run through our NER models. +yields the following output. -The `AnnotatedDocument` class exposes the `annotated_text_` attribute which returns the plain text representation of the document with inline annotations. +``` +data: [ + ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.'], + ['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.', ',', 'the', 'Dutch', 'publishing', 'group', '.'] +] +labels [ + ['B-PER', 'I-PER', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', 'O'], + ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O'] +] +``` ## Extending the base model class -The basic class that every model needs to extend is the `NERModel` class in the `nerds.core.model.ner.base` package. The model class implements a `fit - transform` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum: +The basic class that every model needs to extend is the `NERModel` class in the `nerds.models` package. The model class implements a `fit - predict` API, similarly to `sklearn`. To implement a new model, one must extend the following methods at minimum: -* `fit`: Trains a model given a list of `AnnotatedDocument` objects. -* `transform`: Gets a list of `Document` objects and transforms them to `AnnotatedDocument`. -* `save`: Disk persistence of a model. -* `load`: Disk persistence of a model. +* `fit(X, y)`: Trains a model given a list of list of tokens X and BIO tags y. +* `predict(X)`: Returns a list of list of BIO tags, given a list of list of tokens X. +* `save(dirpath)`: Saves model to directory given by dirpath. +* `load(dirpath)`: Retrieves model from directory given by dirpath. -Please note that **all** of the class methods, utility functions, etc. should operate on `Document` and `AnnotatedDocument` objects, to maintain compatibility with the rest of the framework. The only exception is "private" methods used internally in classes. +As a best practice, I like to implement a single NER model (or group of related NER models) as a single file in the `models` folder, but have it be accessible from client code directly as `nerds.models.CustomNER`. You can set this redirection up in `nerds/models/__init__.py`. # Running experiments -So, let's assume you have a dataset that contains annotated text. If it's in a format that is already supported (e.g. [brat](http://brat.nlplab.org/standoff.html)), then you may just load it into `AnnotatedDocument` objects using the built-in classes. Otherwise, you will have to extend the `nerds.core.model.input.DataInput` class to support the format. Then, you may use the built-in NER models (or create your own) either alone, or in an ensemble and evaluate their predictive capabilities on your dataset. +There are two examples of running experiments using NERDS. We will continue to update these examples as more functionality becomes available. -In the `nerds.core.model.evaluate` package, there are helper methods and classes to perform k-fold cross-validation. Please, refer to the `nerds.examples` package where you may look at working code examples with real datasets. +* [examples/GMB](examples/GMB) +* [examples/BioNLP](examples/BioNLP) # Contributing to the project @@ -93,3 +124,16 @@ New models and input adapters are always welcome. 
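For reference, the CoNLL BIO file consumed by `nerds.utils.load_data_and_labels` (and produced by the `converters/brat2iob.py` script added in this change) is plain text with one token and its tag per line, separated by a tab, and a blank line between sentences. A short illustrative fragment (token/tag content assumed, mirroring the example output above):

```
Pierre	B-PER
Vinken	I-PER
,	O
61	B-DATE
years	I-DATE
old	I-DATE
...
```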
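To make the "Extending the base model class" section above concrete, here is a minimal sketch of a custom model that satisfies the `fit` / `predict` / `save` / `load` contract. The class name `MajorityTagNER` and its trivial tagging strategy are purely illustrative (they are not part of NERDS); a real model would wrap an actual NER backend.

```python
import collections
import os

import joblib

from nerds.models import NERModel


class MajorityTagNER(NERModel):
    """ Toy model: tags every token with the most frequent tag seen in training. """

    def __init__(self):
        super().__init__()
        self.majority_tag_ = None

    def fit(self, X, y):
        # X is a list of list of tokens, y is a list of list of BIO tags
        tag_counts = collections.Counter(t for sent_tags in y for t in sent_tags)
        self.majority_tag_ = tag_counts.most_common(1)[0][0]
        return self

    def predict(self, X):
        # return one tag per token, preserving the sentence structure
        return [[self.majority_tag_ for _ in sentence] for sentence in X]

    def save(self, dirpath):
        os.makedirs(dirpath, exist_ok=True)
        joblib.dump(self.majority_tag_, os.path.join(dirpath, "model.pkl"))

    def load(self, dirpath):
        self.majority_tag_ = joblib.load(os.path.join(dirpath, "model.pkl"))
        return self
```

Following the best-practice note above, such a class would live in its own file under `nerds/models/` and be re-exported from `nerds/models/__init__.py` (e.g. `from nerds.models.majority import MajorityTagNER` plus an entry in `__all__`), so that client code can simply write `from nerds.models import MajorityTagNER`.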
Please make sure your code is * `make test` shows that all the unit test pass. * `make lint` shows no Python code violations. + +The [CONTRIBUTING.md file](docs/CONTRIBUTING.md) lists contributors who have contributed to the [NERDS (elsevierlabs-os/nerds)](https://github.com/elsevierlabs-os/nerds) project. + +# Changes / Improvements in this Fork + +The [CHANGES.md file](docs/CHANGES.md) lists the changes and improvements that were made in this fork. + +# Talks and Blogs + +* \[slides\] [Slides for talk at PyData LA 2019](https://www.slideshare.net/sujitpal/building-named-entity-recognition-models-efficiently-using-nerds). +* \[video\] [Video of talk at PyData LA 2019](https://www.youtube.com/watch?v=ilzFiK0nAh8). +* \[blog\] [Incorporating third party NER (Flair) into NERDS](https://sujitpal.blogspot.com/2019/12/incorporating-flair-ner-into-nerds.html). +* \[blog\] [Adding a Transformer based NER model into NERDS](https://sujitpal.blogspot.com/2020/01/adding-transformer-based-ner-model-into.html). diff --git a/converters/brat2iob.py b/converters/brat2iob.py new file mode 100644 index 0000000..a3a6e52 --- /dev/null +++ b/converters/brat2iob.py @@ -0,0 +1,178 @@ +import argparse +import operator +import os +import re +import shutil +import spacy +import tempfile + +from nerds.utils import spans_to_tokens, get_logger + +def segment_text_to_sentences(text_file, sentence_splitter): + """ Segment text into sentences. Text is provided by BRAT in .txt + file. + + Args: + text_file (str): the full path to the BRAT .txt file. + sentence_splitter (spacy LM): SpaCy EN language model. + + Returns: + sentences (list((int, int, str))): list of sentence spans. + Spans are triples of (start_offset, end_offset, text), + where offset is relative to the text. + """ + sentences = [] + ftext = open(text_file, "r") + for line in ftext: + splits = sentence_splitter(line.strip()) + for sent in splits.sents: + sentences.append((sent.start_char, sent.end_char, sent.text)) + ftext.close() + return sentences + + +def parse_text_annotations(ann_file): + """ Parses BRAT annotations provided in the .ann file and converts them + to annotation spans of (start_position, end_position, entity_class). + + Args: + ann_file (str): full path to the BRAT .ann file. + + Returns: + annotations (list((int, int, str))): list of annotation spans. + Spans are triples of (start_offset, end_offset, entity_class) + where offset is relative to the text. + """ + annots = [] + fann = open(ann_file, "r") + for line in fann: + cols = re.split(r"\s+", line.strip()) + if not cols[0].startswith("T"): + continue + annots.append((int(cols[2]), int(cols[3]), cols[1])) + fann.close() + return annots + + +def apply_annotations(sentences, annotations, tokenizer): + """ Apply annotation spans to the sentence spans to create a list of tokens + and tags. + + Args: + sentences (list((int, int, str))): list of sentence spans. + annotations (list((int, int, str))): list of annotation spans. + tokenizer (spacy LM): SpaCy EN language model. + + Returns: + tokens_tags_list (list((list(str), list(str)))): list of list of token + tag pairs. Each list of token-tag pairs corresponds to a single + sentence. 
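+
+    Note:
+        The annotation offsets returned by parse_text_annotations are
+        document-relative; they are shifted to sentence-relative offsets
+        here before being handed to spans_to_tokens.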
+ """ + tokens_tags_list = [] + for sent_start, sent_end, sent_text in sentences: + sent_annots = [a for a in annotations if a[0] >= sent_start and a[1] <= sent_end] + # convert document offsets to sentence offsets + sent_annots = [(s[0] - sent_start, s[1] - sent_start, s[2]) for s in sent_annots] + tokens, tags = spans_to_tokens(sent_text, sent_annots, tokenizer) + tokens_tags_list.append(zip(tokens, tags)) + return tokens_tags_list + + +def convert_brat_to_iob(input_dir, output_file, nlp): + """ Convenience Convertor function. + + Args: + input_dir (str): the directory where the BRAT .txt and .ann files + are located. + output_file (str): the full path name of file to write output in + IOB format to. + nlp (SpaCy LM): reference to the SpaCy EN model. + + Returns: + None. + """ + fout = open(output_file, "w") + for text_file in os.listdir(input_dir): + # only process .txt and .ann pairs in specified directory + if not text_file.endswith(".txt"): + continue + annot_file = text_file[:-4] + ".ann" + if not os.path.exists(os.path.join(input_dir, annot_file)): + # do not process file if no corresponding .ann file + continue + # process file pair + logger.info("Processing file: {:s}".format(text_file)) + sentences = segment_text_to_sentences(os.path.join(input_dir, text_file), nlp) + annotations = parse_text_annotations(os.path.join(input_dir, annot_file)) + tokens_tags_list = apply_annotations(sentences, annotations, nlp) + for tokens_tags in tokens_tags_list: + for token, tag in tokens_tags: + fout.write("{:s}\t{:s}\n".format(token, tag)) + fout.write("\n") + + fout.close() + + +def do_self_test(nlp): + """ Simple self-test with small dataset to prove that this works okay. """ + text = "Pierre Vinken, 61 years old, will join the board as a nonexecutive director, Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group." + annotations = [ + "T1 PER 0 13 Pierre Vinken", + "T2 PER 86 96 Mr. Vinken", + "T3 DATE 15 27 61 years old", + "T4 DATE 77 84 Nov. 29", + "T5 ORG 112 125 Elsevier N.V.", + "T6 NORP 131 136 Dutch" + ] + input_dir = tempfile.mkdtemp(dir="/tmp") + ftext = open(os.path.join(input_dir, "test.txt"), "w") + ftext.write(text) + ftext.close() + fann = open(os.path.join(input_dir, "test.ann"), "w") + for line in annotations: + fann.write(line + "\n") + fann.close() + output_file = os.path.join(input_dir, "test.iob") + convert_brat_to_iob(input_dir, output_file, nlp) + fout = open(output_file, "r") + for line in fout: + logger.warn(line.strip()) + shutil.rmtree(input_dir) + + +################################ main ################################ +# +# usage: brat2iob.py [-h] [-i INPUT_DIR] [-o OUTPUT_FILE] [-t] +# Script to convert BRAT annotations to IOB (NERDS) format. +# optional arguments: +# -h, --help show this help message and exit +# -i INPUT_DIR, --input_dir INPUT_DIR +# Directory to store BRAT .txt and .ann files. +# -o OUTPUT_FILE, --output_file OUTPUT_FILE +# Output file to write IOB output to. +# -t, --test Runs self test. 
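+#
+# Example invocation (paths are illustrative):
+#   python brat2iob.py -i /path/to/brat_corpus -o /path/to/output.iob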
+###################################################################### + +parser = argparse.ArgumentParser( + description="Script to convert BRAT annotations to IOB (NERDS) format.") +parser.add_argument("-i", "--input_dir", help="Directory to store BRAT .txt and .ann files.") +parser.add_argument("-o", "--output_file", help="Output file to write IOB output to.") +parser.add_argument("-t", "--test", help="Runs self test.", action="store_true") +args = parser.parse_args() + +logger = get_logger() + +input_dir = args.input_dir +output_file = args.output_file +self_test = args.test + +nlp = spacy.load("en") + +if self_test: + logger.info("Executing self test...") + do_self_test(nlp) +else: + logger.info("Reading BRAT .txt and .ann files from: {:s}".format(input_dir)) + logger.info("Writing IOB tokens/tags to file: {:s}".format(output_file)) + convert_brat_to_iob(input_dir, output_file, nlp) + diff --git a/docs/CHANGES.md b/docs/CHANGES.md new file mode 100644 index 0000000..255dc5d --- /dev/null +++ b/docs/CHANGES.md @@ -0,0 +1,51 @@ +# Improvements and Changes + +## Completed + +* Replace AnnotatedDocument common data format to List of List format borrowed from Anago. +* Removes dependency on NLTK +* Model + * NERModel -- base class extends ClassifierMixin, so exposes predict() instead of transform(). + * DictionaryNER + * similar to ExactMatchDictionaryNER except + * takes Anago style IO + * handles multiple classes (as well as single class as special case) + * can handle Anago style input via fit(X, y, combine_tokens=True) and dictionary style input via fit(X, y, combine_tokens=False). + * CrfNER + * similar to CRF except + * takes Anago style IO (native IO format to wrapped model sklearn_crfsuite.CRF) + * replaces dependency on nltk.tokenize_pos() to SpaCy + * allows features to be directly passed to fit() using is_featurized=False. + * SpacyNER + * similar to SpacyStatisticalNER, except + * takes Anago style IO + * more robust to large data sizes, uses mini-batches for training + * BiLstmCrfNER + * similar to BidirectionalLSTM except + * takes Anago style IO + * works against most recent Anago API changes + * does not give timestep size errors + * ElmoNER + * New, available in Anago DEV repo, same API as Anago's BiLSTMCRF + * FlairNER + * New, incorporated from the [Zalando Flair project](https://github.com/flairNLP/flair). + * TransformerNER + * New, provides support for transformer based NERs using choice of BERT, RoBERTa, DistilBERT, CamemBERT, and XLM-RoBERTa language models, uses the [SimpleTransformers library](https://pypi.org/project/simpletransformers/). + * EnsembleNER + * simpler interface + * weights from each classifier + * fit() and predict() can use multiple parallel jobs (`n_jobs`). +* Utils + * Thin wrapper over anago's `load_data_and_labels` + * `flatten_list` and `unflatten_list` to convert between `list(list(str))` produced by NERDS models and `list(str)` required by `sklearn`, scikit-learn metrics can be used. + * `tokens_to_spans` and `spans_to_tokens` -- utility functions to convert between sentence and span format (used by the other 2 of 5 provided models) from and to BIO format. +* Converters + * Converter from BRAT (.txt and .ann) to IOB format +* Miscellaneous + * replaced deprecated sklearn.external.joblib -> joblib + * True Scikit-Learn interoperability -- moved parameters to constructor. However, `sklearn.utils.check_estimator` still fails, most likely because the parameters to `fit()` and `predict()` are `list(list(str))` rather than `list(str)`. 
+ * Docs converted to Numpy Docstring format. + +## Planned + + diff --git a/CONTRIBUTING.md b/docs/CONTRIBUTING.md similarity index 100% rename from CONTRIBUTING.md rename to docs/CONTRIBUTING.md diff --git a/nerds.png b/docs/nerds.png similarity index 100% rename from nerds.png rename to docs/nerds.png diff --git a/examples/BioNLP/README.md b/examples/BioNLP/README.md new file mode 100644 index 0000000..ac4d7f3 --- /dev/null +++ b/examples/BioNLP/README.md @@ -0,0 +1,153 @@ +# Dataset description + +Data comes from the [GENIA Project page for BioNLP / JNLPBA Shared Task 2004](http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004). The page describes the provenance and characteristics of the data. + +In addition, [GloVe (Global Vectors for Word Representation) vectors](https://nlp.stanford.edu/projects/glove/) are needed to run the ElmoNER model. + +To make the data available for use by our example, execute the script `data_prep.sh` in the current directory. This script will create a `data` directory, and also download the GloVe vectors needed by the example. + +## Entity distribution + +``` + 25307 DNA + 2481 RNA + 11217 cell_line + 15466 cell_type + 55117 protein +``` + +## Training + +Our example will use the `data/train/Genia4ERtask1.iob2` file for training, and the `data/test/Genia4EReval1.iob2` file for evaluation. Both files are already in BIO format. Entity distribution shown above is for training data. + +## Results + +### Dictionary NER (from_dictionary=False) + +``` + precision recall f1-score support + + cell_line 0.63 0.47 0.54 1489 + cell_type 0.71 0.63 0.67 4912 + protein 0.72 0.65 0.69 9841 + DNA 0.63 0.50 0.56 2845 + RNA 0.57 0.46 0.51 305 + + micro avg 0.70 0.61 0.65 19392 + macro avg 0.65 0.54 0.59 19392 +weighted avg 0.70 0.61 0.65 19392 +``` + +### CRF NER (c1=0.1, c2=0.1, max_iter=100, featurizer=Default) + +``` + precision recall f1-score support + + cell_line 0.58 0.70 0.63 1489 + cell_type 0.88 0.71 0.79 4912 + protein 0.79 0.80 0.80 9841 + DNA 0.77 0.73 0.75 2845 + RNA 0.77 0.72 0.74 305 + + micro avg 0.79 0.76 0.77 19392 + macro avg 0.76 0.73 0.74 19392 +weighted avg 0.79 0.76 0.77 19392 +``` + +### SpaCy NER (dropout=0.1, max_iter=20, batch_size=32) + +``` + precision recall f1-score support + + cell_line 0.56 0.76 0.65 1489 + cell_type 0.89 0.66 0.76 4912 + protein 0.78 0.84 0.81 9841 + DNA 0.77 0.76 0.77 2845 + RNA 0.77 0.76 0.77 305 + + micro avg 0.78 0.78 0.78 19392 + macro avg 0.76 0.76 0.75 19392 +weighted avg 0.79 0.78 0.78 19392 +``` + +### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, max_iter=10) + +``` + precision recall f1-score support + + cell_line 0.53 0.77 0.63 1489 + cell_type 0.88 0.71 0.78 4912 + protein 0.81 0.78 0.79 9841 + DNA 0.73 0.83 0.78 2845 + RNA 0.80 0.78 0.79 305 + + micro avg 0.78 0.77 0.77 19392 + macro avg 0.75 0.77 0.76 19392 +weighted avg 0.79 0.77 0.78 19392 +``` + +### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, max_iter=2) + +``` + precision recall f1-score support + + cell_line 0.53 0.73 0.61 1489 + cell_type 0.85 0.75 0.79 4912 + protein 0.80 0.87 0.83 9841 + DNA 0.77 0.86 0.81 2845 + RNA 0.77 0.86 0.81 305 + + micro avg 0.78 0.82 0.80 19392 + macro avg 0.74 0.81 0.77 19392 
+weighted avg 0.79 0.82 0.80 19392 +``` + +### FLAIR NER (hidden_dim=256, embeddings=StackedEmbeddings(WordEmbeddings("glove"), CharEmbeddings()), use_crf=True, use_rnn=True, num_rnn_layers=1, dropout=0.0, word_dropout=0.05, locked_dropout=0.5, optimizer="sgd", learning_rate=0.1, batch_size=32, max_iter=10) + +``` + precision recall f1-score support + + cell_line 0.47 0.75 0.58 1489 + cell_type 0.88 0.63 0.74 4912 + protein 0.82 0.77 0.80 9841 + DNA 0.79 0.76 0.77 2845 + RNA 0.76 0.80 0.78 305 + + micro avg 0.78 0.73 0.76 19392 + macro avg 0.74 0.74 0.73 19392 +weighted avg 0.80 0.73 0.76 19392 + +``` + +### Transformer NER (lang_model_family="bert", lang_model_name="bert-base-cased", max_sequence_length=128, batch_size=32, max_iter=4, learning_rate=4e-5, padding_tag="O", random_state=42) + +``` + precision recall f1-score support + + cell_line 0.80 0.60 0.68 1977 + cell_type 0.75 0.89 0.81 4161 + protein 0.88 0.81 0.84 10700 + DNA 0.84 0.82 0.83 2912 + RNA 0.85 0.79 0.82 325 + + micro avg 0.83 0.81 0.82 20075 + macro avg 0.82 0.78 0.80 20075 +weighted avg 0.84 0.81 0.82 20075 + +``` + +### Majority voting ensemble (pre-trained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) + +``` + precision recall f1-score support + + cell_line 0.67 0.70 0.69 1489 + cell_type 0.91 0.69 0.78 4912 + protein 0.83 0.77 0.80 9841 + DNA 0.83 0.74 0.78 2845 + RNA 0.81 0.73 0.77 305 + + micro avg 0.84 0.74 0.78 19392 + macro avg 0.81 0.73 0.76 19392 +weighted avg 0.84 0.74 0.78 19392 +``` diff --git a/examples/BioNLP/data_prep.sh b/examples/BioNLP/data_prep.sh new file mode 100755 index 0000000..7d8bdb9 --- /dev/null +++ b/examples/BioNLP/data_prep.sh @@ -0,0 +1,24 @@ +#!/bin/bash +echo "Creating directories..." +mkdir data +cd data +mkdir train test + +echo "Downloading training data..." +cd train +curl -O http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz +tar xvf Genia4ERtraining.tar.gz +rm Genia4ERtraining.tar.gz + +echo "Downloading test data..." +cd ../test +curl -O http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz +tar xvf Genia4ERtest.tar.gz +rm Genia4ERtest.tar.gz + +cd ../.. +echo "Downloading GloVe embeddings..." 
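+# glove.6B.zip bundles the 50d/100d/200d/300d vectors; everything except
+# glove.6B.100d.txt (the file the ElmoNER example looks for) is removed below.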
+wget http://nlp.stanford.edu/data/glove.6B.zip +unzip -a glove.6B.zip +rm glove.6B.200d.txt glove.6B.300d.txt glove.6B.50d.txt glove.6B.zip + diff --git a/examples/BioNLP/test_models.py b/examples/BioNLP/test_models.py new file mode 100644 index 0000000..3d6be2a --- /dev/null +++ b/examples/BioNLP/test_models.py @@ -0,0 +1,136 @@ +import os +import shutil + +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from sklearn.utils import shuffle + +from nerds.models import ( + DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, + ElmoNER, FlairNER, BertNER, TransformerNER, + EnsembleNER +) +from nerds.utils import * + +# these are our entities +entity_labels = ["cell_line", "cell_type", "protein", "DNA", "RNA"] + +# load data +xtrain, ytrain = load_data_and_labels("data/train/Genia4ERtask1.iob2") +xtest, ytest = load_data_and_labels("data/test/Genia4EReval1.iob2") +print(len(xtrain), len(ytrain), len(xtest), len(xtest)) + +# make model directory to store our models +if not os.path.exists("models"): + os.makedirs("models") + +# train and test the Dictionary NER +model = DictionaryNER() +model.fit(xtrain, ytrain) +model.save("models/dict_model") +trained_model = model.load("models/dict_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the CRF NER +model = CrfNER() +model.fit(xtrain, ytrain) +model.save("models/crf_model") +trained_model = model.load("models/crf_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the SpaCy NER +model = SpacyNER() +model.fit(xtrain, ytrain) +model.save("models/spacy_model") +trained_model = model.load("models/spacy_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the BiLSTM-CRF NER +model = BiLstmCrfNER() +model.fit(xtrain, ytrain) +model.save("models/bilstm_model") +trained_model = model.load("models/bilstm_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the ELMo NER +if os.path.exists("glove.6B.100d.txt"): + model = ElmoNER() + model.fit(xtrain, ytrain) + model.save("models/elmo_model") + trained_model = model.load("models/elmo_model") + ypred = trained_model.predict(xtest) + print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the FLAIR NER +model = FlairNER("models/flair_model") +model.fit(xtrain, ytrain) +model.save("models/flair_model") +trained_model = model.load("models/flair_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the BERT NER +model = BertNER(padding_tag="X") +model.fit(xtrain, ytrain) +model.save("models/bert_model") +trained_model = model.load("models/bert_model") +ypred = trained_model.predict(xtest) +ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X") +print(classification_report(flatten_list(ytest, strip_prefix=True), + 
flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the Transformers NER +model = TransformerNER( + model_dir="models/transformer_model", + padding_tag="X") +model.fit(xtrain, ytrain) +model.save() +trained_model = model.load() +ypred = trained_model.predict(xtest) +ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X") +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# create and test an ensemble +dict_model = DictionaryNER() +dict_model.load("models/dict_model") +crf_model = CrfNER() +crf_model.load("models/crf_model") +spacy_model = SpacyNER() +spacy_model.load("models/spacy_model") +bilstm_model = BiLstmCrfNER() +bilstm_model.load("models/bilstm_model") +estimators = [ + ("dict_model", dict_model), + ("crf_model", crf_model), + ("spacy_model", spacy_model), + ("bilstm_model", bilstm_model) +] +model = EnsembleNER(estimators=estimators, is_pretrained=True) +ypred = model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# clean up +shutil.rmtree("models") +shutil.rmtree("data") +os.remove("glove.6B.100d.txt") \ No newline at end of file diff --git a/examples/GMB/README.md b/examples/GMB/README.md new file mode 100644 index 0000000..3441236 --- /dev/null +++ b/examples/GMB/README.md @@ -0,0 +1,184 @@ +# Dataset description + +Annotated Corpus for Named Entity Recognition using GMB (Groningen Meaning Bank) corpus for entity classification with enhanced and popular features by Natural Language Processing applied to the data set. Downloaded from [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) to `train.csv` file locally. + +In addition, [GloVe (Global Vectors for Word Representation) vectors](https://nlp.stanford.edu/projects/glove/) are needed to run the ElmoNER model, please download them by running the provided `download_glove.sh` script. + +## Overall number of entities + +``` + 699 art + 561 eve + 45058 geo + 16068 gpe + 252 nat + 36927 org + 34241 per + 26861 tim +``` + +## Training + +We train with the full set of data, and the entire run across all the provided models can be fairly time consuming. If it is desired to keep the training time reasonable, you can train only with 5000 sentences by uncommenting lines 47-49 in `test_models.py`. + +## Results + +### Dictionary NER (from_dictionary=False) + +``` + precision recall f1-score support + + art 0.01 0.15 0.02 215 + eve 0.22 0.43 0.29 169 + geo 0.35 0.74 0.48 13724 + gpe 0.93 0.90 0.92 4850 + nat 0.27 0.53 0.36 94 + org 0.41 0.67 0.51 10884 + per 0.77 0.74 0.75 10342 + tim 0.14 0.92 0.25 8140 + + micro avg 0.32 0.77 0.45 48418 + macro avg 0.39 0.64 0.45 48418 +weighted avg 0.48 0.77 0.55 48418 +``` + +### CRF NER (c1=0.1, c2=0.1, max_iter=100, featurizer=Default) + +``` + precision recall f1-score support + + art 0.28 0.05 0.08 215 + eve 0.54 0.33 0.41 169 + geo 0.87 0.89 0.88 13724 + gpe 0.95 0.92 0.94 4850 + nat 0.71 0.32 0.44 94 + org 0.80 0.78 0.79 10884 + per 0.88 0.88 0.88 10342 + tim 0.93 0.87 0.90 8140 + + micro avg 0.87 0.85 0.86 48418 + macro avg 0.74 0.63 0.66 48418 +weighted avg 0.87 0.85 0.86 48418 + +``` + +The entity types which have enough examples have good results! 
+ +### SpaCy NER (dropout=0.1, max_iter=20, batch_size=32) + +``` + precision recall f1-score support + + art 0.26 0.07 0.10 215 + eve 0.61 0.24 0.34 169 + geo 0.87 0.87 0.87 13724 + gpe 0.94 0.93 0.93 4850 + nat 0.87 0.28 0.42 94 + org 0.79 0.77 0.78 10884 + per 0.85 0.90 0.88 10342 + tim 0.96 0.83 0.89 8140 + + micro avg 0.87 0.85 0.86 48418 + macro avg 0.77 0.61 0.65 48418 +weighted avg 0.87 0.85 0.86 48418 + +``` + +### BiLSTM-CRF NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, use_char=True, use_crf=True, batch_size=16, learning_rate=0.001, max_iter=10) + +``` + precision recall f1-score support + + art 0.25 0.09 0.14 215 + eve 0.37 0.29 0.33 169 + geo 0.84 0.89 0.87 13724 + gpe 0.95 0.93 0.94 4850 + nat 0.71 0.31 0.43 94 + org 0.84 0.72 0.77 10884 + per 0.87 0.90 0.89 10342 + tim 0.89 0.89 0.89 8140 + + micro avg 0.86 0.85 0.86 48418 + macro avg 0.72 0.63 0.66 48418 +weighted avg 0.86 0.85 0.85 48418 + +``` + +### ELMo NER (word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, embeddings=None, embeddings_file="glove.6B.100d.txt", batch_size=16, learning_rate=0.001, max_iter=2) + +``` + precision recall f1-score support + + art 0.13 0.15 0.14 215 + eve 0.35 0.46 0.40 169 + geo 0.88 0.89 0.88 13724 + gpe 0.94 0.94 0.94 4850 + nat 0.71 0.21 0.33 94 + org 0.82 0.76 0.79 10884 + per 0.86 0.93 0.89 10342 + tim 0.91 0.88 0.90 8140 + + micro avg 0.87 0.87 0.87 48418 + macro avg 0.70 0.65 0.66 48418 +weighted avg 0.87 0.87 0.86 48418 + +``` + +### FLAIR NER (hidden_dim=256, embeddings=StackedEmbeddings(WordEmbeddings("glove"), CharEmbeddings()), use_crf=True, use_rnn=True, num_rnn_layers=1, dropout=0.0, word_dropout=0.05, locked_dropout=0.5, optimizer="sgd", learning_rate=0.1, batch_size=32, max_iter=10) + +``` + precision recall f1-score support + + art 0.00 0.00 0.00 215 + eve 0.71 0.20 0.31 169 + geo 0.84 0.91 0.87 13724 + gpe 0.95 0.92 0.94 4850 + nat 0.50 0.06 0.11 94 + org 0.85 0.67 0.75 10884 + per 0.84 0.92 0.88 10342 + tim 0.90 0.88 0.89 8140 + + micro avg 0.86 0.84 0.85 48418 + macro avg 0.70 0.57 0.59 48418 +weighted avg 0.86 0.84 0.85 48418 + +``` + +### Transformer NER (lang_model_family="bert", lang_model_name="bert-base-cased", max_sequence_length=128, batch_size=32, max_iter=4, learning_rate=4e-5, padding_tag="O", random_state=42) + +``` + precision recall f1-score support + + art 0.11 0.24 0.15 97 + eve 0.41 0.55 0.47 126 + geo 0.90 0.88 0.89 14016 + gpe 0.94 0.96 0.95 4724 + nat 0.34 0.80 0.48 40 + org 0.80 0.81 0.81 10669 + per 0.91 0.90 0.90 10402 + tim 0.89 0.93 0.91 7739 + + micro avg 0.87 0.88 0.88 47813 + macro avg 0.66 0.76 0.69 47813 +weighted avg 0.88 0.88 0.88 47813 + +``` + +### Majority voting ensemble (pretrained Dictionary NER, CRF NER, SpaCy NER, and BiLSTM-CRF NER) + +``` + precision recall f1-score support + + art 0.17 0.08 0.11 215 + eve 0.47 0.22 0.30 169 + geo 0.83 0.87 0.85 13724 + gpe 0.98 0.89 0.93 4850 + nat 0.76 0.31 0.44 94 + org 0.84 0.64 0.73 10884 + per 0.93 0.71 0.81 10342 + tim 0.90 0.86 0.88 8140 + + micro avg 0.87 0.78 0.82 48418 + macro avg 0.73 0.57 0.63 48418 +weighted avg 0.87 0.78 0.82 48418 +``` diff --git a/examples/GMB/download_glove.sh b/examples/GMB/download_glove.sh new file mode 100755 index 0000000..4ecf263 --- /dev/null +++ b/examples/GMB/download_glove.sh @@ -0,0 +1,4 @@ +#!/bin/bash +wget http://nlp.stanford.edu/data/glove.6B.zip +unzip -a glove.6B.zip +rm glove.6B.200d.txt 
glove.6B.300d.txt glove.6B.50d.txt glove.6B.zip diff --git a/examples/GMB/test_models.py b/examples/GMB/test_models.py new file mode 100644 index 0000000..c1a583e --- /dev/null +++ b/examples/GMB/test_models.py @@ -0,0 +1,170 @@ +import csv +import os +import shutil + +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report +from sklearn.utils import shuffle + +from nerds.models import ( + DictionaryNER, SpacyNER, CrfNER, BiLstmCrfNER, + ElmoNER, FlairNER, BertNER, TransformerNER, + EnsembleNER +) +from nerds.utils import * + + +def convert_to_iob_format(input_file, output_file): + num_written = 0 + fout = open(output_file, "w") + with open(input_file, "r", encoding="iso-8859-1") as fin: + csv_reader = csv.reader(fin, delimiter=',', quotechar='"') + # skip header + next(csv_reader) + for line in csv_reader: + sid, token, pos, tag = line + if num_written > 0: + if len(sid) != 0: + # end of sentence marker + fout.write("\n") + fout.write("\t".join([token, tag]) + "\n") + num_written += 1 + + fout.write("\n") + fout.close() + + +# convert GMB dataset to our standard IOB format +if not os.path.exists("train.iob"): + convert_to_iob_format("train.csv", "train.iob") + +# these are our entities +entity_labels = ["art", "eve", "geo", "gpe", "nat", "org", "per", "tim"] + +# make model directory to store our models +if not os.path.exists("models"): + os.makedirs("models") + +# read IOB file +data, labels = load_data_and_labels("train.iob", encoding="iso-8859-1") +# optional: restrict dataset to 5000 sentences +# data_s, labels_s = shuffle(data, labels, random_state=42) +# data = data_s +# labels = labels_s +print(len(data), len(labels)) + +# split into train and test set +xtrain, xtest, ytrain, ytest = train_test_split(data, labels, + test_size=0.3, random_state=42) +print(len(xtrain), len(ytrain), len(xtest), len(ytest)) + +# train and test the dictionary NER +model = DictionaryNER() +model.fit(xtrain, ytrain) +model.save("models/dict_model") +trained_model = model.load("models/dict_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the CRF NER +model = CrfNER() +model.fit(xtrain, ytrain) +model.save("models/crf_model") +trained_model = model.load("models/crf_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the SpaCy NER +model = SpacyNER() +model.fit(xtrain, ytrain) +model.save("models/spacy_model") +trained_model = model.load("models/spacy_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the BiLSTM-CRF NER +model = BiLstmCrfNER() +model.fit(xtrain, ytrain) +model.save("models/bilstm_model") +trained_model = model.load("models/bilstm_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the ELMo NER +if os.path.exists("glove.6B.100d.txt"): + model = ElmoNER() + model.fit(xtrain, ytrain) + model.save("models/elmo_model") + trained_model = model.load("models/elmo_model") + ypred = trained_model.predict(xtest) + print(classification_report(flatten_list(ytest, 
strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the FLAIR NER +model = FlairNER("models/flair_model") +model.fit(xtrain, ytrain) +model.save("models/flair_model") +trained_model = model.load("models/flair_model") +ypred = trained_model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test the BERT NER +model = BertNER(padding_tag="X") +model.fit(xtrain, ytrain) +model.save("models/bert_model") +trained_model = model.load("models/bert_model") +ypred = trained_model.predict(xtest) +ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X") +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# train and test Transformer NER +model = TransformerNER( + model_dir="models/transformer_model", + padding_tag="X") +model.fit(xtrain, ytrain) +model.save() +trained_model = model.load() +ypred = trained_model.predict(xtest) +ytest, ypred = align_labels_and_predictions(ypred, ytest, padding_tag="X") +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# create and test an ensemble +dict_model = DictionaryNER() +dict_model.load("models/dict_model") +crf_model = CrfNER() +crf_model.load("models/crf_model") +spacy_model = SpacyNER() +spacy_model.load("models/spacy_model") +bilstm_model = BiLstmCrfNER() +bilstm_model.load("models/bilstm_model") +estimators = [ + ("dict_model", dict_model), + ("crf_model", crf_model), + ("spacy_model", spacy_model), + ("bilstm_model", bilstm_model) +] +model = EnsembleNER(estimators=estimators, is_pretrained=True) +ypred = model.predict(xtest) +print(classification_report(flatten_list(ytest, strip_prefix=True), + flatten_list(ypred, strip_prefix=True), + labels=entity_labels)) + +# clean up +shutil.rmtree("models") +os.remove("train.iob") +os.remove("glove.6B.100d.txt") \ No newline at end of file diff --git a/nerds/examples/GMB/train.csv b/examples/GMB/train.csv similarity index 100% rename from nerds/examples/GMB/train.csv rename to examples/GMB/train.csv diff --git a/nerds/core/model/ner/dictionary.py b/nerds/core/model/ner/dictionary.py deleted file mode 100644 index b8ca078..0000000 --- a/nerds/core/model/ner/dictionary.py +++ /dev/null @@ -1,78 +0,0 @@ -from os.path import isfile - -import ahocorasick - -from nerds.core.model.input.annotation import Annotation -from nerds.core.model.input.document import AnnotatedDocument -from nerds.core.model.ner.base import NERModel -from nerds.util.logging import get_logger - -log = get_logger() - - -class ExactMatchDictionaryNER(NERModel): - - def __init__(self, path_to_dictionary_file, entity_label): - super().__init__(entity_label) - self.key = "em-dict" - - if path_to_dictionary_file is not None: - self.path_to_dictionary_file = path_to_dictionary_file - self._create_automaton() - else: - # Must get a dictionary as an input! - raise Exception("No dictionary provided!") - - def _create_automaton(self): - - if not isfile(self.path_to_dictionary_file): - raise Exception("%s is not a file." % self.path_to_dictionary_file) - - # Initialize automaton. - self.automaton = ahocorasick.Automaton() - - # Index counter. - count = 0 - - # Dictionary must be one word per line. 
- log.debug("Started loading dictionary at {}".format( - self.path_to_dictionary_file)) - with open(self.path_to_dictionary_file, 'r') as dict_file: - for search_expr in dict_file: - search_expr = search_expr.strip() - if search_expr != "": - self.automaton.add_word(search_expr, (count, search_expr)) - count += 1 - log.debug("Successfully loaded dictionary") - - self.automaton.make_automaton() - - def transform(self, X, y=None): - """ Annotates the list of `Document` objects that are provided as - input and returns a list of `AnnotatedDocument` objects. - - In a dictionary based approach, a dictionary of keywords is used - to create a FSA which is then used to search with. See [1]. - [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm - """ - annotated_documents = [] - for document in X: - annotations = [] - doc_content_str = document.plain_text_ - for item in self.automaton.iter(doc_content_str): - end_position, (index, word) = item - - start_position = (end_position - len(word) + 1) - end_position = end_position + 1 - - annotations.append(Annotation( - word, - self.entity_label, - (start_position, end_position))) - - annotated_documents.append(AnnotatedDocument( - document.content, - annotations=annotations, - encoding=document.encoding)) - - return annotated_documents diff --git a/nerds/examples/GMB/README.md b/nerds/examples/GMB/README.md deleted file mode 100644 index af94284..0000000 --- a/nerds/examples/GMB/README.md +++ /dev/null @@ -1,109 +0,0 @@ -# Dataset description - -Annotated Corpus for Named Entity Recognition using GMB (Groningen Meaning Bank) corpus for entity classification with enhanced and popular features by Natural Language Processing applied to the data set. - -# Source - -Downloaded from [Kaggle](https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus) - -# Overall number of entities - -```python -{'O': 1146068', - 'geo-nam': 58388, - 'org-nam': 48034, - 'per-nam': 23790, - 'gpe-nam': 20680, - 'tim-dat': 12786, - 'tim-dow': 11404, - 'per-tit': 9800, - 'per-fam': 8152, - 'tim-yoc': 5290, - 'tim-moy': 4262, - 'per-giv': 2413, - 'tim-clo': 891, - 'art-nam': 866, - 'eve-nam': 602, - 'nat-nam': 300, - 'tim-nam': 146, - 'eve-ord': 107, - 'per-ini': 60, - 'org-leg': 60, - 'per-ord': 38, - 'tim-dom': 10, - 'per-mid': 1, - 'art-add': 1} -``` - -## Training - -To keep the training time reasonable, we shuffle the sentences from the dataset (with a fixed seed) and train with only the first 5000 instances. - -## Results - -### CRF (max_iterations=100, c1=0.1, c2=0.1) - -``` -Label: art 0.1875 0.0410958904109589 0.06741573033707865 -Label: org 0.696551724137931 0.6350665054413543 0.6643896268184694 -Label: geo 0.763373190685966 0.7818744359932964 0.7725130556616991 -Label: nat 0.23076923076923078 0.07317073170731707 0.1111111111111111 -Label: gpe 0.9410415984277759 0.9048818897637795 0.9226075786769428 -Label: per 0.7281134401972873 0.7013064133016627 0.7144585601935873 -Label: eve 0.5348837209302325 0.3709677419354839 0.4380952380952381 -Label: tim 0.887613454351308 0.8149509803921569 0.849731663685152 -``` - -The entity types which have enough examples have good results! 
- -## Spacy (num_epochs=20, dropout=0.1) - -``` -Label: art 0.09090909090909091 0.0410958904109589 0.056603773584905655 -Label: org 0.6371129427489418 0.6756437514764942 0.6558128869525338 -Label: geo 0.8506005015177511 0.8454676636494818 0.8480263157894736 -Label: nat 0.3333333333333333 0.075 0.12244897959183673 -Label: gpe 0.9200373366521468 0.9295818924866395 0.9247849882720875 -Label: per 0.7063857801184991 0.6349112426035503 0.6687441570582736 -Label: eve 0.625 0.24193548387096775 0.3488372093023256 -Label: tim 0.8101326899879373 0.8220318237454101 0.8160388821385176 -``` - -## BiLSTM-CRF (char_emb_size=32, word_emb_size=128, char_lstm_units=32, word_lstm_units=128, dropout=0.1, batch_size=16, learning_rate=0.001, num_epochs=10) - -``` -Label: art 0.05714285714285714 0.0273972602739726 0.037037037037037035 -Label: org 0.29821791112113694 0.31824747231584016 0.3079073017351811 -Label: geo 0.6431408898305084 0.6262248581743166 0.634570159393781 -Label: nat 0.0 0.0 0.0 -Label: gpe 0.890119760479042 0.9167437557816837 0.9032356068661704 -Label: per 0.1862842070557204 0.15786532550991897 0.17090139140955837 -Label: eve 0.3870967741935484 0.1935483870967742 0.25806451612903225 -Label: tim 0.7139974779319042 0.671967718965108 0.6923453167033504 -``` - -## Pooling ensemble - -``` -Label: art 0.0759493670886076 0.0821917808219178 0.07894736842105263 -Label: org 0.398450340455506 0.7880195031344324 0.5292787524366471 -Label: geo 0.6124665775401069 0.9238719435341568 0.7366093859913576 -Label: nat 0.19230769230769232 0.125 0.15151515151515152 -Label: gpe 0.830937167199148 0.9550183598531212 0.8886674259681092 -Label: per 0.43697978596908443 0.813503043718871 0.5685554051440727 -Label: eve 0.37142857142857144 0.41935483870967744 0.393939393939394 -Label: tim 0.6545580349420516 0.888262910798122 0.7537097898615676 -``` - -## Majority voting ensemble - -``` -Label: art 0.4 0.0273972602739726 0.05128205128205128 -Label: org 0.7885109114249037 0.5937651039149348 0.6774193548387097 -Label: geo 0.882788868723533 0.7560880829015544 0.814540887524421 -Label: nat 1.0 0.024390243902439025 0.047619047619047616 -Label: gpe 0.9580103359173127 0.9286161552911709 0.9430842607313196 -Label: per 0.8227743271221533 0.5885663507109005 0.686237264721119 -Label: eve 0.8421052631578947 0.25806451612903225 0.3950617283950617 -Label: tim 0.9210599721059972 0.80615234375 0.8597838823069914 -``` diff --git a/nerds/examples/GMB/read_data.py b/nerds/examples/GMB/read_data.py deleted file mode 100644 index fe3f119..0000000 --- a/nerds/examples/GMB/read_data.py +++ /dev/null @@ -1,64 +0,0 @@ -import csv - -from nerds.core.model.input.document import Document -from nerds.util.convert import transform_bio_tags_to_annotated_document - - -PATH_TO_FILE = "train.csv" - - -def read_kaggle_data(): - sentences = [] - pos = [] - tag = [] - - tmp_sentence = [] - tmp_pos = [] - tmp_tag = [] - - with open(PATH_TO_FILE, "rt") as csvfile: - csv_reader = csv.reader(csvfile, delimiter=',', quotechar='"') - # Ignore the header - next(csv_reader) - - for row in csv_reader: - - if row[0].startswith("Sentence: "): - if len(tmp_sentence) != 0: - sentences.append(tmp_sentence) - pos.append(tmp_pos) - tag.append(tmp_tag) - - tmp_sentence = [] - tmp_pos = [] - tmp_tag = [] - - tmp_sentence.append(row[1]) - tmp_pos.append(row[2]) - tmp_tag.append(row[3].replace("-", "_")) - - if len(tmp_sentence) != 0: - sentences.append(tmp_sentence) - pos.append(tmp_pos) - - return sentences, pos, tag - - -def data_to_annotated_docs(): - sentences, pos, tags = 
read_kaggle_data() - - documents = [Document(u" ".join(sentence).encode("utf-8")) - for sentence in sentences] - - ann_docs = [] - for i in range(len(documents)): - try: - sentence = sentences[i] - tag = tags[i] - document = documents[i] - ann_docs.append(transform_bio_tags_to_annotated_document(sentence, - tag, - document)) - except IndexError: - continue - return ann_docs diff --git a/nerds/examples/GMB/test_models.py b/nerds/examples/GMB/test_models.py deleted file mode 100644 index 475326b..0000000 --- a/nerds/examples/GMB/test_models.py +++ /dev/null @@ -1,111 +0,0 @@ -import random -import shutil - -from nerds.core.model.ner.crf import CRF -from nerds.core.model.ner.spacy import SpaCyStatisticalNER -from nerds.core.model.ner.bilstm import BidirectionalLSTM -from nerds.core.model.ner.ensemble import ( - NERModelEnsembleMajorityVote, NERModelEnsemblePooling) -from nerds.core.model.eval.score import calculate_precision_recall_f1score - -from read_data import data_to_annotated_docs - -X = data_to_annotated_docs() -print("Loaded data: ", len(X), "data points") -random.Random(42).shuffle(X) - -entity_names = ['art', 'org', 'geo', 'nat', 'gpe', 'per', 'eve', 'tim'] -print("All labels: ", entity_names) - -train_test_split = 0.8 -train_X = X[:int(0.8 * len(X))] -test_X = X[int(0.8 * len(X)):] -print("Training: ", len(train_X)) -print("Training: ", len(test_X)) - - -def test_CRF(): - crf_model = CRF() - crf_model.fit(train_X[:5000]) - - X_pred = crf_model.transform(test_X) - - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - # Save for ensemble usage to avoid training again. - crf_model.save("tmp") - - -def test_spacy(): - spacy_model = SpaCyStatisticalNER() - # Using the entire dataset will make Spacy die! - spacy_model.fit(train_X[:5000]) - - X_pred = spacy_model.transform(test_X) - - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - # Save for ensemble usage to avoid training again. - spacy_model.save("tmp") - - -def test_LSTM(): - lstm_model = BidirectionalLSTM() - lstm_model.fit(train_X[:5000]) - - X_pred = lstm_model.transform(test_X) - - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - # Save for ensemble usage to avoid training again. - lstm_model.save("tmp") - - -def test_ensembles(): - lstm_model = BidirectionalLSTM() - lstm_model.load("tmp") - spacy_model = SpaCyStatisticalNER() - spacy_model.load("tmp") - crf_model = CRF() - crf_model.load("tmp") - - models = [lstm_model, crf_model, spacy_model] - ens1 = NERModelEnsembleMajorityVote(models) - ens2 = NERModelEnsemblePooling(models) - - X_pred_1 = ens1.transform(test_X) - print("Majority Vote: \n") - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred_1, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - X_pred_2 = ens2.transform(test_X) - print("Pooling: \n") - for l in entity_names: - p, r, f = calculate_precision_recall_f1score(X_pred_2, - test_X, - entity_label=l) - print("Label: ", l, p, r, f) - - -test_LSTM() -test_CRF() -test_spacy() -test_ensembles() - -# Clean-up the model dirs. 
-shutil.rmtree("tmp/") diff --git a/nerds/models/__init__.py b/nerds/models/__init__.py new file mode 100644 index 0000000..ed06df1 --- /dev/null +++ b/nerds/models/__init__.py @@ -0,0 +1,24 @@ +from nerds.models.base import NERModel + +from nerds.models.bert import BertNER +from nerds.models.bilstm import BiLstmCrfNER +from nerds.models.crf import CrfNER +from nerds.models.dictionary import DictionaryNER +from nerds.models.elmo import ElmoNER +from nerds.models.ensemble import EnsembleNER +from nerds.models.flair import FlairNER +from nerds.models.spacy import SpacyNER +from nerds.models.transformer import TransformerNER + +__all__ = [ + "NERModel", + "DictionaryNER", + "CrfNER", + "SpacyNER", + "BiLstmCrfNER", + "ElmoNER", + "FlairNER", + "BertNER", + "TransformerNER", + "EnsembleNER" +] diff --git a/nerds/models/base.py b/nerds/models/base.py new file mode 100644 index 0000000..f65c12f --- /dev/null +++ b/nerds/models/base.py @@ -0,0 +1,54 @@ +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score + +from nerds.utils import flatten_list + +class NERModel(BaseEstimator, ClassifierMixin): + """ Provides a basic interface to train NER models and annotate documents. + + This is the core class responsible for training models that perform + named entity recognition, and retrieving named entities from documents. + """ + def __init__(self): + pass + + def fit(self, X, y): + """ Train the model using data (X) and labels (y). Return trained model. + """ + raise NotImplementedError() + + def predict(self, X): + """ Makes predictions using trained model on data (X) and returns them. + """ + raise NotImplementedError() + + def save(self, file_path): + """ Saves a model to the local disk, provided a file path. + Should be overridden. + """ + raise NotImplementedError() + + def load(self, file_path): + """ Loads a model saved locally. Should be overridden. """ + raise NotImplementedError() + + def score(self, X, y, sample_weights=None): + """ Returns score for the model based on predicting on (X, y). This + method is needed for GridSearch like operations. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + y : list(list(str)) + list of list of tags + sample_weights : list(float), not used + + Returns + ------- + score: float + numeric score for estimator. + """ + y_pred = self.predict(X) + return accuracy_score(flatten_list(y), flatten_list(y_pred)) + diff --git a/nerds/models/bert.py b/nerds/models/bert.py new file mode 100644 index 0000000..56aa3f9 --- /dev/null +++ b/nerds/models/bert.py @@ -0,0 +1,523 @@ +import joblib +import numpy as np +import os +import random +import time +import torch + +from nerds.models import NERModel +from nerds.utils import flatten_list, get_logger, write_param_file + +from transformers import AdamW +from transformers import BertForTokenClassification, BertTokenizer +from transformers import get_linear_schedule_with_warmup + +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, + TensorDataset) + +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score + + +log = get_logger() + +class BertNER(NERModel): + + def __init__(self, + lang_model="bert-base-cased", + max_sequence_length=128, + learning_rate=2e-5, + batch_size=32, + max_iter=4, + padding_tag="O", + verbose=False, + random_state=42): + """ Construct a BERT NER model. 
Uses a pretrained BERT language model + and a Fine tuning model for NER is provided by the HuggingFace + transformers library. + + NOTE: this is an experimental NER that did not perform very well, and + is only here for reference purposes. It has been superseded by the + TransformerNER model, which offers the same functionality (and improved + performance) not only with BERT as the underlying language model (as this + one does), but allows other BERT-like language model backends as well. + + Parameters + ---------- + lang_model : str, optional, default "bert-base-cased" + pre-trained BERT language model to use. + max_sequence_length : int, optional, default 128 + maximum sequence length in tokens for input sentences. Shorter + sentences will be right padded and longer sentences will be + truncated. + learning_rate : float, optional, default 2e-5 + learning rate for the ADAM optimizer. + batch_size : int, optional, default 32 + batch size to use for training. + max_iter : int, default 4 + number of epochs to fine tune. + padding_tag : str, default "O" + tag to pad predictions with if len(tokens) > len(predicted_tags). + verbose : bool, optional, default False + whether to display log messages on console. + random_state : int, optional, default 42 + random state to set for repeatable results + + Attributes + ---------- + model_ : reference to underlying BertForTokenClassification object. + tokenizer_ : reference to underlying BertTokenizer object. + label2id_ : mapping from string labels to internal int ids. + id2label_ : mapping from internal int label ids to string labels. + train_losses_ : list(float) of per epoch training losses. + val_accs_ : list(float) of per epoch validation accuracies. + special_tokens_ : set of tokenizer special tokens. + """ + super().__init__() + # model parameters + self.lang_model = lang_model + self.max_sequence_length = max_sequence_length + self.learning_rate = learning_rate + self.batch_size = batch_size + self.max_iter = max_iter + self.padding_tag = padding_tag + self.verbose = verbose + self.random_state = random_state + self._set_random_state(random_state) + # model attributes + self.model_ = None + self.tokenizer_ = None + self.label2id_ = None + self.id2label_ = None + self.train_losses_ = None + self.val_accs_ = None + self.special_tokens_ = None + # hidden variables + self._pad_label_id = torch.nn.CrossEntropyLoss().ignore_index + self._device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + + + def fit(self, X, y): + """ Trains the NER model. Input is list of list of tokens and tags. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens + y : list(list(str)) + list of list of BIO tags. 
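+
+        A minimal illustrative sketch (the sentence and tags below are
+        made-up toy data; a realistically sized corpus is assumed for
+        actual training):
+
+            X = [["Pierre", "Vinken", "joined", "the", "board"], ...]
+            y = [["B-PER", "I-PER", "O", "O", "O"], ...]
+            model = BertNER(max_iter=4).fit(X, y)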
+ + Returns + ------- + self + """ + log.info("Converting data and labels to features...") + self.label2id_, self.id2label_ = self._build_label_id_mappings(y) + + Xtrain, Xval, ytrain, yval = train_test_split( + X, y, test_size=0.1, random_state=self.random_state) + + self.tokenizer_ = BertTokenizer.from_pretrained( + self.lang_model, do_basic_tokenize=False) + self.special_tokens_ = set([ + self.tokenizer_.pad_token, self.tokenizer_.unk_token, + self.tokenizer_.sep_token, self.tokenizer_.cls_token]) + + train_feats = self._data_labels_to_features(Xtrain, ytrain) + train_loader = self._create_dataloader(train_feats, "random") + val_feats = self._data_labels_to_features(Xval, yval) + val_loader = self._create_dataloader(val_feats, "sequential") + + log.info("Building model...") + self.model_ = BertForTokenClassification.from_pretrained(self.lang_model, + num_labels=len(self.label2id_), + output_attentions=False, + output_hidden_states=False) + self.model_.to(self._device) + + total_steps = len(train_loader) * self.max_iter + optimizer = AdamW(self.model_.parameters(), lr=self.learning_rate, eps=1e-8) + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps=0, num_training_steps=total_steps) + + self.train_losses_, self.val_accs_ = [], [] + for epoch in range(self.max_iter): + + log.info("==== Epoch {:d}/{:d}".format(epoch + 1, self.max_iter)) + log.info("Training...") + t0 = time.time() + total_loss = 0 + self.model_.train() + + for step, batch in enumerate(train_loader): + if step % 100 == 0: + elapsed = time.time() - t0 + log.info(" Batch {:d} of {:d}, elapsed: {:.3f}s".format( + step + 1, len(train_loader), elapsed)) + b_input_ids = batch[0].to(self._device) + b_attention_mask = batch[1].to(self._device) + b_token_type_ids = batch[2].to(self._device) + b_label_ids = batch[3].to(self._device) + + self.model_.zero_grad() + outputs = self.model_(b_input_ids, + attention_mask=b_attention_mask, + token_type_ids=b_token_type_ids, + labels=b_label_ids) + + loss = outputs[0] + total_loss += loss.item() + loss.backward() + + torch.nn.utils.clip_grad_norm_(self.model_.parameters(), 1.0) + optimizer.step() + scheduler.step() + + avg_train_loss = total_loss / len(train_loader) + self.train_losses_.append(avg_train_loss) + + log.info(" Average training loss: {:.3f}".format(avg_train_loss)) + log.info(" Training epoch took: {:.3f}s".format(time.time() - t0)) + + log.info("Validation...") + t0 = time.time() + self.model_.eval() + + val_acc, val_steps = 0, 0 + for batch in val_loader: + batch = tuple(b.to(self._device) for b in batch) + b_input_ids, b_attention_mask, b_token_type_ids, b_label_ids, _ = batch + with torch.no_grad(): + outputs = self.model_(b_input_ids, + attention_mask=b_attention_mask, + token_type_ids=b_token_type_ids) + logits = outputs[0].detach().cpu().numpy() + b_preds = np.argmax(logits, axis=-1).flatten() + b_labels = b_label_ids.detach().cpu().numpy().flatten() + b_val_acc = accuracy_score(b_preds, b_labels) + val_acc += b_val_acc + val_steps += 1 + + val_acc = val_acc / val_steps + + self.val_accs_.append(val_acc) + log.info(" Accuracy: {:.3f}".format(val_acc)) + log.info(" Validation took {:.3f}s".format(time.time() - t0)) + + log.info("==== Training complete ====") + return self + + + def predict(self, X): + """ Predicts using the NER model. 
Note that because of the + way BERT re-tokenizes incoming tokens to word-pieces, it + is possible that some incoming tokens may not be presented + to the model for NER tagging, and hence the list of predicted + tags will padded with a pseudo-tag (default 'O'). If you chose + a different pseudo-tag, you will need to re-align labels and + predictions using nerds.utils.align_lists(). + + Parameters + ---------- + X : list(list(str)) + list of list of tokens + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. + """ + if self.model_ is None or self.tokenizer_ is None: + raise ValueError("No model and/or tokenizer found, either run fit() to train or load() to load a trained model.") + + log.info("Converting data to features...") + test_feats = self._data_labels_to_features(X, None) + test_loader = self._create_dataloader(test_feats, "sequential") + + log.info("Predicting from model...") + predictions = [] + self.model_.eval() + for batch in test_loader: + batch = tuple(b.to(self._device) for b in batch) + b_input_ids, b_attention_mask, b_token_type_ids, b_ids = batch + with torch.no_grad(): + outputs = self.model_(b_input_ids, + attention_mask=b_attention_mask, + token_type_ids=b_token_type_ids) + logits = outputs[0].detach().cpu().numpy() + b_pred_ids = np.argmax(logits, axis=-1) + b_ids = b_ids.detach().cpu().numpy() + b_id_min, b_id_max = b_ids[0], b_ids[-1] + b_X = X[b_id_min : b_id_max + 1] + predictions.extend(self._align_predictions(b_X, b_pred_ids)) + + return predictions + + + def save(self, dirpath): + """ Saves model and related artifacts to specified folder on disk + + Parameters + ---------- + dirpath : str + a directory where model artifacts are to be saved. Artifacts for + this NER are the HuggingFace model and tokenizer, a pickled file + containing the label-to-id and id-to-label mappings, and the NER + configuration YAML file. + + Returns + ------- + None + """ + if self.model_ is None or self.tokenizer_ is None: + raise ValueError("No model artifacts to save, either run fit() to train or load() pretrained model.") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + self.model_.save_pretrained(dirpath) + self.tokenizer_.save_pretrained(dirpath) + label_map = { + "label2id": self.label2id_, + "id2label": self.id2label_, + "special_tokens": self.special_tokens_ + } + joblib.dump(label_map, os.path.join(dirpath, "label_mappings.pkl")) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath): + """ Loads a trained model from specified folder on disk. + + Parameters + ---------- + dirpath : str + directory from which model artifacts should be loaded + + Returns + ------- + self + """ + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + + label_mappings = joblib.load(os.path.join(dirpath, "label_mappings.pkl")) + self.label2id_ = label_mappings["label2id"] + self.id2label_ = label_mappings["id2label"] + self.special_tokens_ = label_mappings["special_tokens"] + self.model_ = BertForTokenClassification.from_pretrained(dirpath, + num_labels=len(self.label2id_), + output_attentions=False, + output_hidden_states=False) + self.model_.to(self._device) + self.tokenizer_ = BertTokenizer.from_pretrained(dirpath, + do_basic_tokenize=False) + + return self + + + def _set_random_state(self, seed): + """ Set the random seed for reproducible results. + + Parameters + ---------- + seed : int + a numeric random seed. 
+ + Returns + ------- + None + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + + def _build_label_id_mappings(self, labels): + """ Build label (string) to label_id (int) mappings + + Parameters + ---------- + labels : list(list(str)) + labels as provided by the utils.load_data_and_labels() function. + + Returns + ------- + label2id, id2label + """ + label2id = {l:i for i, l in + enumerate(sorted(list(set(flatten_list(labels, strip_prefix=False)))))} + id2label = {v:k for k, v in label2id.items()} + return label2id, id2label + + + def _data_labels_to_features(self, data, labels): + """ Convert data and labels from utils.load_data_and_labels() function + to list of features needed by the HuggingFace BertForTokenClassification + object. + + Parameters + ---------- + data : list(list(str)) + list of list of input tokens + labels : list(list(str)), can be None. + list of list of input BIO tags. + + Returns + ------- + input_ids : list(list(int)) + list of zero-padded fixed length token_ids. + attention_mask : list(list(int)) + mask to avoid performing attention on padding tokens. + token_type_ids : list(list(int)) + segment token indices, all zero since we consider single sequence. + label_ids : list(list(int)) or None + list of zero-padded fixed length label_ids. Set to None if + labels parameter is None. + """ + input_ids, attention_mask, token_type_ids, label_ids = [], [], [], [] + # if labels is None (not supplied), then replace with pseudo labels + labels_supplied = True + if labels is None: + labels_supplied = False + labels = [] + for tokens in data: + labels.append(["O"] * len(tokens)) + + # input is (list(list(str)), list(list(str))) + # format of input is: [CLS] sentence [SEP] + for i, (tokens, tags) in enumerate(zip(data, labels)): + tokens_sent, tags_sent = [], [] + for token, tag in zip(tokens, tags): + subwords = self.tokenizer_.tokenize(token) + if len(subwords) == 0: + tokens_sent.append(token) + else: + tokens_sent.extend(subwords) + tags_sent.append(self.label2id_[tag]) + if len(subwords) > 1: + tags_sent.extend([self._pad_label_id] * (len(subwords) - 1)) + # if len(subwords) > 1: + # # repeat tag for all subwords following the specified word, see + # # https://github.com/google-research/bert/issues/646#issuecomment-519868110 + # if tag.startswith("B-"): + # tag = tag.replace("B-", "I-") + # tags_sent.extend([self.label2id_[tag]] * (len(subwords) - 1)) + + # truncate to max_sequence_length - 2 (account for special tokens CLS and SEP) + tokens_sent = tokens_sent[0:self.max_sequence_length - 2] + tags_sent = tags_sent[0:self.max_sequence_length - 2] + + # prepend [CLS] and append [SEP] + tokens_sent = [self.tokenizer_.cls_token] + tokens_sent + [self.tokenizer_.sep_token] + tags_sent = [self._pad_label_id] + tags_sent + [self._pad_label_id] + + # pad upto the max_sequence_length - 2 (account for special tokens CLS and SEP) + tokens_to_pad = self.max_sequence_length - len(tokens_sent) + tokens_sent.extend([self.tokenizer_.pad_token] * tokens_to_pad) + tags_sent.extend([self._pad_label_id] * tokens_to_pad) + + # feature: input_ids + input_ids.append(self.tokenizer_.convert_tokens_to_ids(tokens_sent)) + # feature: attention_mask + attention_mask.append([0 if t == self.tokenizer_.pad_token else 1 for t in tokens_sent]) + # feature: token_type_ids + token_type_ids.append([0] * self.max_sequence_length) + # feature: label_ids + label_ids.append(tags_sent) + + if self.verbose and i < 5: + 
log.info("row[{:d}].features:".format(i)) + log.info(" input_tokens:", tokens_sent) + log.info(" input_ids:", input_ids[i]) + log.info(" attention_mask:", attention_mask[i]) + log.info(" token_type_ids:", token_type_ids[i]) + log.info(" label_ids:", label_ids[i]) + + if labels_supplied: + return input_ids, attention_mask, token_type_ids, label_ids + else: + return input_ids, attention_mask, token_type_ids, None + + + def _create_dataloader(self, features, sampling): + """ Converts features to Torch DataLoader for different data splits. + + Parameters + ---------- + features : (input_ids, attention_mask, token_type_ids, label_ids) + sampling : "random" (training) or "sequential" (everything else) + + Returns + ------- + dataloader : reference to Torch DataLoader. + """ + input_ids, attention_mask, token_type_ids, label_ids = features + # convert to torch tensors + input_ids_t = torch.tensor(input_ids, dtype=torch.long) + attention_mask_t = torch.tensor(attention_mask, dtype=torch.long) + token_type_ids_t = torch.tensor(token_type_ids, dtype=torch.long) + ids_t = torch.tensor(np.arange(len(token_type_ids)), dtype=torch.long) + + # wrap tensors into dataset + if label_ids is not None: + label_ids_t = torch.tensor(label_ids, dtype=torch.long) + dataset = TensorDataset(input_ids_t, attention_mask_t, + token_type_ids_t, label_ids_t, ids_t) + else: + dataset = TensorDataset(input_ids_t, attention_mask_t, + token_type_ids_t, ids_t) + + # wrap dataset into dataloader and return dataloader + if sampling == "random": + sampler = RandomSampler(dataset) + else: + sampler = SequentialSampler(dataset) + dataloader = DataLoader(dataset, sampler=sampler, batch_size=self.batch_size) + return dataloader + + + def _align_predictions(self, data, pred_ids): + """ Align internal predictions from model that are aligned to + wordpieces with external labels that are aligned to tokens. + + Parameters + ---------- + data : list(list(str)) + list of jagged list of input tokens. + pred_ids : list(list(long)) + list of same size list of prediction ids. + + Returns + ------- + predictions : list(list(str)) + list of list of predictions aligned to input tokens + and using same class names as input labels. 
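+
+            For example (illustrative): a token such as "Gandalf" may be
+            word-piece tokenized into ["Gan", "##dal", "##f"]; the pieces
+            are merged back into one token and only the prediction for the
+            first piece is kept, so the output stays one tag per input
+            token. Tokens lost to sequence truncation are padded with
+            padding_tag.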
+ """ + data_a, preds_a = [], [] + for tokens, pred_tag_ids in zip(data, pred_ids): + tokens_x = [] + for token in tokens: + tokens_x.extend(self.tokenizer_.tokenize(token)) + tokens_r, preds_r = [], [] + for t, p in zip(tokens_x, pred_tag_ids): + if t in self.special_tokens_: + continue + if t.startswith("##"): + tokens_r[-1] = tokens_r[-1] + t[2:] + else: + tokens_r.append(t) + preds_r.append(self.id2label_[p]) + + if len(tokens_r) < len(tokens): + # pad any truncated sentences with [PAD]/O + num_pad_tokens = len(tokens) - len(tokens_r) + tokens_r.extend([self.tokenizer_.pad_token] * num_pad_tokens) + preds_r.extend([self.padding_tag] * num_pad_tokens) + + data_a.append(tokens_r) + preds_a.append(preds_r) + + return preds_a + diff --git a/nerds/models/bilstm.py b/nerds/models/bilstm.py new file mode 100644 index 0000000..187ff72 --- /dev/null +++ b/nerds/models/bilstm.py @@ -0,0 +1,218 @@ +from anago.models import BiLSTMCRF, save_model, load_model +from anago.preprocessing import IndexTransformer +from anago.trainer import Trainer +from anago.tagger import Tagger + +from keras.optimizers import Adam + +from nerds.models import NERModel +from nerds.utils import get_logger, write_param_file + +from sklearn.model_selection import train_test_split + +import os + +log = get_logger() + +class BiLstmCrfNER(NERModel): + + def __init__(self, + word_embedding_dim=100, + char_embedding_dim=25, + word_lstm_size=100, + char_lstm_size=25, + fc_dim=100, + dropout=0.5, + embeddings=None, + use_char=True, + use_crf=True, + batch_size=16, + learning_rate=0.001, + max_iter=10): + """ Construct a BiLSTM-CRF NER model. Model is augmented with character + level embeddings as well as word embeddings by default. Implementation + is provided by the Anago project. + + Parameters + ---------- + word_embedding_dim : int, optional, default 100 + word embedding dimensions. + char_embedding_dim : int, optional, default 25 + character embedding dimensions. + word_lstm_size : int, optional, default 100 + character LSTM feature extractor output dimensions. + char_lstm_size : int, optional, default 25 + word tagger LSTM output dimensions. + fc_dim : int, optional, default 100 + output fully-connected layer size. + dropout : float, optional, default 0.5 + dropout rate. + embeddings : numpy array + word embedding matrix. + use_char : bool, optional, default True + add char feature. + use_crf : bool, optional, default True + use crf as last layer. + batch_size : int, optional, default 16 + training batch size. + learning_rate : float, optional, default 0.001 + learning rate for Adam optimizer + max_iter : int + number of epochs of training + + Attributes + ---------- + preprocessor_ : reference to preprocessor + model_ : reference to generated model + trainer_ : internal reference to Anago Trainer (model) + tagger_ : internal reference to Anago Tagger (predictor) + """ + super().__init__() + self.word_embedding_dim = word_embedding_dim + self.char_embedding_dim = char_embedding_dim + self.word_lstm_size = word_lstm_size + self.char_lstm_size = char_lstm_size + self.fc_dim = fc_dim + self.dropout = dropout + self.embedding = None + self.use_char = True + self.use_crf = True + self.batch_size = batch_size + self.learning_rate = learning_rate + self.max_iter = max_iter + # populated by fit() and load(), expected by save() and transform() + self.preprocessor_ = None + self.model_ = None + self.trainer_ = None + self.tagger_ = None + + + def fit(self, X, y): + """ Trains the NER model. Input is list of list of tokens and tags. 
+ + Parameters + ---------- + X : list(list(str)) + list of list of tokens + y : list(list(str)) + list of list of BIO tags + + Returns + ------- + self + """ + log.info("Preprocessing dataset...") + self.preprocessor_ = IndexTransformer(use_char=self.use_char) + self.preprocessor_.fit(X, y) + + log.info("Building model...") + self.model_ = BiLSTMCRF( + char_embedding_dim=self.char_embedding_dim, + word_embedding_dim=self.word_embedding_dim, + char_lstm_size=self.char_lstm_size, + word_lstm_size=self.word_lstm_size, + char_vocab_size=self.preprocessor_.char_vocab_size, + word_vocab_size=self.preprocessor_.word_vocab_size, + num_labels=self.preprocessor_.label_size, + dropout=self.dropout, + use_char=self.use_char, + use_crf=self.use_crf) + self.model_, loss = self.model_.build() + optimizer = Adam(lr=self.learning_rate) + self.model_.compile(loss=loss, optimizer=optimizer) + self.model_.summary() + + log.info('Training the model...') + self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_) + + x_train, x_valid, y_train, y_valid = train_test_split(X, y, + test_size=0.1, random_state=42) + self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, + batch_size=self.batch_size, epochs=self.max_iter) + + self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_) + + return self + + + def predict(self, X): + """ Predicts using the NER model. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. + """ + if self.tagger_ is None: + raise ValueError("No tagger found, either run fit() to train or load() a trained model") + + log.info("Predicting from model...") + ypreds = [self.tagger_.predict(" ".join(x)) for x in X] + return ypreds + + + def save(self, dirpath): + """ Saves model to local disk, given a dirpath + + Parameters + ---------- + dirpath : str + a directory where model artifacts will be saved. + Model saves a weights.h5 weights file, a params.json parameter + file, and a preprocessor.pkl preprocessor file. + + Returns + ------- + None + """ + if self.model_ is None or self.preprocessor_ is None: + raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + weights_file = os.path.join(dirpath, "weights.h5") + params_file = os.path.join(dirpath, "params.json") + preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") + + save_model(self.model_, weights_file, params_file) + self.preprocessor_.save(preprocessor_file) + + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath): + """ Loads a trained model from local disk, given the dirpath + + Parameters + ---------- + dirpath : str + a directory where model artifacts are saved. 
+ + Returns + ------- + self + """ + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + + weights_file = os.path.join(dirpath, "weights.h5") + params_file = os.path.join(dirpath, "params.json") + preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") + + if not (os.path.exists(weights_file) or + os.path.exists(params_file) or + os.path.exists(preprocessor_file)): + raise ValueError("Model files may be corrupted, exiting") + + self.model_ = load_model(weights_file, params_file) + self.preprocessor_ = IndexTransformer.load(preprocessor_file) + self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_) + + return self + diff --git a/nerds/models/crf.py b/nerds/models/crf.py new file mode 100644 index 0000000..5c03f61 --- /dev/null +++ b/nerds/models/crf.py @@ -0,0 +1,268 @@ +from nerds.models import NERModel +from nerds.utils import get_logger, write_param_file + +import os +import joblib +import sklearn_crfsuite +import spacy + +log = get_logger() + + +class CrfNER(NERModel): + + def __init__(self, + max_iter=100, + c1=0.1, + c2=0.1, + featurizer=None): + """ Construct a Conditional Random Fields (CRF) based NER. Implementation + of CRF NER is provided by sklearn.crfsuite.CRF. + + Parameters + ---------- + max_iter : int, optional, default 100 + maximum number of iterations to run CRF training + c1 : float, optional, default 0.1 + L1 regularization coefficient. + c2 : float, optional, default 0.1 + L2 regularization coefficient. + featurizer : function, default None + if None, the default featurizer _sent2features() is used to convert + list of tokens for each sentence to a list of features, where each + feature is a dictionary of name-value pairs. For custom features, a + featurizer function must be provided that takes in a list of tokens + (sentence) and returns a list of features. + + Attributes + ---------- + model_ : reference to the internal sklearn_crfsuite.CRF model. + """ + super().__init__() + self.max_iter = max_iter + self.c1 = c1 + self.c2 = c2 + self.featurizer = featurizer + self._nlp = None + self.model_ = None + + + def fit(self, X, y): + """ Build feature vectors and train CRF model. Wrapper for + sklearn_crfsuite.CRF model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list + of words. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self + """ + if self.featurizer is None: + features = [self._sent2features(sent) for sent in X] + else: + features = [self.featurizer(sent) for sent in X] + + log.info("Building model...") + self.model_ = sklearn_crfsuite.CRF( + algorithm="lbfgs", + c1=self.c1, + c2=self.c2, + max_iterations=self.max_iter, + all_possible_transitions=True, + verbose=True) + + log.info("Training model...") + self.model_.fit(features, y) + + return self + + + def predict(self, X): + """ Predicts using trained CRF model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list of words. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. + """ + if self.model_ is None: + raise ValueError("CRF model not found, run fit() to train or load() pre-trained model") + + if self.featurizer is None: + features = [self._sent2features(sent) for sent in X] + else: + features = [self.featurizer(sent) for sent in X] + + return self.model_.predict(features) + + + def save(self, dirpath): + """ Save a trained CRF model at dirpath. 
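+        A model saved here can later be restored with load(), e.g.
+        (illustrative sketch, directory name is arbitrary):
+
+            CrfNER().fit(X, y).save("models/crf")
+            restored_model = CrfNER().load("models/crf")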
+ + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + None + """ + if self.model_ is None: + raise ValueError("No model to save, run fit() to train or load() pre-trained model") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + model_file = os.path.join(dirpath, "crf-model.pkl") + joblib.dump(self.model_, model_file) + + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath): + """ Load a pre-trained CRF model from dirpath. + + Parameters + ----------- + dirpath : str + path to model directory. + + Returns + -------- + self + """ + model_file = os.path.join(dirpath, "crf-model.pkl") + if not os.path.exists(model_file): + raise ValueError("No CRF model to load at {:s}, exiting.".format(model_file)) + + self.model_ = joblib.load(model_file) + return self + + + def _load_language_model(self): + return spacy.load("en") + + + def _sent2features(self, sent): + """ Converts a list of tokens to a list of features for CRF. + Each feature is a dictionary of feature name value pairs. + + Parameters + ---------- + sent : list(str)) + a list of tokens representing a sentence. + + Returns + ------- + feats : list(dict(str, obj)) + a list of features, where each feature represents a token + as a dictionary of name-value pairs. + """ + if self._nlp is None: + self._nlp = self._load_language_model() + doc = self._nlp(" ".join(sent)) + postags = [token.pos_ for token in doc] + features = [self._word2featdict(sent, postags, i) for i in range(len(sent))] + return features + + + def _word2featdict(self, sent, postags, pos): + """ Build up a default feature dictionary for each word in sentence. + The default considers a window size of 2 around each word, so it + includes word-1, word-2, word, word+1, word+2. 
For each word, we + consider: + - prefix and suffix of size 2 and 3 + - the word itself, lowercase + - is_upper, is_lower, begin with upper, is_digit + - POS tag, and POS tag prefix of size 2 + """ + # current word + word = sent[pos] + postag = postags[pos] + feat_dict = { + 'bias': 1.0, + 'word[-2]': word[-2:], + 'word[-3:]': word[-3:], + 'word.lower()': word.lower(), + 'word.isupper()': word.isupper(), + 'word.istitle()': word.istitle(), + 'word.isdigit()': word.isdigit(), + 'postag': postag, + 'postag[:2]': postag[0:2], + } + # word - 2 + if pos > 1: + prev_word2 = sent[pos - 2] + prev_postag2 = postags[pos - 2] + feat_dict.update({ + '-2:word[-2]': prev_word2[-2:], + '-2:word[-3]': prev_word2[-3:], + '-2:word.lower()': prev_word2.lower(), + '-2:word.istitle()': prev_word2.istitle(), + '-2:word.isupper()': prev_word2.isupper(), + '-2:word.isdigit()': prev_word2.isdigit(), + '-2:postag': prev_postag2, + '-2:postag[:2]': prev_postag2[:2], + }) + # word - 1 + if pos > 0: + prev_word = sent[pos - 1] + prev_postag = postags[pos - 1] + feat_dict.update({ + '-1:word[-2]': prev_word[-2:], + '-1:word[-3]': prev_word[-3:], + '-1:word.lower()': prev_word.lower(), + '-1:word.istitle()': prev_word.istitle(), + '-1:word.isupper()': prev_word.isupper(), + '-1:word.isdigit()': prev_word.isdigit(), + '-1:postag': prev_postag, + '-1:postag[:2]': prev_postag[:2], + }) + # first word + if pos == 0: + feat_dict['BOS'] = True + # word + 1 + if pos < len(sent) - 1: + next_word = sent[pos + 1] + next_postag = postags[pos + 1] + feat_dict.update({ + '+1:word[-2]': next_word[-2:], + '+1:word[-3]': next_word[-3:], + '+1:word.lower()': next_word.lower(), + '+1:word.istitle()': next_word.istitle(), + '+1:word.isupper()': next_word.isupper(), + '+1:word.isdigit()': next_word.isdigit(), + '+1:postag': next_postag, + '+1:postag[:2]': next_postag[:2], + }) + # word + 2 + if pos < len(sent) - 2: + next_word2 = sent[pos + 2] + next_postag2 = postags[pos + 2] + feat_dict.update({ + '+2:word[-2]': next_word2[-2:], + '+2:word[-3]': next_word2[-3:], + '+2:word.lower()': next_word2.lower(), + '+2:word.istitle()': next_word2.istitle(), + '+2:word.isupper()': next_word2.isupper(), + '+2:word.isdigit()': next_word2.isdigit(), + '+2:postag': next_postag2, + '+2:postag[:2]': next_postag2[:2], + }) + # last word + if pos == len(sent) - 1: + feat_dict['EOS'] = True + return feat_dict + diff --git a/nerds/models/dictionary.py b/nerds/models/dictionary.py new file mode 100644 index 0000000..d7020f5 --- /dev/null +++ b/nerds/models/dictionary.py @@ -0,0 +1,253 @@ +from nerds.models import NERModel +from nerds.utils import get_logger, spans_to_tokens, write_param_file + +import ahocorasick +import joblib +import os +import spacy + +log = get_logger() + +class DictionaryNER(NERModel): + + def __init__(self, from_dictionary=False): + """ Construct a DictionaryNER object. The DictionaryNER functions + like a gazetteer, and is based on the Aho-Corasick algorithm + implemented by the pyAhoCorasick package. + + Parameters + ---------- + from_dictionary : bool, optional, default False + if True, input is multi-word phrases representing entities, + otherwise input is potentially multi-word phrases annotated as + a sequence of (token, tag) pairs. See fit(X, y) for more + information. + + Attributes + ---------- + model_ : reference to internal pyAhoCorasick Automaton. 
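+
+        For example (illustrative sketch with made-up data): with
+        from_dictionary=True the inputs are flat, parallel lists of entity
+        phrases and their labels,
+
+            model = DictionaryNER(from_dictionary=True)
+            model.fit(["acute lymphoblastic leukemia"], ["DISEASE"])
+
+        while in the default mode fit(X, y) takes the usual lists of token
+        lists and BIO tag lists.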
+ """ + super().__init__() + self.from_dictionary = from_dictionary + self._spacy_lm = None + self.model_ = None + + + def fit(self, X, y): + """ Build dictionary of phrases of different entity types. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens or phrases. + y : list(list(str)) + list of list of labels. + + Returns + ------- + self + """ + self.model_ = ahocorasick.Automaton() + if self.from_dictionary: + for token, label in zip(X, y): + self.model_.add_word(token, (label, token)) + else: + for idx, (tokens, labels) in enumerate(zip(X, y)): + phrase_tokens, phrase_labels = self._combine_tokens(tokens, labels) + for phrase, label in zip(phrase_tokens, phrase_labels): + self.model_.add_word(phrase, (label, phrase)) + self.model_.make_automaton() + return self + + + def predict(self, X): + """ Finds matches in text from entries in the Automaton object. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + + Returns + ------- + ypred : list(list(str)) + list of list of predicted BIO tags. + """ + if self.model_ is None: + raise ValueError("No model found, use fit() to train or load() pretrained.") + + predictions = [] + for tokens in X: + sent = " ".join(tokens) + matched_phrases = [] + for end_index, (tag, phrase) in self.model_.iter(sent): + start_index = end_index - len(phrase) + 1 + # filter out spurious matches on partial words + self._add_if_not_spurious_match( + start_index, end_index, tag, sent, matched_phrases) + # remove subsumed phrases + longest_phrases = self._remove_subsumed_matches(matched_phrases, 1) + # convert longest matches to IOB format + if self._spacy_lm is None: + self._spacy_lm = self._load_language_model() + _, pred = spans_to_tokens(sent, longest_phrases, self._spacy_lm) + predictions.append(pred) + + return predictions + + + def save(self, dirpath=None): + """ Saves picked automaton object into dirpath. + + Parameters + ---------- + dirpath : str + path to directory where model will be saved + + Returns + ------- + None + """ + if self.model_ is None: + raise ValueError("No model found, use fit() to train or load() pretrained.") + + if not os.path.exists(dirpath): + os.makedirs(dirpath) + + log.info("Saving model...") + model_file = os.path.join(dirpath, "dictionary-ner.pkl") + joblib.dump(self.model_, model_file) + + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath=None): + """ Loads model from disk from dirpath. + + Parameters + ---------- + dirpath : str + path to directory where model will be retrieved. + + Returns + ------- + self + """ + model_file = os.path.join(dirpath, "dictionary-ner.pkl") + if not os.path.exists(model_file): + raise ValueError("Saved model {:s} not found.".format(model_file)) + + self.model_ = joblib.load(model_file) + return self + + + def _load_language_model(self): + return spacy.load("en") + + + def _combine_tokens(self, tokens, labels): + """ Combine consecutive word tokens for some given entity type + to create phrase tokens. + + Parameters + ---------- + tokens : list(str) + a list of tokens representing a sentence. + labels : list(str) + a list of IOB tags for sentence. + + Returns + ------- + phrases : list(str) + list of multi-word phrases. + phrase_labels : list(str) + list of phrase entity types. 
+ """ + phrases, phrase_labels = [], [] + phrase_tokens = [] + for token, label in zip(tokens, labels): + if label == "O" and len(phrase_tokens) > 0: + phrases.append(" ".join(phrase_tokens)) + phrase_labels.append(prev_label.split("-")[-1]) + phrase_tokens = [] + if label.startswith("B-"): + phrase_tokens = [token] + if label.startswith("I-"): + phrase_tokens.append(token) + prev_label = label + + if len(phrase_tokens) > 0: + phrases.append(" ".join(phrase_tokens)) + phrase_labels.append(prev_label.split("-")[-1]) + + return phrases, phrase_labels + + + def _add_if_not_spurious_match(self, start_index, end_index, tag, + sentence, matched_phrases): + """ Aho-Corasick can match across word boundaries, and often matches + parts of longer words. This function checks to make sure any + matches it reports don't do so. + + Parameters + ---------- + start_index : int + reported start index of matched phrase. + end_index : int + reported end index of matched phrase. + tag : str + the entity type. + sentence : str + the sentence in which match occurs. + matched_phrases : list(str) + list of matched phrases, updated in place by function. + + Returns + ------- + None + """ + if start_index == 0: + if end_index + 1 < len(sentence): + if sentence[end_index + 1] == " ": + matched_phrases.append((start_index, end_index + 1, tag)) + elif end_index + 1 == len(sentence): + if start_index > 0: + if sentence[start_index - 1] == " ": + matched_phrases.append((start_index, end_index + 1, tag)) + else: + if sentence[start_index - 1] == " " and sentence[end_index + 1] == " ": + matched_phrases.append((start_index, end_index + 1, tag)) + + + def _remove_subsumed_matches(self, matched_phrases, k): + """ Remove matches that are subsumed in longer matches. This ensures + that the matches reported are the longest ones. Function works as + follows -- we sort the list by longest phrase first, and then check + to see if any shorter phrases are contained within the longest one + and remove them if so. We then recursively apply this same function + to the remaining list, moving one position down for the longest + phrase to match against. Function stops when we have seen all the + phrases. + + Parameters + ---------- + matched_phrases : list((start, end, iob_tag)) + list of matched phrase tuples. + k : int + starting position. + + Returns + ------- + matched_phrases without shorter subsumed phrase tuples. 
+ """ + if k >= len(matched_phrases): + return matched_phrases + sorted_matches = sorted(matched_phrases, key=lambda x: x[1]-x[0], reverse=True) + longest_matches = sorted_matches[0:k] + ref_offsets = (longest_matches[-1][0], longest_matches[-1][1]) + for phrase in sorted_matches[k:]: + if phrase[0] >= ref_offsets[0] and phrase[1] <= ref_offsets[1]: + continue + else: + longest_matches.append(phrase) + return self._remove_subsumed_matches(longest_matches, k+1) diff --git a/nerds/models/elmo.py b/nerds/models/elmo.py new file mode 100644 index 0000000..2185ae1 --- /dev/null +++ b/nerds/models/elmo.py @@ -0,0 +1,229 @@ +from anago.utils import load_data_and_labels, load_glove, filter_embeddings +from anago.models import ELModel, save_model, load_model +from anago.preprocessing import ELMoTransformer +from anago.trainer import Trainer +from anago.tagger import Tagger + +from keras.optimizers import Adam + +from nerds.models import NERModel +from nerds.utils import get_logger, write_param_file + +from sklearn.model_selection import train_test_split + +import os + +log = get_logger() + + +class ElmoNER(NERModel): + + def __init__(self, + word_embedding_dim=100, + char_embedding_dim=25, + word_lstm_size=100, + char_lstm_size=25, + fc_dim=100, + dropout=0.5, + embeddings=None, + embeddings_file="glove.6B.100d.txt", + batch_size=16, + learning_rate=0.001, + max_iter=2): + """ Construct a ELMo based NER model. Model is similar to the BiLSTM-CRF + model except that the word embeddings are contextual, since they are + returned by a trained ELMo model. ELMo model requires an additional + embedding, which is Glove-100 by default. ELMo model is provided by + the (dev) Anago project. + + Parameters + ---------- + word_embedding_dim : int, optional, default 100 + word embedding dimensions. + char_embedding_dim : int, optional, default 25 + character embedding dimensions. + word_lstm_size: int, optional, default 100 + character LSTM feature extractor output dimensions. + char_lstm_size : int, optional, default 25 + word tagger LSTM output dimensions. + fc_dim : int, optional, default 100 + output fully-connected layer size. + dropout : float, optional, default 0.5 + dropout rate. + embeddings : numpy array + word embedding matrix. + embeddings_file : str + path to embedding file. + batch_size : int, optional, default 16 + training batch size. + learning_rate : float, optional, default 0.001 + learning rate for Adam optimizer. + max_iter : int, optional, default 2 + number of epochs of training. + + Attributes + ---------- + preprocessor_ : reference to Anago preprocessor. + model_ : reference to the internal Anago ELModel + trainer_ : reference to the internal Anago Trainer object. + tagger_ : reference to the internal Anago Tagger object. + """ + super().__init__() + self.word_embedding_dim = word_embedding_dim + self.char_embedding_dim = char_embedding_dim + self.word_lstm_size = word_lstm_size + self.char_lstm_size = char_lstm_size + self.fc_dim = fc_dim + self.dropout = dropout + self.embeddings = embeddings + self.embeddings_file = embeddings_file + self.batch_size = batch_size + self.learning_rate = learning_rate + self.max_iter = max_iter + # populated by fit() and load(), expected by save() and transform() + self.preprocessor_ = None + self.model_ = None + self.trainer_ = None + self.tagger_ = None + + + def fit(self, X, y): + """ Trains the NER model. Input is list of AnnotatedDocuments. 
+
+        Parameters
+        ----------
+        X : list(list(str))
+            list of list of tokens
+        y : list(list(str))
+            list of list of BIO tags
+
+        Returns
+        -------
+        self
+        """
+        if self.embeddings is None and self.embeddings_file is None:
+            raise ValueError("Either embeddings or embeddings_file should be provided, exiting.")
+
+        log.info("Preprocessing dataset...")
+        self.preprocessor_ = ELMoTransformer()
+        self.preprocessor_.fit(X, y)
+
+        if self.embeddings is None:
+            self.embeddings = load_glove(self.embeddings_file)
+            embeddings_dim = self.embeddings[list(self.embeddings.keys())[0]].shape[0]
+            self.embeddings = filter_embeddings(self.embeddings,
+                self.preprocessor_._word_vocab.vocab,
+                embeddings_dim)
+
+        log.info("Building model...")
+        self.model_ = ELModel(
+            char_embedding_dim=self.char_embedding_dim,
+            word_embedding_dim=self.word_embedding_dim,
+            char_lstm_size=self.char_lstm_size,
+            word_lstm_size=self.word_lstm_size,
+            char_vocab_size=self.preprocessor_.char_vocab_size,
+            word_vocab_size=self.preprocessor_.word_vocab_size,
+            num_labels=self.preprocessor_.label_size,
+            embeddings=self.embeddings,
+            dropout=self.dropout)
+
+        self.model_, loss = self.model_.build()
+        optimizer = Adam(lr=self.learning_rate)
+        self.model_.compile(loss=loss, optimizer=optimizer)
+        self.model_.summary()
+
+        log.info('Training the model...')
+        self.trainer_ = Trainer(self.model_, preprocessor=self.preprocessor_)
+
+        x_train, x_valid, y_train, y_valid = train_test_split(X, y,
+            test_size=0.1, random_state=42)
+        self.trainer_.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid,
+            batch_size=self.batch_size, epochs=self.max_iter)
+
+        self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_)
+
+        return self
+
+
+    def predict(self, X):
+        """ Predicts using the NER model.
+
+        Parameters
+        ----------
+        X : list(list(str))
+            list of list of tokens.
+
+        Returns
+        -------
+        y : list(list(str))
+            list of list of predicted BIO tags.
+        """
+        if self.tagger_ is None:
+            raise ValueError("No tagger found, either run fit() to train or load() a trained model")
+
+        log.info("Predicting from model...")
+        ypreds = [self.tagger_.predict(" ".join(x)) for x in X]
+        return ypreds
+
+
+    def save(self, dirpath):
+        """ Saves model to local disk, given a dirpath
+
+        Parameters
+        ----------
+        dirpath : str
+            a directory where model artifacts will be saved. Model saves a
+            weights.h5 weights file, a params.json parameter file, and a
+            preprocessor.pkl preprocessor file.
+
+        Returns
+        -------
+        None
+        """
+        if self.model_ is None or self.preprocessor_ is None:
+            raise ValueError("No model artifacts to save, either run fit() to train or load() a trained model")
+
+        if not os.path.exists(dirpath):
+            os.makedirs(dirpath)
+
+        weights_file = os.path.join(dirpath, "weights.h5")
+        params_file = os.path.join(dirpath, "params.json")
+        preprocessor_file = os.path.join(dirpath, "preprocessor.pkl")
+
+        save_model(self.model_, weights_file, params_file)
+        self.preprocessor_.save(preprocessor_file)
+
+        write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml"))
+
+
+    def load(self, dirpath):
+        """ Loads a trained model from local disk, given the dirpath
+
+        Parameters
+        ----------
+        dirpath : str
+            a directory where model artifacts are saved.
+ + Returns + ------- + self + """ + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + + weights_file = os.path.join(dirpath, "weights.h5") + params_file = os.path.join(dirpath, "params.json") + preprocessor_file = os.path.join(dirpath, "preprocessor.pkl") + + if not (os.path.exists(weights_file) or + os.path.exists(params_file) or + os.path.exists(preprocessor_file)): + raise ValueError("Model files may be corrupted, exiting") + + self.model_ = load_model(weights_file, params_file) + self.preprocessor_ = ELMoTransformer.load(preprocessor_file) + self.tagger_ = Tagger(self.model_, preprocessor=self.preprocessor_) + + return self + + diff --git a/nerds/models/ensemble.py b/nerds/models/ensemble.py new file mode 100644 index 0000000..1819e52 --- /dev/null +++ b/nerds/models/ensemble.py @@ -0,0 +1,184 @@ +from nerds.models import NERModel +from nerds.utils import get_logger, write_param_file + +from sklearn.preprocessing import LabelEncoder + +import joblib +import numpy as np +import os + +log = get_logger() + +class EnsembleNER(NERModel): + + def __init__(self, + estimators=[], + weights=None, + n_jobs=1, + is_pretrained=False): + """ Constructor for Voting Ensemble NER. + + Parameters + ---------- + estimators : list((str, NERModel)) + list of (name, NERModel) tuples of models in the ensemble. + weights : list(int), optional + sequence of weights to apply to predicted class labels from + each estimator. If None, then predictions from all estimators + are weighted equally. + n_jobs : int, default=1 + number of jobs to run in parallel, default is to single-thread. + -1 means to use all available resources. + is_pretrained : bool, default False + if True, estimators are assumed to be pretrained and fit() + is skipped. + + Attributes + ---------- + None + """ + super().__init__() + # these are set by fit and load, required by predict and save + self.estimators = estimators + self.weights = weights + self.n_jobs = n_jobs + self.is_pretrained=is_pretrained + + + def fit(self, X, y): + """ Train ensemble by training underlying NERModels. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self + """ + if self.estimators is None or len(self.estimators) == 0: + raise ValueError("Non-empty list of estimators required to fit ensemble.") + if self.weights is None: + self.weights = [1] * len(self.estimators) + else: + if len(self.estimators) != len(self.weights): + raise ValueError("Number of weights must correspond to number of estimators.") + + if self.is_pretrained: + return self + + fitted_estimators = joblib.Parallel(n_jobs=self.n_jobs, backend="threading")( + joblib.delayed(self._fit_estimator)(clf, X, y) + for name, clf in self.estimators) + self.estimators = [(name, fitted) for (name, clf), fitted + in zip(self.estimators, fitted_estimators)] + + return self + + + def predict(self, X): + """ + Predicts using each estimator in the ensemble, then merges the + predictions using a voting scheme given by the vote() method + (subclasses can override voting policy by overriding vote()). + + Parameters + ---------- + X : list(list(str)) + list of list of tokens to predict from. + + Returns + ------- + ypred : list(list(str)) + list of list of BIO tags predicted by model. + """ + if self.estimators is None or self.weights is None: + raise ValueError("Model not ready to predict. 
Call fit() first, or if using pre-trained models, call fit() with is_pretrained=True") + + predictions = joblib.Parallel(n_jobs=self.n_jobs, backend="threading")( + joblib.delayed(self._predict_estimator)(clf, X) + for name, clf in self.estimators) + + return self._vote(predictions) + + + def load(dirpath): + raise NotImplementedError() + + + def save(dirpath): + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def _fit_estimator(self, estimator, X, y): + fitted_estimator = estimator.fit(X, y) + return fitted_estimator + + + def _predict_estimator(self, estimator, X): + return estimator.predict(X) + + + def _vote(self, predictions): + """ + Voting mechanism (can be overriden by subclass if desired). + + Parameters + ---------- + predictions : list(list(list(str))) + List of list of list of BIO tags predicted by each NER in + the ensemble. Each NER outputs a list of list of BIO tags + where the outer list corresponds to sentences and the inner + list corresponds to tokens. + + Returns + ------- + voted_predictions : list(list(str)) + List of list of BIO tags. Output BIO tag at each position + is the one that is predicted by the majority of NERs in + the ensemble. + """ + tag2int, int2tag = self._build_label_vocab(predictions) + + best_preds = [] + for row_id in range(len(predictions[0])): + + row_preds = [] + # gather all predictions for this row + for est_id in range(len(predictions)): + sent_pred = np.array([tag2int[y] for y in predictions[est_id][row_id]]) + # weighted by weights if any + for weight in range(self.weights[est_id]): + row_preds.append(sent_pred) + + # convert to numpy matrix for performance + R = np.array(row_preds) + + # we now find the most frequent tag at each position + B = np.zeros((R.shape[1]), dtype="int32") + for col_id in range(R.shape[1]): + col = R[:, col_id] + values, indices = np.unique(col, return_inverse=True) + B[col_id] = values[np.argmax(np.bincount(indices))] + + # append the labels associated with the most frequent tags + best_preds.append([int2tag[x] for x in B.tolist()]) + + return best_preds + + + def _build_label_vocab(self, predictions): + """ build lookup table from token to int and back (for performance) """ + tags, tag2int, int2tag = [], {}, {} + label_encoder = LabelEncoder() + for est_pred in predictions: + for sent_pred in est_pred: + for tok_pred in sent_pred: + tags.append(tok_pred) + label_encoder.fit(tags) + tag2int = {t:i for i, t in enumerate(label_encoder.classes_)} + int2tag = {i:t for t, i in tag2int.items()} + return tag2int, int2tag diff --git a/nerds/models/flair.py b/nerds/models/flair.py new file mode 100644 index 0000000..067d2dd --- /dev/null +++ b/nerds/models/flair.py @@ -0,0 +1,265 @@ +import flair +import os +import torch + +from flair.data import Corpus, Sentence, Token +from flair.embeddings import (CharacterEmbeddings, TokenEmbeddings, + WordEmbeddings, StackedEmbeddings) +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +from sklearn.model_selection import train_test_split +from torch.optim import SGD, Adam + +from nerds.models import NERModel +from nerds.utils import get_logger, write_param_file + +log = get_logger() + +class FlairNER(NERModel): + + def __init__(self, + basedir, + hidden_dim=256, + embeddings=None, + use_crf=True, + use_rnn=True, + num_rnn_layers=1, + dropout=0.0, + word_dropout=0.05, + locked_dropout=0.5, + optimizer="sgd", + learning_rate=0.1, + batch_size=32, + max_iter=10): + """ Construct a FLAIR NER. 
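+        A minimal sketch (illustrative; the base directory is arbitrary
+        and X, y follow the usual token/BIO-tag list convention):
+
+            model = FlairNER("flair-models", max_iter=10)
+            model.fit(X, y)
+            y_pred = model.predict(X)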
+ + Parameters + ---------- + basedir : str + directory where all model artifacts will be written. + hidden_dim : int, optional, default 256 + dimension of RNN hidden layer. + embeddings : flair.embeddings.TokenEmbeddings, optional + if not provided, default embedding used is stacked GloVe + WordEmbeddings and CharacterEmbeddings. + use_crf : bool, default True + if True, CRF decoder layer is used in model, otherwise absent. + use_rnn : bool, default True + if True, RNN layer used after Embeddings, otherwise absent. + dropout : float, optional, default 0.0 + dropout probability. + word_dropout : float, optional, default 0.05 + word dropout probability. + locked_dropout : float, optional, default 0.5 + locked dropout probability. + optimizer : str, optional, default "sgd" + valid values are "sgd" and "adam" + learning_rate : float, optional, default 0.1 + learning rate for (SGD) optimizer. + batch_size : int, optional, default 32 + batch size to use during training. + max_iter : int, optional, default 10 + number of epochs to train. + + Attributes + ---------- + model_ : reference to the underlying flair.models.SequenceTagger model. + """ + super().__init__() + self.basedir = basedir + self.hidden_dim = hidden_dim + self.embeddings = embeddings + self.use_crf = use_crf + self.use_rnn = use_rnn + self.num_rnn_layers = num_rnn_layers + self.dropout = dropout + self.word_dropout = word_dropout + self.locked_dropout = locked_dropout + self.optimizer = optimizer + self.learning_rate = learning_rate + self.batch_size = batch_size + self.max_iter = max_iter + self.model_ = None + + + def fit(self, X, y): + """ Build feature vectors and train FLAIR model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list + of words. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self + """ + log.info("Creating FLAIR corpus...") + Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1) + sents_train = self._convert_to_flair(Xtrain, ytrain) + sents_val = self._convert_to_flair(Xval, yval) + corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus") + + tag_dict = corpus_train.make_tag_dictionary(tag_type="ner") + + if self.embeddings is None: + embedding_types = [ + WordEmbeddings("glove"), + CharacterEmbeddings() + ] + self.embeddings = StackedEmbeddings(embeddings=embedding_types) + + log.info("Building FLAIR NER...") + self.model_ = SequenceTagger(hidden_size=self.hidden_dim, + embeddings=self.embeddings, + tag_dictionary=tag_dict, + tag_type="ner", + use_crf=self.use_crf, + use_rnn=self.use_rnn, + rnn_layers=self.num_rnn_layers, + dropout=self.dropout, + word_dropout=self.word_dropout, + locked_dropout=self.locked_dropout) + + log.info("Training FLAIR NER...") + opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam + trainer = ModelTrainer(self.model_, corpus_train, opt) + trainer.train(base_path=self.basedir, + learning_rate=self.learning_rate, + mini_batch_size=self.batch_size, + max_epochs=self.max_iter) + + return self + + + def predict(self, X): + """ Predicts using trained FLAIR model. + + Parameters + ---------- + X : list(list(str)) + list of sentences. Sentences are tokenized into list of words. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. 
+ """ + if self.model_ is None: + raise ValueError("Cannot predict with empty model, run fit() to train or load() pretrained model.") + + log.info("Generating predictions...") + sents_test = self._convert_to_flair(X) + sents_pred = self.model_.predict(sents_test, + mini_batch_size=self.batch_size, + all_tag_prob=True) + _, ypred = self._convert_from_flair(sents_pred) + + return ypred + + + def save(self, dirpath): + """ Save trained FLAIR NER model at dirpath. + + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + None + """ + if self.model_ is None: + raise ValueError("Cannot save empty model, run fit() to train or load() pretrained model.") + + if not(os.path.exists(dirpath) and os.path.isdir(dirpath)): + os.makedirs(dirpath) + self.model_.save(os.path.join(dirpath, "final-model.pt")) + + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath): + """ Load a pre-trained FLAIR NER model from dirpath. + + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + self + """ + if not(os.path.exists(dirpath) and os.path.isdir(dirpath)): + raise ValueError("Model directory {:s} not found".format(dirpath)) + + if not os.path.exists(os.path.join(dirpath, "final-model.pt")): + raise ValueError("No model file in directory {:d}".format(dirpath)) + + self.model_ = SequenceTagger.load(os.path.join(dirpath, "final-model.pt")) + + return self + + + def _convert_to_flair(self, data, labels=None): + """ Convert data and labels into a list of flair.data.Sentence objects. + + Parameters + ---------- + data : list(list(str)) + list of list of tokens, each inner list represents a list of + tokens or words in sentence, and each outer list represents + a sentence. + labels : list(list(str)), can be None + list of list of NER tags corresponding to tokens in data. + + Returns + ------- + sentences : list(flair.data.Sentence) + """ + sentences = [] + if labels is None: + labels = data + use_dummy_labels = True + else: + use_dummy_labels = False + for tokens, tags in zip(data, labels): + sentence = Sentence() + for token, tag in zip(tokens, tags): + t = Token(token) + if not use_dummy_labels: + t.add_tag("ner", tag) + sentence.add_token(t) + sentences.append(sentence) + return sentences + + + def _convert_from_flair(self, sentences): + """ Convert a list of flair.data.Sentence objects to parallel lists for + data and label lists. + + Parameters + ---------- + sentences : list(flair.data.Sentence) + list of FLAIR Sentence objects populated with tag predictions. + + Returns + ------- + data : list(list(str)) + list of list of tokens. + labels : list(list(str)) + list of list of tags. + """ + data, labels = [], [] + for sentence in sentences: + tokens = [t.text for t in sentence.tokens] + tags = [t.tags["ner"].value for t in sentence.tokens] + data.append(tokens) + labels.append(tags) + return data, labels diff --git a/nerds/models/spacy.py b/nerds/models/spacy.py new file mode 100644 index 0000000..15b7491 --- /dev/null +++ b/nerds/models/spacy.py @@ -0,0 +1,217 @@ +from nerds.models import NERModel +from nerds.utils import (get_logger, write_param_file, + spans_to_tokens, tokens_to_spans) + +from spacy.util import minibatch + +import itertools +import os +import random +import spacy + +log = get_logger() + + +class SpacyNER(NERModel): + + def __init__(self, + dropout=0.1, + max_iter=20, + batch_size=32): + """ Construct a SpaCy based NER. 
The SpaCy library provides an EntityRecognizer + class to do Named Entity Recognition. + + Parameters + ---------- + dropout : float, optional, default 0.1 + rate of dropout during training between 0 and 1. + max_iter : int, optional, default 20 + number of epochs of training. + batch_size : int, optional, default 32 + batch size to use during training + + Attributes + ---------- + model_ : reference to internal SpaCy EntityRecognizer model. + """ + super().__init__() + self.dropout = dropout + self.max_iter = max_iter + self.batch_size = batch_size + self._spacy_lm = spacy.load("en") + self.model_ = None + + + def fit(self, X, y): + """ Trains the SpaCy NER model. + + Parameters + ---------- + X : list(list(str)) + list of tokenized sentences, or list of list of tokens. + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self + """ + log.info("Reformatting data to SpaCy format...") + features = [self._convert_to_spacy(tokens, labels) + for tokens, labels in zip(X, y)] + + log.info("Building SpaCy NER model...") + self.model_ = spacy.blank("en") + if "ner" not in self.model_.pipe_names: + ner = self.model_.create_pipe("ner") + self.model_.add_pipe(ner) + else: + ner = self.model_.get_pipe("ner") + + unique_labels = set() + for _, annotations in features: + for ent in annotations.get("entities"): + unique_labels.add(ent[2]) + ner.add_label(ent[2]) + + for label in list(unique_labels): + ner.add_label(label) + + log.info("Training SpaCy NER model...") + optimizer = self.model_.begin_training() + + other_pipes = [p for p in self.model_.pipe_names if p != "ner"] + with self.model_.disable_pipes(*other_pipes): + for it in range(self.max_iter): + random.shuffle(features) + losses = {} + batches = minibatch(features, size=self.batch_size) + for batch in batches: + texts, annotations = zip(*batch) + self.model_.update(texts, annotations, + sgd=optimizer, + drop=self.dropout, + losses=losses) + loss_value = losses["ner"] + log.info("Epoch: {:d} loss: {:.5f}".format(it, loss_value)) + + return self + + + def predict(self, X): + """ Predicts using trained SpaCy NER model. + + Parameters + ---------- + X : list(list(str)) + list of tokenized sentences. + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. + """ + if self.model_ is None: + raise ValueError("Cannot predict with empty model, run fit() to train or load() pretrained model.") + + log.info("Generating predictions...") + preds = [] + for sent_tokens in X: + sent = " ".join(sent_tokens) + doc = self.model_(sent) + sent_preds = self._convert_from_spacy(sent, doc.ents) + preds.append(sent_preds) + + return preds + + + def save(self, dirpath): + """ Save trained SpaCy NER model at dirpath. + + Parameters + ---------- + dirpath : str + path to model directory. + + Returns + ------- + None + """ + if self.model_ is None: + raise ValueError("Cannot save empty model, run fit() to train or load() pretrained model") + + log.info("Saving model...") + if not os.path.exists(dirpath): + os.makedirs(dirpath) + self.model_.to_disk(dirpath) + + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath): + """ Load a pre-trained SpaCy NER model from dirpath. + + Parameters + ---------- + dirpath : str + path to model directory. 
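+
+        For example (illustrative sketch, path is arbitrary):
+
+            ner = SpacyNER().load("models/spacy")
+            y_pred = ner.predict(X)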
+ + Returns + ------- + self + """ + if not os.path.exists(dirpath): + raise ValueError("Model directory {:s} not found".format(dirpath)) + + log.info("Loading model...") + self.model_ = spacy.load(dirpath) + return self + + + def _convert_to_spacy(self, tokens, labels): + """ Convert data and labels for single sentence to SpaCy specific format: + + Parameters + ---------- + tokens : list(str) + list of tokens. + labels : list(str) + list of BIO tags. + + Returns + -------- + list of tuples in SpaCy format as shown below: + ( + "The quick brown fox jumps over the lazy dog", + { + "entities": [ + (16, 19, "ANIMAL"), + (40, 43, "ANIMAL") + ] + } + ) + """ + sentence, spans = tokens_to_spans(tokens, labels, allow_multiword_spans=False) + return (sentence, {"entities": spans}) + + + def _convert_from_spacy(self, sent, entities): + """ Converts SpaCy predictions to standard form. + + Parameters + ---------- + sent : str + the sentence as a string. + entities : list(entities) + a list of SpaCy Entity(start_char, end_char, label_) objects. + + Returns + ------- + predictions : list(str) + a list of BIO tags for a single sentence. + """ + spans = [(e.start_char, e.end_char, e.label_) for e in entities] + tokens, tags = spans_to_tokens(sent, spans, self._spacy_lm, + spans_are_multiword=False) + return tags + diff --git a/nerds/models/transformer.py b/nerds/models/transformer.py new file mode 100644 index 0000000..4b8a668 --- /dev/null +++ b/nerds/models/transformer.py @@ -0,0 +1,265 @@ +import joblib +import nerds +import os +import pandas as pd +import random +import torch + +from simpletransformers.ner.ner_model import NERModel as ST_NERModel + +from nerds.models import NERModel +from nerds.utils import (flatten_list, get_logger, + write_param_file, get_labels_from_data) + +from sklearn.model_selection import train_test_split + +log = get_logger() + +class TransformerNER(NERModel): + + def __init__(self, + lang_model_family="bert", + lang_model_name="bert-base-cased", + model_dir="models", + max_sequence_length=128, + batch_size=32, + max_iter=4, + learning_rate=4e-5, + padding_tag="O", + random_state=42): + """ Construct a Transformer NER model. This is a generic front-end + NER class that can work with multiple Transformer architectures. + + Parameters + ---------- + model_dir : str, optional, default "./models" + the directory to which model artifacts will be written out to. + lang_model_family : str, optional, default "bert" + the Transformer Language Model (LM) Family to use. Following LM + families are supported - BERT, RoBERTa, DistilBERT, CamemBERT, + and XLM-RoBERTa. + lang_model_name : str, optional, default "bert-base-cased" + name of the pre-trained LM to use. + model_dir : string, optional, default "models" + directory path to folder where model artifacts will be written + max_sequence_length : int, optional, default 128 + maximum number of tokens in each input sentence. Note that + because of word-piece tokenization, this is not the actual + number of tokens, but the number of word-pieces. + batch_size : int, optional, default 32 + the batch size to use during training and prediction. + max_iter : int, optional, default 4 + the number of epochs to train the model. + learning_rate: float, optional, default 4e-5 + learning rate for Adam optimizer. + padding_tag : str, default "O" + padding tag to use when number of predicted tags is smaller + than the number of label tags because of word-piece tokenization. 
+ Default value ensures that you won't have to align, at the cost + of a drop in reported performance. You should choose a non-default + value and align using nerds.utils.align_labels_and_predictions(). + random_state : int, optional, default 42 + random state to set. + + Attributes + ---------- + model_ : reference to the SimpleTranformers NERModel object. + model_args_ : flat dictionary composed of values from constructor. + labels_ : list of labels to use in model. + """ + super().__init__() + self.model_dir = model_dir + self.lang_model_family = lang_model_family + self.lang_model_name = lang_model_name + self.max_sequence_length = max_sequence_length + self.batch_size = batch_size + self.max_iter = max_iter + self.learning_rate = learning_rate + self.padding_tag = padding_tag + self.random_state = random_state + # attributes + self.model_ = None + self.model_args_ = None + self.labels_ = None + + + def fit(self, X, y): + """ Trains the NER model. Input is list of list of tokens and tags. + + Parameters + ---------- + X : list(list(str)) + list of list of tokens + y : list(list(str)) + list of list of BIO tags. + + Returns + ------- + self + """ + self._build_model_args() + self.labels_ = get_labels_from_data(y) + self.model_ = ST_NERModel( + self.lang_model_family, + self.lang_model_name, + labels=self.labels_, + use_cuda=torch.cuda.is_available(), + args=self.model_args_) + + os.makedirs(self.model_dir, exist_ok=True) + + Xtrain, Xval, ytrain, yval = train_test_split(X, y, + test_size=0.1, random_state=self.random_state) + train_df = self._build_dataframe_from_data_labels(Xtrain, ytrain) + eval_df = self._build_dataframe_from_data_labels(Xval, yval) + self.model_.train_model(train_df, eval_df=eval_df) + return self + + + def predict(self, X): + """ Predicts using the NER model + + Parameters + ---------- + X : list(list(str)) + list of list of tokens + + Returns + ------- + y : list(list(str)) + list of list of predicted BIO tags. + """ + if self.model_ is None: + raise ValueError("No model found, either run fit() to train or load() to load a trained model.") + + predictions, _ = self.model_.predict([" ".join(toks) for toks in X]) + # predictions are list of {token:tag} dicts + predictions = [[tag for token_tag_dict in prediction + for (token, tag) in token_tag_dict.items()] + for prediction in predictions] + # handle possible truncation of prediction (and subsequent mismatch + # with labels) because of too long token list. + predictions_a = [] + for prediction, tokens in zip(predictions, X): + if len(prediction) < len(tokens): + prediction.extend( + [self.padding_tag] * (len(tokens) - len(prediction))) + predictions_a.append(prediction) + return predictions_a + + + def save(self, dirpath=None): + """ This is a no-op for this NER, model artifacts are saved automatically + after every epoch. + + Parameters + ---------- + dirpath : str, optional + directory to which the param file will be written. If not + specified, it will use the folder specified by the model's + output_dir. 
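Given the word-piece truncation behaviour described for padding_tag above, a typical evaluation pairs a non-default padding tag with nerds.utils.align_labels_and_predictions() before scoring; a hedged sketch follows (the padding tag "X" and the model directory are arbitrary choices for illustration):

# Illustrative sketch only; "X" is any tag that is not part of the label set
from nerds.models import TransformerNER
from nerds.utils import load_data_and_labels, align_labels_and_predictions

X, y = load_data_and_labels("nerds/test/data/example.iob")

ner = TransformerNER(model_dir="nerds/test/data/models",
                     max_sequence_length=128,
                     max_iter=1,
                     padding_tag="X")     # truncated positions come back as "X"
ner.fit(X, y)
y_pred = ner.predict(X)                   # tails lost to truncation are padded with "X"

# drop the padded positions from labels and predictions before computing metrics
y_true_a, y_pred_a = align_labels_and_predictions(y, y_pred, padding_tag="X")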
+ + Returns + ------- + None + """ + if self.model_ is None: + raise ValueError("No model artifacts to save, either run fit() to train or load() pretrained model.") + if dirpath is None: + self._build_model_args() + dirpath = self.model_args_["output_dir"] + attr_dict = { + "model_args": self.model_args_, + "labels": self.labels_ + } + joblib.dump(attr_dict, os.path.join(dirpath, "attr_dict.pkl")) + write_param_file(self.get_params(), os.path.join(dirpath, "params.yaml")) + + + def load(self, dirpath=None): + """ Loads a trained model from specified folder on disk. + + Parameters + ---------- + dirpath : str, optional + directory from which model artifacts should be loaded. If + not provided, uses the model_args_["output_dir]. + + Returns + ------- + self + """ + if dirpath is None: + self._build_model_args() + dirpath = self.model_args_["output_dir"] + if not os.path.exists(dirpath): + raise ValueError("Model directory not found: {:s}".format(dirpath)) + attr_dict = joblib.load(os.path.join(dirpath, "attr_dict.pkl")) + self.model_args_ = attr_dict["model_args"] + self.labels_ = attr_dict["labels"] + self.model_ = ST_NERModel(self.lang_model_family, dirpath, + args=self.model_args_, + labels=self.labels_, + use_cuda=torch.cuda.is_available()) + return self + + + def _build_model_args(self): + """ Builds the model_arg dictionary from constructor parameters. + + Parameters + ---------- + none + + Returns + ------- + none + """ + self.model_args_ = { + "output_dir": os.path.join(self.model_dir, "outputs"), + "cache_dir": os.path.join(self.model_dir, "cache"), + "fp16": False, + "fp16_opt_level": "01", + "max_seq_length": self.max_sequence_length, + "train_batch_size": self.batch_size, + "gradient_accumulation_steps": 1, + "num_train_epochs": self.max_iter, + "weight_decay": 0, + "learning_rate": self.learning_rate, + "adam_epsilon": 1e-8, + "warmup_ratio": 0.06, + "warmup_steps": 0, + "max_grad_norm": 1.0, + "eval_batch_size": self.batch_size, + "logging_steps": 50, + "save_steps": 2000, + "overwrite_output_dir": True, + "reprocess_input_data": True, + "evaluate_during_training": True, + "process_count": os.cpu_count() - 2 if os.cpu_count() > 2 else 1, + "n_gpu": torch.cuda.device_count() if torch.cuda.is_available() else 0 + } + + + def _build_dataframe_from_data_labels(self, data, labels): + """ Builds Pandas dataframe from data and labels. + + Parameters + ---------- + data : list(list(str)) + list of list of tokens + labels : list(list(str)) + list of list of tags + + Returns + ------- + Pandas DataFrame with columns (sentence_id, words, labels). + """ + columns = ["sentence_id", "words", "labels"] + recs = [] + for sid, (tokens, tags) in enumerate(zip(data, labels)): + for token, tag in zip(tokens, tags): + recs.append((sid, token, tag)) + data_df = pd.DataFrame.from_records(recs, columns=columns) + return data_df + diff --git a/nerds/test/data/example.ents b/nerds/test/data/example.ents new file mode 100644 index 0000000..b836ed9 --- /dev/null +++ b/nerds/test/data/example.ents @@ -0,0 +1,6 @@ +Pierre Vinken PER +Mr . Vinken PER +Elsevier N . V . ORG +61 years old DATE +Nov . 29 DATE +Dutch NORP diff --git a/nerds/test/data/example.iob b/nerds/test/data/example.iob new file mode 100644 index 0000000..0dea1e6 --- /dev/null +++ b/nerds/test/data/example.iob @@ -0,0 +1,38 @@ +Pierre B-PER +Vinken I-PER +, O +61 B-DATE +years I-DATE +old I-DATE +, O +will O +join O +the O +board O +as O +a O +nonexecutive O +director O +Nov B-DATE +. I-DATE +29 I-DATE +. O + +Mr B-PER +. 
I-PER +Vinken I-PER +is O +chairman O +of O +Elsevier B-ORG +N I-ORG +. I-ORG +V I-ORG +. I-ORG +, O +the O +Dutch B-NORP +publishing O +group O +. O + diff --git a/nerds/test/test_base_ner.py b/nerds/test/test_base_ner.py new file mode 100644 index 0000000..f2014d0 --- /dev/null +++ b/nerds/test/test_base_ner.py @@ -0,0 +1,30 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_raises + +from nerds.models import NERModel + +def test_fit(): + model = NERModel() + assert_raises(NotImplementedError, model.fit, [], []) + + +def test_predict(): + model = NERModel() + assert_raises(NotImplementedError, model.predict, []) + + +def test_load(): + model = NERModel() + assert_raises(NotImplementedError, model.load, "") + + +def test_save(): + model = NERModel() + assert_raises(NotImplementedError, model.save, "") + + +def test_score(): + model = NERModel() + assert_raises(NotImplementedError, model.score, [], []) diff --git a/nerds/test/test_bert_ner.py b/nerds/test/test_bert_ner.py new file mode 100644 index 0000000..a3133c5 --- /dev/null +++ b/nerds/test/test_bert_ner.py @@ -0,0 +1,22 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import BertNER +from nerds.utils import load_data_and_labels + +import numpy as np +import shutil + + +def test_bert_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = BertNER(max_iter=1) + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal") + assert_equal(len(y[0]), len(y_pred[0]), "Size of first Label and prediction must be equal") + shutil.rmtree("nerds/test/data/models") diff --git a/nerds/test/test_bilstm_ner.py b/nerds/test/test_bilstm_ner.py new file mode 100644 index 0000000..c9e4241 --- /dev/null +++ b/nerds/test/test_bilstm_ner.py @@ -0,0 +1,22 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import BiLstmCrfNER +from nerds.utils import load_data_and_labels + +import shutil + +def test_bilstm_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = BiLstmCrfNER(max_iter=1) + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + # there is not enough data to train this model properly, so decent + # asserts are unlikely to succeed. 
+ assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal.") + shutil.rmtree("nerds/test/data/models") + diff --git a/nerds/test/test_crf_ner.py b/nerds/test/test_crf_ner.py new file mode 100644 index 0000000..770e35b --- /dev/null +++ b/nerds/test/test_crf_ner.py @@ -0,0 +1,35 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import CrfNER +from nerds.utils import load_data_and_labels + +import shutil + +def test_crf_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = CrfNER() + model.fit(X, y) + model.save("nerds/test/data/models") + r_model = model.load("nerds/test/data/models") + y_pred = r_model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) + shutil.rmtree("nerds/test/data/models") + + +def test_crf_ner_with_nondefault_features(): + def my_test_featurizer(sentence): + return [{"word":token} for token in sentence] + + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = CrfNER(featurizer=my_test_featurizer) + model.fit(X, y) + y_pred = model.predict(X) + # our features are not good enough to do good predictions, so just + # check the lengths of labels vs predictions to make sure it worked + assert_equal(len(y), len(y_pred), "Number of label and predictions must be equal.") + assert_equal(len(y[0]), len(y_pred[0]), "Size of label and predictions must match (1).") + assert_equal(len(y[1]), len(y_pred[1]), "Size of label and predictions must match (2).") diff --git a/nerds/test/test_dictionary_ner.py b/nerds/test/test_dictionary_ner.py new file mode 100644 index 0000000..094714f --- /dev/null +++ b/nerds/test/test_dictionary_ner.py @@ -0,0 +1,38 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import DictionaryNER +from nerds.utils import load_data_and_labels + +import shutil + +def test_dictionary_ner_from_conll(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = DictionaryNER() + model.fit(X, y) + model.save("nerds/test/data/models") + r_model = model.load("nerds/test/data/models") + y_pred = r_model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) + shutil.rmtree("nerds/test/data/models") + + +def test_dictionary_ner_from_dict(): + # load and fit model from dictionary + xs, ys = [], [] + fdict = open("nerds/test/data/example.ents", "r") + for line in fdict: + x, y = line.strip().split('\t') + xs.append(x) + ys.append(y) + fdict.close() + model = DictionaryNER(from_dictionary=True) + model.fit(xs, ys) + # predict using example + X, y = load_data_and_labels("nerds/test/data/example.iob") + y_pred = model.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) diff --git a/nerds/test/test_dictionary_ner_model.py b/nerds/test/test_dictionary_ner_model.py deleted file mode 100644 index bffeea5..0000000 --- a/nerds/test/test_dictionary_ner_model.py +++ /dev/null @@ -1,25 +0,0 @@ -from nose.tools import assert_equal, assert_in - -from nerds.core.model.input.document import Document -from nerds.core.model.ner.dictionary import ExactMatchDictionaryNER - - -def test_ExactMatchDictionaryNER(): - document = Document(b""" - There are many publishers in the world, like - Elsevier, Springer and also Wiley""") - - dictionary_ner = ExactMatchDictionaryNER( - 
"nerds/test/data/dictionary/orgdictionary.txt", "ORGANIZATION") - annotated = dictionary_ner.transform([document]) - - annotations = annotated[0].annotations - - assert_equal( - 3, len(annotations), "Must have matched the three publishers.") - - unique_annotations = set([ann.text for ann in annotations]) - - assert_in("Elsevier", unique_annotations) - assert_in("Springer", unique_annotations) - assert_in("Wiley", unique_annotations) diff --git a/nerds/test/test_elmo_ner.py b/nerds/test/test_elmo_ner.py new file mode 100644 index 0000000..c0ae355 --- /dev/null +++ b/nerds/test/test_elmo_ner.py @@ -0,0 +1,24 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import ElmoNER +from nerds.utils import load_data_and_labels + +import numpy as np +import shutil + +def test_elmo_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + # there are 28 unique words in our "vocabulary" + embeddings = np.random.random((28, 100)) + model = ElmoNER(embeddings=embeddings, max_iter=1) + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + # there is not enough data to train this model properly, so decent + # asserts are unlikely to succeed. + assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal.") + shutil.rmtree("nerds/test/data/models") \ No newline at end of file diff --git a/nerds/test/test_ensemble_ner.py b/nerds/test/test_ensemble_ner.py new file mode 100644 index 0000000..9e51fad --- /dev/null +++ b/nerds/test/test_ensemble_ner.py @@ -0,0 +1,36 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import DictionaryNER, CrfNER, SpacyNER, EnsembleNER +from nerds.utils import load_data_and_labels + +from sklearn.ensemble import VotingClassifier + +def test_ensemble_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + estimators = [ + ("dict_ner", DictionaryNER()), + ("crf_ner", CrfNER(max_iter=1)), + ("spacy_ner", SpacyNER(max_iter=1)) + ] + model = EnsembleNER(estimators=estimators) + model.fit(X, y) + y_pred = model.predict(X) + assert_equal(len(y), len(y_pred), "Number of predicted and label documents must be same.") + assert_equal(len(y[0]), len(y_pred[0]), "Number of predicted and label tags must be same.") + + +def test_ensemble_ner_multithreaded(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + estimators = [ + ("dict_ner", DictionaryNER()), + ("crf_ner", CrfNER(max_iter=1)), + ("spacy_ner", SpacyNER(max_iter=1)) + ] + model = EnsembleNER(estimators=estimators, n_jobs=-1) + model.fit(X, y) + y_pred = model.predict(X) + assert_equal(len(y), len(y_pred), "Number of predicted and label documents must be same.") + assert_equal(len(y[0]), len(y_pred[0]), "Number of predicted and label tags must be same.") diff --git a/nerds/test/test_flair_ner.py b/nerds/test/test_flair_ner.py new file mode 100644 index 0000000..44772d8 --- /dev/null +++ b/nerds/test/test_flair_ner.py @@ -0,0 +1,22 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import FlairNER +from nerds.utils import load_data_and_labels + +import shutil + +def test_flair_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = FlairNER("nerds/test/data/models", max_iter=1) + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = 
model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + # FLAIR NER needs more data to train than provided, so pointless testing + # for prediction quality, just make sure prediction produces something sane + assert_equal(len(y), len(y_pred), "Size of Label and prediction must be equal") + assert_equal(len(y[0]), len(y_pred[0]), "Size of first Label and prediction must be equal") + shutil.rmtree("nerds/test/data/models") diff --git a/nerds/test/test_spacy_ner.py b/nerds/test/test_spacy_ner.py new file mode 100644 index 0000000..ea91464 --- /dev/null +++ b/nerds/test/test_spacy_ner.py @@ -0,0 +1,20 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import SpacyNER +from nerds.utils import load_data_and_labels + +import shutil + +def test_spacy_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = SpacyNER() + model.fit(X, y) + model.save("nerds/test/data/models") + model_r = model.load("nerds/test/data/models") + y_pred = model_r.predict(X) + assert_equal(y, y_pred, "Label and prediction must be equal") + assert_equal(1.0, model.score(X, y)) + shutil.rmtree("nerds/test/data/models") diff --git a/nerds/test/test_transformer_ner.py b/nerds/test/test_transformer_ner.py new file mode 100644 index 0000000..493d3e9 --- /dev/null +++ b/nerds/test/test_transformer_ner.py @@ -0,0 +1,22 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.models import TransformerNER +from nerds.utils import load_data_and_labels + +import numpy as np +import shutil + + +def test_bert_ner(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + model = TransformerNER(model_dir="nerds/test/data/models", max_iter=1) + model.fit(X, y) + model.save() + model_r = model.load() + y_pred = model_r.predict(X) + assert_equal(len(y), len(y_pred), "Number of labels and predictions must be equal") + assert_equal(len(y[0]), len(y_pred[0]), "Size of first Label and prediction must be equal") + shutil.rmtree("nerds/test/data/models") diff --git a/nerds/test/test_utils.py b/nerds/test/test_utils.py new file mode 100644 index 0000000..2a223ed --- /dev/null +++ b/nerds/test/test_utils.py @@ -0,0 +1,131 @@ +import warnings +warnings.filterwarnings("ignore") + +from nose.tools import assert_equal, assert_true + +from nerds.utils import * +from nerds.models import CrfNER + +import os +import spacy + +spacy_lm = spacy.load("en") + +def test_load_data_and_labels(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + assert_true(len(X) == 2, "There should be 2 sentences in X") + assert_equal(len(X), len(y), "There should be tags for 2 sentences in y") + assert_equal(len(X[0]), len(y[0]), "Number of tokens should be equal to number of tags") + + +def test_get_labels_from_data(): + X, y = load_data_and_labels("nerds/test/data/example.iob") + raw_labels = get_labels_from_data(y) + assert_equal(8, len(raw_labels), "There should be 8 unique raw labels") + class_labels = get_labels_from_data(y, strip_prefix=True) + assert_equal(5, len(class_labels), "There should be 5 unique class labels") + + +def test_write_param_file(): + model = CrfNER() + param_filepath = "nerds/test/data/crf_params.yaml" + write_param_file(model.get_params(), param_filepath) + lines = [] + with open(param_filepath, "r") as fp: + for line in fp: + lines.append(line.strip()) + assert_equal(4, len(lines)) + os.remove(param_filepath) + + +def test_flatten_and_unflatten_list(): + X, 
y = load_data_and_labels("nerds/test/data/example.iob") + yflat = flatten_list(y, strip_prefix=True) + assert_equal(36, len(yflat), "There should be 36 tags in all") + assert_equal(5, len([y for y in yflat if y == "PER"]), "There should be 5 PER tags") + y_lengths = compute_list_lengths(y) + y_unflat = unflatten_list(yflat, y_lengths) + assert_equal(len(y), len(y_unflat), "Reconstructed y (y_unflat) should be identical to y") + assert_equal(len(y[0]), len(y_unflat[0]), "Reconstructed y (y_unflat) should be identical to y") + + +def test_tokens_to_spans(): + data, labels = load_data_and_labels("nerds/test/data/example.iob") + tokens, tags = data[0], labels[0] + sentence, spans = tokens_to_spans(tokens, tags, allow_multiword_spans=True) + assert_equal( + "Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov . 29 .", + sentence, "Sentence reconstruction is incorrect") + assert_equal(3, len(spans), "Should be exactly 3 spans") + assert_equal(0, spans[0][0], "spans[0].start should be 0") + assert_equal(13, spans[0][1], "spans[0].end should be 13") + assert_equal("PER", spans[0][2], "spans[0].cls should be PER") + assert_equal(16, spans[1][0], "spans[1].start should be 16") + assert_equal(28, spans[1][1], "spans[1].end should be 28") + assert_equal("DATE", spans[1][2], "spans[1].cls should be DATE") + assert_equal(78, spans[2][0], "spans[2].start should be 78") + assert_equal(86, spans[2][1], "spans[2].end should be 86") + assert_equal("DATE", spans[2][2], "spans[2].cls should be DATE") + + +def test_tokens_to_spans_no_multiword_spans(): + data, labels = load_data_and_labels("nerds/test/data/example.iob") + tokens, tags = data[0], labels[0] + # convert to single token per span format + tags = ["O" if t == "O" else "B-" + t.split("-")[1] for t in tags] + sentence, spans = tokens_to_spans(tokens, tags, allow_multiword_spans=False) + assert_equal(8, len(spans), "Should be exactly 8 spans") + assert_equal(0, spans[0][0], "spans[0].start should be 0") + assert_equal(6, spans[0][1], "spans[0].end should be 6") + assert_equal("PER", spans[0][2], "spans[0].cls should be PER") + assert_equal(16, spans[2][0], "spans[2].start should be 16") + assert_equal(18, spans[2][1], "spans[2].end should be 18") + assert_equal("DATE", spans[2][2], "spans[2].cls should be DATE") + assert_equal(78, spans[5][0], "spans[5].start should be 78") + assert_equal(81, spans[5][1], "spans[5].end should be 81") + assert_equal("DATE", spans[5][2], "spans[5].cls should be DATE") + + +def test_spans_to_tokens(): + sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." + spans = [(0, 11, "PER"), (27, 43, "ORG"), (50, 55, "NORP")] + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True) + # reference tokens and tags for comparison + data, labels = load_data_and_labels("nerds/test/data/example.iob") + ref_tokens, ref_tags = data[1], labels[1] + assert_equal(len(tokens), len(ref_tokens), "Number of tokens should be identical") + for token, ref_token in zip(tokens, ref_tokens): + assert_equal(ref_token, token, "Tokens do not match. {:s} != {:s}".format(ref_token, token)) + assert_equal(len(tags), len(ref_tags), "Number of BIO tags should be identical") + for tag, ref_tag in zip(tags, ref_tags): + assert_equal(ref_tag, tag, "Tags do not match. {:s} != {:s}".format(ref_tag, tag)) + + +def test_spans_to_tokens_no_multiword_spans(): + sentence = "Mr . Vinken is chairman of Elsevier N . V . , the Dutch publishing group ." 
+ spans = [(0, 2, 'PER'), (3, 4, 'PER'), (5, 11, 'PER'), (27, 35, 'ORG'), (36, 37, 'ORG'), (38, 39, 'ORG'), (40, 41, 'ORG'), (42, 43, 'ORG'), (50, 55, 'NORP')] + tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=False) + ref_preds = ['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O'] + for ref_pred, pred in zip(ref_preds, tags): + assert_equal(ref_pred, pred, "Tags do not match. {:s} != {:s}".format(ref_pred, pred)) + + +def test_align_labels_and_predictions_with_padding(): + labels = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']] + preds = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'X', 'X', 'X', 'X']] + assert_equal(len(labels[0]), len(preds[0]), "Label and Prediction should have same number of tags") + labels_a, preds_a = align_labels_and_predictions(labels, preds, padding_tag="X") + print(">>>>", len(labels[0]), len(preds[0]), len(labels_a[0]), len(preds_a[0])) + assert_equal(len(labels_a[0]), len(preds_a[0]), "After padded alignment, Label and Prediction should have same number of tags") + assert_equal(len(labels_a[0]), len(labels[0]) - 4, "After padded alignment, labels should be shorter than before.") + assert_equal(len(preds_a[0]), len(preds[0]) - 4, "After padded alignment, predictions should be shorter than before.") + + +def test_align_labels_and_predictions_without_padding(): + labels = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NORP', 'O', 'O', 'O']] + preds = [['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O']] + assert_true(len(labels[0]) > len(preds[0]), "Label and Prediction should have same number of tags") + labels_a, preds_a = align_labels_and_predictions(labels, preds) + assert_equal(len(labels_a[0]), len(preds_a[0]), "After unpadded alignment, Label and Prediction should have same number of tags") + assert_equal(len(preds_a[0]), len(preds[0]), "After unpadded alignment, number of prediction tags should be unchanged.") + assert_equal(len(labels_a[0]), len(labels[0]) - 4, "After unpadded alignment, labels should be shorter.") diff --git a/nerds/utils.py b/nerds/utils.py new file mode 100644 index 0000000..7f4c302 --- /dev/null +++ b/nerds/utils.py @@ -0,0 +1,309 @@ +import anago +import itertools +import logging +import os +import yaml + + +def get_logger(log_level="DEBUG"): + # TODO: The log level should be adjusted by some kind of configuration + # file, e.g. the dev build should have DEBUG, while the release build + # should have "WARN" or higher. + f = "%(levelname)s %(asctime)s %(module)s %(filename)s: %(message)s" + logging.basicConfig(format=f) + logger = logging.getLogger(__name__) + logger.setLevel(log_level) + return logger + + +def load_data_and_labels(filepath, encoding="utf-8"): + """ Wrapper to expose anago's load_data_and_labels. Built here as + a wrapper because users of non-neural models are not expected + to be familiar with Anago. + + Parameters + ---------- + filepath : str + path to the file in BIO format to be loaded. + encoding : str, default utf-8 + a standard python encodings, see: + https://docs.python.org/2.4/lib/standard-encodings.html + + Returns + ------- + x : list(list(str)) + list of list of tokens, where list of tokens represent sentences. + y : list(str)) + list of list of tags. 
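Against the two-sentence example.iob file added in this change, the wrapper behaves as sketched below; the shapes follow test_load_data_and_labels in nerds/test/test_utils.py and the token values come straight from the sample file.

# Illustrative sketch only
from nerds.utils import load_data_and_labels

X, y = load_data_and_labels("nerds/test/data/example.iob")
print(len(X), len(y))   # 2 2 -- the sample file holds two sentences
print(X[0][:3])         # ['Pierre', 'Vinken', ','] -- first tokens of sentence one
print(y[0][:3])         # ['B-PER', 'I-PER', 'O'] -- the parallel BIO tags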
+ """ + return anago.utils.load_data_and_labels(filepath, encoding) + + +def get_labels_from_data(labels, strip_prefix=False): + unique_labels = list(set([tag for tags in labels for tag in tags])) + if strip_prefix: + unique_labels = list(set([t.split("-")[1] + if t != "O" else t for t in unique_labels])) + return sorted(unique_labels) + + +def write_param_file(param_dict, param_filepath): + """ Write configured model hyperparameters to file for documentation. + + Parameters + ---------- + param_dict : dict(str, obj) + Flat dictionary of constructor parameter names and values + generated by cls.get_params() (from sklearn.base.BaseEstimator). + param_filepath : str + Full path to the parameter file. + + Returns + ------- + None + """ + param_dirpath = os.path.dirname(param_filepath) + if not os.path.isdir(param_dirpath): + os.makedirs(param_dirpath) + with open(param_filepath, "w") as fp: + fp.write(yaml.dump(param_dict)) + + +def flatten_list(xs, strip_prefix=True): + """ Flatten label or predictions from list(list(str)) to list(str). + Flattened list can be input to scikit-learn's standard functions + to compute various metrics. + + Parameters + ---------- + xs : list(list(str)) + list of list of tags (inner list is sentence). + strip_prefix : bool + if True, remove leading I- and B-, else retain. + + Returns + ------- + xs_flat : list(str) + the flattened list. + """ + def strip_bio_prefix(label): + return label.split('-')[-1] + + if strip_prefix: + return [strip_bio_prefix(x) for x in itertools.chain.from_iterable(xs)] + else: + return [x for x in itertools.chain.from_iterable(xs)] + + +def compute_list_lengths(xs): + """ Convenience method to return a list of ints representing lengths of + inner lists in xs. Meant to be used in conjunction with flatten_list + to capture the original sentence lengths, so flattened list(str) can + be restored to list(list(str)) via unflatten_list. + + Parameters + ---------- + xs : list(list(str)) + list of list of tags. + + Returns + ------- + xs_lengths : list(int) + list of lengths of inner list. + """ + return [len(x) for x in xs] + + +def unflatten_list(xs_flat, xs_lengths): + """ Reverse operation of flatten_list. Using the flattened list and the list + of list lengths of the inner list, reconstructs original list(list(str)). + + Parameters + ---------- + xs_flat : list(str) + the flattened list. + xs_lengths : list(int) + list of inner list to group by. + + Returns + ------- + xs_unflat : list(list(str)) + original list of list(list(str)) + """ + xs_unflat = [] + start = 0 + for l in xs_lengths: + end = start + l + xs_unflat.append(xs_flat[start:end]) + start = end + return xs_unflat + + +def align_labels_and_predictions(labels, predictions, padding_tag=None): + """ Tokenizers paired with BERT-like transformer based NERs break up + tokens into word-pieces in order to minimize or eliminate [UNK] + situations. However, these word-pieces count to the max_sequence_length + specified in the NER, which may mean that predictions will have + fewer tags than labels because the last few tokens in the tokenized + input string have been cut off. This function will align the + label and prediction lists by removing these tags from the labels. + + Parameters + ---------- + labels : list(list(str)) + list of list of label tags + predictions : list(list(str)) + list of list of prediction tags + padding_tag : str, default None + special token (not part of label set) to denote padding tag. 
+ + Returns + ------- + labels, predictions : labels list aligned to predictions + """ + if len(labels) != len(predictions): + raise ValueError("Number of tag lists (for sentences) in label and prediction must match.") + + labels_a, predictions_a = [], [] + for tags_l, tags_p in zip(labels, predictions): + if padding_tag is not None: + assert(len(tags_l) == len(tags_p)) + tags_lp = [(l, p) for l, p in zip(tags_l, tags_p) + if p != padding_tag] + tags_l = [l for (l, p) in tags_lp] + tags_p = [p for (l, p) in tags_lp] + labels_a.append(tags_l) + predictions_a.append(tags_p) + else: + if len(tags_l) != len(tags_p): + labels_a.append(tags_l[0:len(tags_p)]) + predictions_a.append(tags_p) + else: + labels_a.append(tags_l) + predictions_a.append(tags_p) + + return labels_a, predictions_a + + +def tokens_to_spans(tokens, tags, allow_multiword_spans=True): + """ Convert from tokens-tags format to sentence-span format. Some NERs + use the sentence-span format, so we need to transform back and forth. + + Parameters + ---------- + tokens : list(str) + list of tokens representing single sentence. + tags : list(str) + list of tags in BIO format. + allow_multiword_spans : bool + if True, offsets for consecutive tokens of the same entity type are + merged into a single span, otherwise tokens are reported as individual + spans. + + Returns + ------- + sentence : str + the sentence as a string. + spans : list((int, int, str)) + list of spans as a 3-tuple of start position, end position, and entity + type. Note that end position is 1 beyond the actual ending position of + the token. + """ + spans = [] + curr, start, end, ent_cls = 0, None, None, None + sentence = " ".join(tokens) + if allow_multiword_spans: + for token, tag in zip(tokens, tags): + if tag == "O": + if ent_cls is not None: + spans.append((start, end, ent_cls)) + start, end, ent_cls = None, None, None + elif tag.startswith("B-"): + ent_cls = tag.split("-")[1] + start = curr + end = curr + len(token) + else: # I-xxx + end += len(token) + 1 + # advance curr + curr += len(token) + 1 + + # handle remaining span + if ent_cls is not None: + spans.append((start, end, ent_cls)) + else: + for token, tag in zip(tokens, tags): + if tag.startswith("B-") or tag.startswith("I-"): + ent_cls = tag.split("-")[1] + start = curr + end = curr + len(token) + spans.append((start, end, ent_cls)) + curr += len(token) + 1 + + return sentence, spans + + +def spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True): + """ Convert from sentence-spans format to tokens-tags format. Some NERs + use the sentence-spans format, so we need to transform back and forth. + + Parameters + ---------- + sentence : str + the sentence as a string. + spans : list((int, int, str)) + list of spans as a 3-tuple of start_position, end_position, and + entity_type. Note that end position is 1 beyond actual end position + of the token. + spacy_lm: we use SpaCy EN language model to tokenizing the sentence to + generate list of tokens. + spans_are_multiword : bool + if True, indicates that spans can be multi-word spans), so consecutive + entries of the same class should be transformed, ie. (B-x, B-x) should + become (B-x, I-x). + + Returns + ------- + tokens : list(str) + list of tokens in sentence + tags : list(str) + list of tags in BIO format. 
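A round trip through tokens_to_spans and spans_to_tokens on the first sample sentence, with the expected offsets taken from test_tokens_to_spans and test_spans_to_tokens in nerds/test/test_utils.py; the spaCy "en" model is assumed to be downloaded.

# Illustrative sketch only; offsets match nerds/test/test_utils.py
import spacy
from nerds.utils import load_data_and_labels, tokens_to_spans, spans_to_tokens

spacy_lm = spacy.load("en")
X, y = load_data_and_labels("nerds/test/data/example.iob")

sentence, spans = tokens_to_spans(X[0], y[0], allow_multiword_spans=True)
print(spans)     # [(0, 13, 'PER'), (16, 28, 'DATE'), (78, 86, 'DATE')]

tokens, tags = spans_to_tokens(sentence, spans, spacy_lm, spans_are_multiword=True)
print(tags[:4])  # ['B-PER', 'I-PER', 'O', 'B-DATE']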
+ """ + tokens, tags = [], [] + curr_start, curr_end = 0, 0 + for t in spacy_lm(sentence): + tokens.append(t.text) + curr_end = curr_start + len(t.text) + is_annotated = False + for span_start, span_end, span_cls in spans: + if curr_start == span_start: + tags.append("B-" + span_cls) + is_annotated = True + break + elif curr_start > span_start and curr_end <= span_end: + tags.append("I-" + span_cls) + is_annotated = True + break + else: + continue + if not is_annotated: + tags.append("O") + + # advance pointer across current word + curr_start += len(t.text) + # advance pointer across space if next token separated by space + if curr_start < len(sentence) and sentence[curr_start] == " ": + curr_start += 1 + + # handle consecutive class labels if spans were single word spans + if not spans_are_multiword: + prev_tag, merged_tags = None, [] + for tag in tags: + if prev_tag is None or prev_tag != tag: + merged_tags.append(tag) + else: + merged_tags.append(tag.replace("B-", "I-")) + prev_tag = tag + tags = merged_tags + + return tokens, tags + diff --git a/nerds/core/__init__.py b/nerds_orig/__init__.py similarity index 100% rename from nerds/core/__init__.py rename to nerds_orig/__init__.py diff --git a/nerds/core/model/__init__.py b/nerds_orig/core/__init__.py similarity index 100% rename from nerds/core/model/__init__.py rename to nerds_orig/core/__init__.py diff --git a/nerds/core/model/config/__init__.py b/nerds_orig/core/model/__init__.py similarity index 100% rename from nerds/core/model/config/__init__.py rename to nerds_orig/core/model/__init__.py diff --git a/nerds/core/model/evaluate/__init__.py b/nerds_orig/core/model/config/__init__.py similarity index 100% rename from nerds/core/model/evaluate/__init__.py rename to nerds_orig/core/model/config/__init__.py diff --git a/nerds/core/model/config/base.py b/nerds_orig/core/model/config/base.py similarity index 100% rename from nerds/core/model/config/base.py rename to nerds_orig/core/model/config/base.py diff --git a/nerds/core/model/config/bilstm.py b/nerds_orig/core/model/config/bilstm.py similarity index 100% rename from nerds/core/model/config/bilstm.py rename to nerds_orig/core/model/config/bilstm.py diff --git a/nerds/core/model/config/crf.py b/nerds_orig/core/model/config/crf.py similarity index 100% rename from nerds/core/model/config/crf.py rename to nerds_orig/core/model/config/crf.py diff --git a/nerds/core/model/config/ensemble.py b/nerds_orig/core/model/config/ensemble.py similarity index 100% rename from nerds/core/model/config/ensemble.py rename to nerds_orig/core/model/config/ensemble.py diff --git a/nerds/core/model/config/error.py b/nerds_orig/core/model/config/error.py similarity index 100% rename from nerds/core/model/config/error.py rename to nerds_orig/core/model/config/error.py diff --git a/nerds/core/model/config/spacy.py b/nerds_orig/core/model/config/spacy.py similarity index 100% rename from nerds/core/model/config/spacy.py rename to nerds_orig/core/model/config/spacy.py diff --git a/nerds/core/model/optimize/__init__.py b/nerds_orig/core/model/evaluate/__init__.py similarity index 100% rename from nerds/core/model/optimize/__init__.py rename to nerds_orig/core/model/evaluate/__init__.py diff --git a/nerds/core/model/evaluate/score.py b/nerds_orig/core/model/evaluate/score.py similarity index 69% rename from nerds/core/model/evaluate/score.py rename to nerds_orig/core/model/evaluate/score.py index b20a099..cdd9d9a 100644 --- a/nerds/core/model/evaluate/score.py +++ b/nerds_orig/core/model/evaluate/score.py @@ 
-63,3 +63,29 @@ def calculate_precision_recall_f1score(y_pred, y_true, entity_label=None): (precision + recall) > 0 else 0. return (precision, recall, f1_score) + + +def classification_report(y_pred, y_true, entity_labels): + """ Pretty prints a classification report based on precision, + recall, and f1-scores from `calculate_precision_recall_f1score` + for each entity label supplied and the aggregate. + + Args: + y_pred (list(AnnotatedDocument)): The predictions of an NER + model in the form of a list of annotated documents. + y_true (list(AnnotatedDocument)): The ground truth set of + annotated documents. + entity_labels (list(str)): The entity labels for which + the scores are calculated. + + Returns: + None + """ + + print(" precision recall f1-score") + for l in sorted(entity_labels): + p, r, f = calculate_precision_recall_f1score(y_pred, y_true, entity_label=l) + print("{:20s} {:.3f} {:.3f} {:.3f}".format(l, p, r, f)) + p, r, f = calculate_precision_recall_f1score(y_pred, y_true) + print("") + print("{:20s} {:.3f} {:.3f} {:.3f}".format("--all--", p, r, f)) diff --git a/nerds/core/model/evaluate/validation.py b/nerds_orig/core/model/evaluate/validation.py similarity index 100% rename from nerds/core/model/evaluate/validation.py rename to nerds_orig/core/model/evaluate/validation.py diff --git a/nerds/core/model/input/__init__.py b/nerds_orig/core/model/input/__init__.py similarity index 100% rename from nerds/core/model/input/__init__.py rename to nerds_orig/core/model/input/__init__.py diff --git a/nerds/core/model/input/annotation.py b/nerds_orig/core/model/input/annotation.py similarity index 100% rename from nerds/core/model/input/annotation.py rename to nerds_orig/core/model/input/annotation.py diff --git a/nerds/core/model/input/base.py b/nerds_orig/core/model/input/base.py similarity index 100% rename from nerds/core/model/input/base.py rename to nerds_orig/core/model/input/base.py diff --git a/nerds/core/model/input/brat.py b/nerds_orig/core/model/input/brat.py similarity index 100% rename from nerds/core/model/input/brat.py rename to nerds_orig/core/model/input/brat.py diff --git a/nerds/core/model/input/document.py b/nerds_orig/core/model/input/document.py similarity index 100% rename from nerds/core/model/input/document.py rename to nerds_orig/core/model/input/document.py diff --git a/nerds/core/model/ner/__init__.py b/nerds_orig/core/model/ner/__init__.py similarity index 80% rename from nerds/core/model/ner/__init__.py rename to nerds_orig/core/model/ner/__init__.py index a226244..84f794f 100644 --- a/nerds/core/model/ner/__init__.py +++ b/nerds_orig/core/model/ner/__init__.py @@ -2,6 +2,7 @@ from nerds.core.model.ner.bilstm import BidirectionalLSTM from nerds.core.model.ner.crf import CRF from nerds.core.model.ner.dictionary import ExactMatchDictionaryNER +from nerds.core.model.ner.dictionary import ExactMatchMultiClassDictionaryNER from nerds.core.model.ner.ensemble import NERModelEnsemble from nerds.core.model.ner.spacy import SpaCyStatisticalNER @@ -9,6 +10,7 @@ "BidirectionalLSTM", "CRF", "ExactMatchDictionaryNER", + "ExactMatchMultiClassDictionaryNER", "NERModel", "NERModelEnsemble", "SpaCyStatisticalNER" diff --git a/nerds/core/model/ner/base.py b/nerds_orig/core/model/ner/base.py similarity index 100% rename from nerds/core/model/ner/base.py rename to nerds_orig/core/model/ner/base.py diff --git a/nerds/core/model/ner/bilstm.py b/nerds_orig/core/model/ner/bilstm.py similarity index 100% rename from nerds/core/model/ner/bilstm.py rename to 
nerds_orig/core/model/ner/bilstm.py diff --git a/nerds/core/model/ner/crf.py b/nerds_orig/core/model/ner/crf.py similarity index 100% rename from nerds/core/model/ner/crf.py rename to nerds_orig/core/model/ner/crf.py diff --git a/nerds_orig/core/model/ner/dictionary.py b/nerds_orig/core/model/ner/dictionary.py new file mode 100644 index 0000000..270f358 --- /dev/null +++ b/nerds_orig/core/model/ner/dictionary.py @@ -0,0 +1,165 @@ +from os.path import isfile + +import ahocorasick + +from nerds.core.model.input.annotation import Annotation +from nerds.core.model.input.document import AnnotatedDocument +from nerds.core.model.ner.base import NERModel +from nerds.util.logging import get_logger + +log = get_logger() + + +class ExactMatchDictionaryNER(NERModel): + + def __init__(self, path_to_dictionary_file, entity_label): + super().__init__(entity_label) + self.key = "em-dict" + + if path_to_dictionary_file is not None: + self.path_to_dictionary_file = path_to_dictionary_file + self._create_automaton() + else: + # Must get a dictionary as an input! + raise Exception("No dictionary provided!") + + def _create_automaton(self): + + if not isfile(self.path_to_dictionary_file): + raise Exception("%s is not a file." % self.path_to_dictionary_file) + + # Initialize automaton. + self.automaton = ahocorasick.Automaton() + + # Index counter. + count = 0 + + # Dictionary must be one word per line. + log.debug("Started loading dictionary at {}".format( + self.path_to_dictionary_file)) + with open(self.path_to_dictionary_file, 'r') as dict_file: + for search_expr in dict_file: + search_expr = search_expr.strip() + if search_expr != "": + self.automaton.add_word(search_expr, (count, search_expr)) + count += 1 + log.debug("Successfully loaded dictionary") + + self.automaton.make_automaton() + + def transform(self, X, y=None): + """ Annotates the list of `Document` objects that are provided as + input and returns a list of `AnnotatedDocument` objects. + + In a dictionary based approach, a dictionary of keywords is used + to create a FSA which is then used to search with. See [1]. + [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm + """ + annotated_documents = [] + for document in X: + annotations = [] + doc_content_str = document.plain_text_ + for item in self.automaton.iter(doc_content_str): + end_position, (index, word) = item + + start_position = (end_position - len(word) + 1) + end_position = end_position + 1 + + annotations.append(Annotation( + word, + self.entity_label, + (start_position, end_position))) + + annotated_documents.append(AnnotatedDocument( + document.content, + annotations=annotations, + encoding=document.encoding)) + + return annotated_documents + + +class ExactMatchMultiClassDictionaryNER(NERModel): + + def __init__(self, path_to_dictionary_file): + super().__init__() + self.key = "em-dict" + + if path_to_dictionary_file is not None: + self.path_to_dictionary_file = path_to_dictionary_file + self._create_automaton() + else: + # Must get a dictionary as an input! + log.warning("No path to dictionary provided, fit() model to load") + + def _create_automaton(self): + + if not isfile(self.path_to_dictionary_file): + raise Exception("%s is not a file." % self.path_to_dictionary_file) + + # Initialize automaton. + self.automaton = ahocorasick.Automaton() + + # Dictionary must be one word per line. 
+ log.debug("Started loading dictionary at {}".format( + self.path_to_dictionary_file)) + with open(self.path_to_dictionary_file, 'r') as dict_file: + for line in dict_file: + search_expr, entity_type = line.strip().split('\t') + if search_expr != "": + self.automaton.add_word(search_expr, (entity_type, search_expr)) + log.debug("Successfully loaded dictionary") + + self.automaton.make_automaton() + + def fit(self, X, y=None): + # Initialize automaton. + self.automaton = ahocorasick.Automaton() + + # populate automaton from annotation values provided + for annotated_document in X: + for annotation in annotated_document.annotations: + search_expr = annotation.text + entity_type = annotation.label + if search_expr != "": + self.automaton.add_word(search_expr, (entity_type, search_expr)) + log.debug("Successfully loaded dictionary") + + self.automaton.make_automaton() + + def transform(self, X, y=None): + """ Annotates the list of `Document` objects that are provided as + input and returns a list of `AnnotatedDocument` objects. + + In a dictionary based approach, a dictionary of keywords is used + to create a FSA which is then used to search with. See [1]. + [1]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm + """ + annotated_documents = [] + for document in X: + annotations = [] + doc_content_str = document.plain_text_ + for item in self.automaton.iter(doc_content_str): + end_position, (label, word) = item + + start_position = (end_position - len(word) + 1) + end_position = end_position + 1 + + # Aho-Corasick matches partial strings in the input document, which + # leads to spurious matches, so we check to see that the match spans + # a full word before adding it to our list of valid annotations + if ((start_position <= 0 and doc_content_str[end_position] == " ") or + (end_position >= len(doc_content_str) and doc_content_str[start_position - 1] == " ") or + (doc_content_str[start_position - 1] == " " and doc_content_str[end_position] == " ")): + annotations.append(Annotation( + word, + label, + (start_position, end_position))) + + annotated_documents.append(AnnotatedDocument( + document.content, + annotations=annotations, + encoding=document.encoding)) + + return annotated_documents + + diff --git a/nerds/core/model/ner/ensemble.py b/nerds_orig/core/model/ner/ensemble.py similarity index 100% rename from nerds/core/model/ner/ensemble.py rename to nerds_orig/core/model/ner/ensemble.py diff --git a/nerds/core/model/ner/spacy.py b/nerds_orig/core/model/ner/spacy.py similarity index 100% rename from nerds/core/model/ner/spacy.py rename to nerds_orig/core/model/ner/spacy.py diff --git a/nerds/test/__init__.py b/nerds_orig/core/model/optimize/__init__.py similarity index 100% rename from nerds/test/__init__.py rename to nerds_orig/core/model/optimize/__init__.py diff --git a/nerds/core/model/optimize/optimizer.py b/nerds_orig/core/model/optimize/optimizer.py similarity index 100% rename from nerds/core/model/optimize/optimizer.py rename to nerds_orig/core/model/optimize/optimizer.py diff --git a/nerds/core/model/optimize/params.py b/nerds_orig/core/model/optimize/params.py similarity index 100% rename from nerds/core/model/optimize/params.py rename to nerds_orig/core/model/optimize/params.py diff --git a/nerds/core/model/output/__init__.py b/nerds_orig/core/model/output/__init__.py similarity index 100% rename from nerds/core/model/output/__init__.py rename to nerds_orig/core/model/output/__init__.py diff --git a/nerds/core/model/output/brat.py b/nerds_orig/core/model/output/brat.py 
similarity index 100% rename from nerds/core/model/output/brat.py rename to nerds_orig/core/model/output/brat.py diff --git a/nerds/util/__init__.py b/nerds_orig/test/__init__.py similarity index 100% rename from nerds/util/__init__.py rename to nerds_orig/test/__init__.py diff --git a/nerds/test/data/brat/file1.ann b/nerds_orig/test/data/brat/file1.ann similarity index 100% rename from nerds/test/data/brat/file1.ann rename to nerds_orig/test/data/brat/file1.ann diff --git a/nerds/test/data/brat/file1.txt b/nerds_orig/test/data/brat/file1.txt similarity index 100% rename from nerds/test/data/brat/file1.txt rename to nerds_orig/test/data/brat/file1.txt diff --git a/nerds/test/data/config/sample.yaml b/nerds_orig/test/data/config/sample.yaml similarity index 100% rename from nerds/test/data/config/sample.yaml rename to nerds_orig/test/data/config/sample.yaml diff --git a/nerds/test/data/config/sample_error.yaml b/nerds_orig/test/data/config/sample_error.yaml similarity index 100% rename from nerds/test/data/config/sample_error.yaml rename to nerds_orig/test/data/config/sample_error.yaml diff --git a/nerds_orig/test/data/dictionary/biodictionary.txt b/nerds_orig/test/data/dictionary/biodictionary.txt new file mode 100644 index 0000000..c332f84 --- /dev/null +++ b/nerds_orig/test/data/dictionary/biodictionary.txt @@ -0,0 +1,2 @@ +HUMARA loci DNA +purified eosinophils cell-type diff --git a/nerds/test/data/dictionary/orgdictionary.txt b/nerds_orig/test/data/dictionary/orgdictionary.txt similarity index 100% rename from nerds/test/data/dictionary/orgdictionary.txt rename to nerds_orig/test/data/dictionary/orgdictionary.txt diff --git a/nerds/test/data/not_annotated/file1.txt b/nerds_orig/test/data/not_annotated/file1.txt similarity index 100% rename from nerds/test/data/not_annotated/file1.txt rename to nerds_orig/test/data/not_annotated/file1.txt diff --git a/nerds/test/data/not_annotated/file2.txt b/nerds_orig/test/data/not_annotated/file2.txt similarity index 100% rename from nerds/test/data/not_annotated/file2.txt rename to nerds_orig/test/data/not_annotated/file2.txt diff --git a/nerds/test/test_annotation.py b/nerds_orig/test/test_annotation.py similarity index 100% rename from nerds/test/test_annotation.py rename to nerds_orig/test/test_annotation.py diff --git a/nerds/test/test_base_config.py b/nerds_orig/test/test_base_config.py similarity index 100% rename from nerds/test/test_base_config.py rename to nerds_orig/test/test_base_config.py diff --git a/nerds/test/test_base_ner_class.py b/nerds_orig/test/test_base_ner_class.py similarity index 100% rename from nerds/test/test_base_ner_class.py rename to nerds_orig/test/test_base_ner_class.py diff --git a/nerds/test/test_bilstm_ner_model.py b/nerds_orig/test/test_bilstm_ner_model.py similarity index 100% rename from nerds/test/test_bilstm_ner_model.py rename to nerds_orig/test/test_bilstm_ner_model.py diff --git a/nerds/test/test_brat.py b/nerds_orig/test/test_brat.py similarity index 100% rename from nerds/test/test_brat.py rename to nerds_orig/test/test_brat.py diff --git a/nerds/test/test_crf_ner_model.py b/nerds_orig/test/test_crf_ner_model.py similarity index 100% rename from nerds/test/test_crf_ner_model.py rename to nerds_orig/test/test_crf_ner_model.py diff --git a/nerds_orig/test/test_dictionary_ner_model.py b/nerds_orig/test/test_dictionary_ner_model.py new file mode 100644 index 0000000..db9f67b --- /dev/null +++ b/nerds_orig/test/test_dictionary_ner_model.py @@ -0,0 +1,70 @@ +from nose.tools import assert_equal, assert_in, 
assert_true + +from nerds.core.model.input.annotation import Annotation +from nerds.core.model.input.document import AnnotatedDocument, Document +from nerds.core.model.ner.dictionary import ExactMatchDictionaryNER +from nerds.core.model.ner.dictionary import ExactMatchMultiClassDictionaryNER + + +def test_ExactMatchDictionaryNER(): + document = Document(b""" + There are many publishers in the world, like + Elsevier, Springer and also Wiley""") + + dictionary_ner = ExactMatchDictionaryNER( + "nerds/test/data/dictionary/orgdictionary.txt", "ORGANIZATION") + annotated = dictionary_ner.transform([document]) + + annotations = annotated[0].annotations + + assert_equal( + 3, len(annotations), "Must have matched the three publishers.") + + unique_annotations = set([ann.text for ann in annotations]) + + assert_in("Elsevier", unique_annotations) + assert_in("Springer", unique_annotations) + assert_in("Wiley", unique_annotations) + + +def test_ExactMatchMultiClassDictionaryNER(): + document = Document(b""" + In this study , we have used the polymerase chain reaction ( PCR ) with nested + primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils + from female patients with eosinophilia . + """) + ner = ExactMatchMultiClassDictionaryNER( + "nerds/test/data/dictionary/biodictionary.txt") + annotated = ner.transform([document]) + expected_labels = ["DNA", "cell-type"] + for i, annotation in enumerate(annotated[0].annotations): + pred_text = annotation.text + pred_offsets = annotation.offset + label_text = document.plain_text_[pred_offsets[0]:pred_offsets[1]] + assert_equal(pred_text, label_text, + "predicted {:s} != label {:s}".format(pred_text, label_text)) + assert_equal(annotation.label, expected_labels[i]) + + +def test_ExactMatchMultiClassDictionaryNER2(): + documents = [ + AnnotatedDocument(b""" + In this study , we have used the polymerase chain reaction ( PCR ) with nested + primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils + from female patients with eosinophilia . 
+ """, annotations= [ + Annotation("HUMARA loci", "DNA", (139, 150)), + Annotation("purified eosinophils", "cell-type", (154, 174)) + ])] + ner = ExactMatchMultiClassDictionaryNER( + "nerds/test/data/dictionary/biodictionary.txt") + ner.fit(documents) + pred_documents = ner.transform(documents) + for i, annotation in enumerate(pred_documents[0].annotations): + pred_text = annotation.text + pred_offsets = annotation.offset + label_text = documents[0].plain_text_[pred_offsets[0]:pred_offsets[1]] + assert_equal(pred_text, label_text, + "predicted {:s} != label {:s}".format(pred_text, label_text)) + assert_equal(annotation.label, expected_labels[i]) + diff --git a/nerds/test/test_document.py b/nerds_orig/test/test_document.py similarity index 100% rename from nerds/test/test_document.py rename to nerds_orig/test/test_document.py diff --git a/nerds/test/test_ensemble.py b/nerds_orig/test/test_ensemble.py similarity index 100% rename from nerds/test/test_ensemble.py rename to nerds_orig/test/test_ensemble.py diff --git a/nerds/test/test_ensemble_config.py b/nerds_orig/test/test_ensemble_config.py similarity index 100% rename from nerds/test/test_ensemble_config.py rename to nerds_orig/test/test_ensemble_config.py diff --git a/nerds/test/test_eval_scoring.py b/nerds_orig/test/test_eval_scoring.py similarity index 100% rename from nerds/test/test_eval_scoring.py rename to nerds_orig/test/test_eval_scoring.py diff --git a/nerds/test/test_kfold_cv.py b/nerds_orig/test/test_kfold_cv.py similarity index 100% rename from nerds/test/test_kfold_cv.py rename to nerds_orig/test/test_kfold_cv.py diff --git a/nerds/test/test_ner_model_optimizer.py b/nerds_orig/test/test_ner_model_optimizer.py similarity index 100% rename from nerds/test/test_ner_model_optimizer.py rename to nerds_orig/test/test_ner_model_optimizer.py diff --git a/nerds/test/test_pipeline.py b/nerds_orig/test/test_pipeline.py similarity index 100% rename from nerds/test/test_pipeline.py rename to nerds_orig/test/test_pipeline.py diff --git a/nerds/test/test_spacy_ner_model.py b/nerds_orig/test/test_spacy_ner_model.py similarity index 100% rename from nerds/test/test_spacy_ner_model.py rename to nerds_orig/test/test_spacy_ner_model.py diff --git a/nerds/test/test_util_convert.py b/nerds_orig/test/test_util_convert.py similarity index 100% rename from nerds/test/test_util_convert.py rename to nerds_orig/test/test_util_convert.py diff --git a/nerds/test/test_util_file.py b/nerds_orig/test/test_util_file.py similarity index 100% rename from nerds/test/test_util_file.py rename to nerds_orig/test/test_util_file.py diff --git a/nerds/test/test_util_nlp.py b/nerds_orig/test/test_util_nlp.py similarity index 100% rename from nerds/test/test_util_nlp.py rename to nerds_orig/test/test_util_nlp.py diff --git a/nerds/test/test_util_string.py b/nerds_orig/test/test_util_string.py similarity index 100% rename from nerds/test/test_util_string.py rename to nerds_orig/test/test_util_string.py diff --git a/nerds_orig/util/__init__.py b/nerds_orig/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/nerds/util/convert.py b/nerds_orig/util/convert.py similarity index 79% rename from nerds/util/convert.py rename to nerds_orig/util/convert.py index d151dc1..051c210 100644 --- a/nerds/util/convert.py +++ b/nerds_orig/util/convert.py @@ -100,6 +100,7 @@ def transform_annotated_document_to_bio_format( tokens += non_tagged_tokens labels += non_tagged_labels + return tokens, labels @@ -202,6 +203,65 @@ def 
diff --git a/nerds/test/test_document.py b/nerds_orig/test/test_document.py
similarity index 100%
rename from nerds/test/test_document.py
rename to nerds_orig/test/test_document.py
diff --git a/nerds/test/test_ensemble.py b/nerds_orig/test/test_ensemble.py
similarity index 100%
rename from nerds/test/test_ensemble.py
rename to nerds_orig/test/test_ensemble.py
diff --git a/nerds/test/test_ensemble_config.py b/nerds_orig/test/test_ensemble_config.py
similarity index 100%
rename from nerds/test/test_ensemble_config.py
rename to nerds_orig/test/test_ensemble_config.py
diff --git a/nerds/test/test_eval_scoring.py b/nerds_orig/test/test_eval_scoring.py
similarity index 100%
rename from nerds/test/test_eval_scoring.py
rename to nerds_orig/test/test_eval_scoring.py
diff --git a/nerds/test/test_kfold_cv.py b/nerds_orig/test/test_kfold_cv.py
similarity index 100%
rename from nerds/test/test_kfold_cv.py
rename to nerds_orig/test/test_kfold_cv.py
diff --git a/nerds/test/test_ner_model_optimizer.py b/nerds_orig/test/test_ner_model_optimizer.py
similarity index 100%
rename from nerds/test/test_ner_model_optimizer.py
rename to nerds_orig/test/test_ner_model_optimizer.py
diff --git a/nerds/test/test_pipeline.py b/nerds_orig/test/test_pipeline.py
similarity index 100%
rename from nerds/test/test_pipeline.py
rename to nerds_orig/test/test_pipeline.py
diff --git a/nerds/test/test_spacy_ner_model.py b/nerds_orig/test/test_spacy_ner_model.py
similarity index 100%
rename from nerds/test/test_spacy_ner_model.py
rename to nerds_orig/test/test_spacy_ner_model.py
diff --git a/nerds/test/test_util_convert.py b/nerds_orig/test/test_util_convert.py
similarity index 100%
rename from nerds/test/test_util_convert.py
rename to nerds_orig/test/test_util_convert.py
diff --git a/nerds/test/test_util_file.py b/nerds_orig/test/test_util_file.py
similarity index 100%
rename from nerds/test/test_util_file.py
rename to nerds_orig/test/test_util_file.py
diff --git a/nerds/test/test_util_nlp.py b/nerds_orig/test/test_util_nlp.py
similarity index 100%
rename from nerds/test/test_util_nlp.py
rename to nerds_orig/test/test_util_nlp.py
diff --git a/nerds/test/test_util_string.py b/nerds_orig/test/test_util_string.py
similarity index 100%
rename from nerds/test/test_util_string.py
rename to nerds_orig/test/test_util_string.py
diff --git a/nerds_orig/util/__init__.py b/nerds_orig/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/nerds/util/convert.py b/nerds_orig/util/convert.py
similarity index 79%
rename from nerds/util/convert.py
rename to nerds_orig/util/convert.py
index d151dc1..051c210 100644
--- a/nerds/util/convert.py
+++ b/nerds_orig/util/convert.py
@@ -100,6 +100,7 @@ def transform_annotated_document_to_bio_format(
         tokens += non_tagged_tokens
         labels += non_tagged_labels
+
     return tokens, labels
@@ -202,6 +203,65 @@ def transform_bio_tags_to_annotated_document(tokens, bio_tags, document):
         document.content, annotations=annotations, encoding=document.encoding)
+
+def transform_annotated_documents_to_multiclass_dictionary(
+        annotated_documents, dict_filename,
+        stopwords=None, write_entity_type=True):
+    """ Convert a collection of AnnotatedDocument objects to (phrase,
+        entity_type) tuples and write them out to dict_filename.
+
+        Args:
+            annotated_documents -- collection of AnnotatedDocument objects
+            dict_filename -- path to dictionary file to create
+            stopwords -- set of phrases (usually English stopwords) that
+                should not be marked up as entities. Default = None
+                implies no stopword filtering
+            write_entity_type -- if True, writes out entities as TSV (phrase,
+                entity_type), else writes out just the phrase, one per line.
+                Former format suitable for ExactMatchMultiClassDictionaryNER,
+                latter format suitable for ExactMatchDictionaryNER.
+
+        Returns:
+            None
+    """
+
+    fdict = open(dict_filename, "w")
+    for annotated_document in annotated_documents:
+        tokens, tags = transform_annotated_document_to_bio_format(annotated_document)
+        phrase_tokens, prev_tag, already_seen_phrases = [], None, set()
+        for token, tag in zip(tokens, tags):
+            # print("token:", token, "tag:", tag)
+            if tag == "O":
+                if len(phrase_tokens) > 0:
+                    phrase = " ".join(phrase_tokens)
+                    prev_tag = prev_tag[2:]  # remove B_ and I_ prefix
+                    # print("... phrase:", phrase, "tag:", prev_tag)
+                    if phrase not in already_seen_phrases:
+                        # stopwords=None means no stopword filtering
+                        if stopwords is None or phrase not in stopwords:
+                            if write_entity_type:
+                                fdict.write("{:s}\t{:s}\n".format(phrase, prev_tag))
+                            else:
+                                fdict.write("{:s}\n".format(phrase))
+                        already_seen_phrases.add(phrase)
+                    phrase_tokens, prev_tag = [], None
+                continue
+            else:
+                phrase_tokens.append(token)
+                prev_tag = tag
+
+        if len(phrase_tokens) > 0:
+            phrase = " ".join(phrase_tokens)
+            prev_tag = prev_tag[2:]  # remove B_ and I_ prefix
+            # print("... (last) phrase:", phrase, "tag:", prev_tag)
+            if phrase not in already_seen_phrases:
+                if stopwords is None or phrase not in stopwords:
+                    if write_entity_type:
+                        fdict.write("{:s}\t{:s}\n".format(phrase, prev_tag))
+                    else:
+                        fdict.write("{:s}\n".format(phrase))
+
+    fdict.close()
+
+
 def split_annotated_documents(
         annotated_documents, splitter=document_to_sentences):
     """ Wrapper function that applies `split_annotated_document` to a
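# A sketch of the intended round trip for the helper added above: build a
# "phrase<TAB>entity_type" dictionary from annotated documents, then feed it to
# ExactMatchMultiClassDictionaryNER. Assumptions: the helper is importable from
# nerds.util.convert (the file this diff renames to nerds_orig/util/convert.py);
# the output file name and stopword set are illustrative; the fixture text and
# offsets are copied from test_ExactMatchMultiClassDictionaryNER2 above, and the
# offsets are relative to plain_text_, so they depend on the string's exact
# indentation matching that test.
from nerds.core.model.input.annotation import Annotation
from nerds.core.model.input.document import AnnotatedDocument
from nerds.core.model.ner.dictionary import ExactMatchMultiClassDictionaryNER
from nerds.util.convert import transform_annotated_documents_to_multiclass_dictionary

annotated_documents = [
    AnnotatedDocument(b"""
        In this study , we have used the polymerase chain reaction ( PCR ) with nested
        primers to analyze X-inactivation patterns of the HUMARA loci in purified eosinophils
        from female patients with eosinophilia .
        """, annotations=[
        Annotation("HUMARA loci", "DNA", (139, 150)),
        Annotation("purified eosinophils", "cell-type", (154, 174))
    ])]

# write a phrase/type TSV dictionary; passing stopwords=None would skip filtering
transform_annotated_documents_to_multiclass_dictionary(
    annotated_documents, "generated_biodictionary.txt",
    stopwords={"the", "of", "in"}, write_entity_type=True)

# the generated TSV can then seed a dictionary tagger for unseen documents
ner = ExactMatchMultiClassDictionaryNER("generated_biodictionary.txt")
predictions = ner.transform(annotated_documents)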
(last) phrase:", phrase, "tag:", prev_tag) + if phrase not in already_seen_phrases: + if stopwords is not None and phrase not in stopwords: + if write_entity_type: + fdict.write("{:s}\t{:s}\n".format(phrase, prev_tag)) + else: + fdict.write("{:s}\n".format(phrase)) + + fdict.close() + + def split_annotated_documents( annotated_documents, splitter=document_to_sentences): """ Wrapper function that applies `split_annotated_document` to a diff --git a/nerds/util/file.py b/nerds_orig/util/file.py similarity index 100% rename from nerds/util/file.py rename to nerds_orig/util/file.py diff --git a/nerds/util/logging.py b/nerds_orig/util/logging.py similarity index 100% rename from nerds/util/logging.py rename to nerds_orig/util/logging.py diff --git a/nerds/util/nlp.py b/nerds_orig/util/nlp.py similarity index 100% rename from nerds/util/nlp.py rename to nerds_orig/util/nlp.py diff --git a/nerds/util/string.py b/nerds_orig/util/string.py similarity index 100% rename from nerds/util/string.py rename to nerds_orig/util/string.py diff --git a/setup.py b/setup.py index edbc4dc..9c55883 100644 --- a/setup.py +++ b/setup.py @@ -4,14 +4,15 @@ name="nerds", author="Elsevier Content & Innovation", install_requires=[ - 'anago', + 'allennlp', + 'anago @ git+https://github.com/Hironsan/anago.git', + 'flair', 'future', 'h5py', 'hyperopt', 'joblib', 'keras', 'networkx==1.11', - 'nltk', 'numpy', 'pyahocorasick', 'pyyaml', @@ -19,8 +20,12 @@ 'scipy', 'sklearn', 'sklearn-crfsuite', - 'spacy==2.0.11', - 'tensorflow' + 'spacy', + 'tensorflow', + 'torch', + 'transformers', + 'pandas', + 'simpletransformers' ], tests_require=[ 'coverage',