diff --git a/.gitignore b/.gitignore index b6e4761..dff3c38 100644 --- a/.gitignore +++ b/.gitignore @@ -1,129 +1,10 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ +.journal +.mainlock +.wlock +.ts +.git +*.pyc .coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ +.htmlcov +.DS_Store +.idea diff --git a/code-env/python/desc.json b/code-env/python/desc.json index bc25f36..c687218 100755 --- a/code-env/python/desc.json +++ b/code-env/python/desc.json @@ -1,6 +1,8 @@ { - "acceptedPythonInterpreters": ["PYTHON27"], - "forceConda": false, - "installCorePackages": true, - "installJupyterSupport": false + "acceptedPythonInterpreters": [ + "PYTHON36" + ], + "forceConda": false, + "installCorePackages": true, + "installJupyterSupport": false } \ No newline at end of file diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt index 60dd177..622bccd 100644 --- a/code-env/python/spec/requirements.txt +++ b/code-env/python/spec/requirements.txt @@ -4,3 +4,5 @@ gensim==3.8.2 scikit-learn==0.20.4 tensorflow==1.15.2 tensorflow-hub==0.5.0 +torch==1.3.1 +transformers \ No newline at end of file diff --git a/custom-recipes/sentence-embedding-compute/recipe.json b/custom-recipes/sentence-embedding-compute/recipe.json index 7749711..428656d 100755 --- a/custom-recipes/sentence-embedding-compute/recipe.json +++ b/custom-recipes/sentence-embedding-compute/recipe.json @@ -42,7 +42,7 @@ "type": "COLUMNS", "description": "", "mandatory": true, - "columnRole":"input_dataset" + "columnRole": "input_dataset" }, { "name": "aggregation_method", @@ -84,14 +84,6 @@ "description": "Used for computing SIF weights.", "type": "DOUBLE", "defaultValue": 0.001 - }, - { - "visibilityCondition": 
"model.advanced_settings", - "name": "n_principal_components", - "label": "[SIF] Principal Components", - "description": "Number of components to remove in SIF computation.", - "type": "INT", - "defaultValue": 1 } ] -} +} \ No newline at end of file diff --git a/dist/dss-plugin-sentence-embedding-1.2.2.zip b/dist/dss-plugin-sentence-embedding-1.2.2.zip new file mode 100644 index 0000000..8d0c82c Binary files /dev/null and b/dist/dss-plugin-sentence-embedding-1.2.2.zip differ diff --git a/js/script.js b/js/script.js new file mode 100644 index 0000000..9f18307 --- /dev/null +++ b/js/script.js @@ -0,0 +1,72 @@ +var app = angular.module('modelDownloader.build', []); +var non_transformer_models = ["Word2Vec","FastText","Glove","ELMo",'USE'] + +app.controller('modelDownloaderController', function($scope) { + + $scope.$watch('config', function(nv) { + if(nv && nv.language){ + return; + } + $scope.showLanguageList=true; + $scope.showModelList=false; + $scope.showTransformersModelversion=false; + $scope.showOutputFolder=true; + $scope.showModelDescription=false; + }); + + $scope.getModels = function(){ + $scope.callPythonDo({method: "get_models"}).then(function(data){ + $scope.models = data['models'] + }); + $scope.showLanguageList=true; + $scope.showModelList=true; + $scope.showTransformersModelversion=false; + $scope.showOutputFolder=true; + $scope.showModelDescription=false; + }; + + $scope.getTransformerModelVersions = function(){ + $scope.callPythonDo({method:"get_transformer_model_versions"}).then(function(data){ + $scope.transformersModelVersions = data['transformer_model_versions']; + var model_name = data['model_name']; + if(non_transformer_models.includes(model_name)){ + $scope.showTransformersModelversion=false; + } + else{ + $scope.showTransformersModelversion=true; + } + + }); + $scope.showLanguageList=true; + $scope.showModelList=true; + $scope.showOutputFolder=true; + $scope.showModelDescription=false; + + }; + + $scope.getModelDescription = function(){ + $scope.callPythonDo({method: "get_model_description"}).then(function(data){ + $scope.modelDescription = data['model_description'] + }); + $scope.showLanguageList=true; + $scope.showModelList=true; + $scope.showTransformersModelversion=true; + $scope.showOutputFolder=true; + $scope.showModelDescription=true; + }; + + var init = function(){ + $scope.callPythonDo({method: "get_languages"}).then(function(data){ + $scope.languages = data['languages'] + }); + $scope.showLanguageList=true; + $scope.showModelList=false; + $scope.showTransformersModelversion=false; + $scope.showOutputFolder=true; + $scope.showModelDescription=false; + + }; + + init(); + +}); \ No newline at end of file diff --git a/parameter-sets/custom_proxy_config/parameter-set.json b/parameter-sets/custom_proxy_config/parameter-set.json new file mode 100644 index 0000000..914d979 --- /dev/null +++ b/parameter-sets/custom_proxy_config/parameter-set.json @@ -0,0 +1,20 @@ +{ + "meta": { + "label": "Custom Proxy Configuration", + "description": "", + "icon": "icon-powerbi" + }, + "defaultDefinableInline": true, + "defaultDefinableAtProjectLevel": true, + "pluginParams": [ + + ], + "params": [ + { + "name": "custom_proxy_config", + "label": "Custom proxy configuration", + "description": "Needs to follow Python's requests specifications.\nExample:\nhttp --> http://10.10.1.10:3128\nhttps --> http://10.10.1.10:1080", + "type": "MAP" + } + ] +} \ No newline at end of file diff --git a/plugin.json b/plugin.json index 2f7002f..cb769cc 100755 --- a/plugin.json +++ b/plugin.json @@ 
-8,7 +8,21 @@ "author": "Dataiku (Hicham El Boukkouri)", "icon": "icon-list-alt", "licenseInfo": "Apache Software License", - "url": "https://www.dataiku.com/product/plugins/sentence-embedding/", - "tags": ["NLP", "Machine Learning", "AutoML"] - } + "url": "https://www.dataiku.com/dss/plugins/info/sentence-embedding.html", + "tags": [ + "NLP", + "Machine Learning", + "AutoML" + ] + }, + "params": [ + { + "name": "proxy", + "label": "Proxy settings", + "description": "Needs to follow Python's requests specifications.\nExample:\nhttp --> http://10.10.1.10:3128\nhttps --> http://10.10.1.10:1080", + "type": "MAP", + "mandatory": true + } + ] } + diff --git a/python-lib/macro/language_dict.py b/python-lib/macro/language_dict.py new file mode 100644 index 0000000..ef66e10 --- /dev/null +++ b/python-lib/macro/language_dict.py @@ -0,0 +1,189 @@ +SUPPORTED_LANGUAGES = [ + {'label': 'Afar', 'value': 'aa'}, + {'label': 'Abkhazian', 'value': 'ab'}, + {'label': 'Afrikaans', 'value': 'af'}, + {'label': 'Akan', 'value': 'ak'}, + {'label': 'Amharic', 'value': 'am'}, + {'label': 'Arabic', 'value': 'ar'}, + {'label': 'Aragonese', 'value': 'an'}, + {'label': 'Assamese', 'value': 'as'}, + {'label': 'Avaric', 'value': 'av'}, + {'label': 'Avestan', 'value': 'ae'}, + {'label': 'Aymara', 'value': 'ay'}, + {'label': 'Azerbaijani', 'value': 'az'}, + {'label': 'Bashkir', 'value': 'ba'}, + {'label': 'Bambara', 'value': 'bm'}, + {'label': 'Belarusian', 'value': 'be'}, + {'label': 'Bengali', 'value': 'bn'}, + {'label': 'Bislama', 'value': 'bi'}, + {'label': 'Tibetan', 'value': 'bo'}, + {'label': 'Bosnian', 'value': 'bs'}, + {'label': 'Breton', 'value': 'br'}, + {'label': 'Bulgarian', 'value': 'bg'}, + {'label': 'Catalan', 'value': 'ca'}, + {'label': 'Czech', 'value': 'cs'}, + {'label': 'Chamorro', 'value': 'ch'}, + {'label': 'Chechen', 'value': 'ce'}, + {'label': 'Church Slavic', 'value': 'cu'}, + {'label': 'Chuvash', 'value': 'cv'}, + {'label': 'Cornish', 'value': 'kw'}, + {'label': 'Corsican', 'value': 'co'}, + {'label': 'Cree', 'value': 'cr'}, + {'label': 'Welsh', 'value': 'cy'}, + {'label': 'Danish', 'value': 'da'}, + {'label': 'German', 'value': 'de'}, + {'label': 'Dhivehi', 'value': 'dv'}, + {'label': 'Dzongkha', 'value': 'dz'}, + {'label': 'Modern Greek (1453-)', 'value': 'el'}, + {'label': 'English', 'value': 'en'}, + {'label': 'Esperanto', 'value': 'eo'}, + {'label': 'Estonian', 'value': 'et'}, + {'label': 'Basque', 'value': 'eu'}, + {'label': 'Ewe', 'value': 'ee'}, + {'label': 'Faroese', 'value': 'fo'}, + {'label': 'Persian', 'value': 'fa'}, + {'label': 'Fijian', 'value': 'fj'}, + {'label': 'Finnish', 'value': 'fi'}, + {'label': 'French', 'value': 'fr'}, + {'label': 'Western Frisian', 'value': 'fy'}, + {'label': 'Fulah', 'value': 'ff'}, + {'label': 'Scottish Gaelic', 'value': 'gd'}, + {'label': 'Irish', 'value': 'ga'}, + {'label': 'Galician', 'value': 'gl'}, + {'label': 'Manx', 'value': 'gv'}, + {'label': 'Guarani', 'value': 'gn'}, + {'label': 'Gujarati', 'value': 'gu'}, + {'label': 'Haitian', 'value': 'ht'}, + {'label': 'Hausa', 'value': 'ha'}, + {'label': 'Serbo-Croatian', 'value': 'sh'}, + {'label': 'Hebrew', 'value': 'he'}, + {'label': 'Herero', 'value': 'hz'}, + {'label': 'Hindi', 'value': 'hi'}, + {'label': 'Hiri Motu', 'value': 'ho'}, + {'label': 'Croatian', 'value': 'hr'}, + {'label': 'Hungarian', 'value': 'hu'}, + {'label': 'Armenian', 'value': 'hy'}, + {'label': 'Igbo', 'value': 'ig'}, + {'label': 'Ido', 'value': 'io'}, + {'label': 'Sichuan Yi', 'value': 'ii'}, + {'label': 'Inuktitut', 'value': 
'iu'}, + {'label': 'Interlingue', 'value': 'ie'}, + {'label': 'Interlingua (International Auxiliary Language Association)', + 'value': 'ia'}, + {'label': 'Indonesian', 'value': 'id'}, + {'label': 'Inupiaq', 'value': 'ik'}, + {'label': 'Icelandic', 'value': 'is'}, + {'label': 'Italian', 'value': 'it'}, + {'label': 'Javanese', 'value': 'jv'}, + {'label': 'Japanese', 'value': 'ja'}, + {'label': 'Kalaallisut', 'value': 'kl'}, + {'label': 'Kannada', 'value': 'kn'}, + {'label': 'Kashmiri', 'value': 'ks'}, + {'label': 'Georgian', 'value': 'ka'}, + {'label': 'Kanuri', 'value': 'kr'}, + {'label': 'Kazakh', 'value': 'kk'}, + {'label': 'Khmer', 'value': 'km'}, + {'label': 'Kikuyu', 'value': 'ki'}, + {'label': 'Kinyarwanda', 'value': 'rw'}, + {'label': 'Kirghiz', 'value': 'ky'}, + {'label': 'Komi', 'value': 'kv'}, + {'label': 'Kongo', 'value': 'kg'}, + {'label': 'Korean', 'value': 'ko'}, + {'label': 'Kuanyama', 'value': 'kj'}, + {'label': 'Kurdish', 'value': 'ku'}, + {'label': 'Lao', 'value': 'lo'}, + {'label': 'Latin', 'value': 'la'}, + {'label': 'Latvian', 'value': 'lv'}, + {'label': 'Limburgan', 'value': 'li'}, + {'label': 'Lingala', 'value': 'ln'}, + {'label': 'Lithuanian', 'value': 'lt'}, + {'label': 'Luxembourgish', 'value': 'lb'}, + {'label': 'Luba-Katanga', 'value': 'lu'}, + {'label': 'Ganda', 'value': 'lg'}, + {'label': 'Marshallese', 'value': 'mh'}, + {'label': 'Malayalam', 'value': 'ml'}, + {'label': 'Marathi', 'value': 'mr'}, + {'label': 'Macedonian', 'value': 'mk'}, + {'label': 'Malagasy', 'value': 'mg'}, + {'label': 'Maltese', 'value': 'mt'}, + {'label': 'Mongolian', 'value': 'mn'}, + {'label': 'Maori', 'value': 'mi'}, + {'label': 'Malay (macrolanguage)', 'value': 'ms'}, + {'label': 'Burmese', 'value': 'my'}, + {'label': 'Nauru', 'value': 'na'}, + {'label': 'Navajo', 'value': 'nv'}, + {'label': 'South Ndebele', 'value': 'nr'}, + {'label': 'North Ndebele', 'value': 'nd'}, + {'label': 'Ndonga', 'value': 'ng'}, + {'label': 'Nepali (macrolanguage)', 'value': 'ne'}, + {'label': 'Dutch', 'value': 'nl'}, + {'label': 'Norwegian Nynorsk', 'value': 'nn'}, + {'label': 'Norwegian Bokmål', 'value': 'nb'}, + {'label': 'Norwegian', 'value': 'no'}, + {'label': 'Nyanja', 'value': 'ny'}, + {'label': 'Occitan (post 1500)', 'value': 'oc'}, + {'label': 'Ojibwa', 'value': 'oj'}, + {'label': 'Oriya (macrolanguage)', 'value': 'or'}, + {'label': 'Oromo', 'value': 'om'}, + {'label': 'Ossetian', 'value': 'os'}, + {'label': 'Panjabi', 'value': 'pa'}, + {'label': 'Pali', 'value': 'pi'}, + {'label': 'Polish', 'value': 'pl'}, + {'label': 'Portuguese', 'value': 'pt'}, + {'label': 'Pushto', 'value': 'ps'}, + {'label': 'Quechua', 'value': 'qu'}, + {'label': 'Romansh', 'value': 'rm'}, + {'label': 'Romanian', 'value': 'ro'}, + {'label': 'Rundi', 'value': 'rn'}, + {'label': 'Russian', 'value': 'ru'}, + {'label': 'Sango', 'value': 'sg'}, + {'label': 'Sanskrit', 'value': 'sa'}, + {'label': 'Sinhala', 'value': 'si'}, + {'label': 'Slovak', 'value': 'sk'}, + {'label': 'Slovenian', 'value': 'sl'}, + {'label': 'Northern Sami', 'value': 'se'}, + {'label': 'Samoan', 'value': 'sm'}, + {'label': 'Shona', 'value': 'sn'}, + {'label': 'Sindhi', 'value': 'sd'}, + {'label': 'Somali', 'value': 'so'}, + {'label': 'Southern Sotho', 'value': 'st'}, + {'label': 'Spanish', 'value': 'es'}, + {'label': 'Albanian', 'value': 'sq'}, + {'label': 'Sardinian', 'value': 'sc'}, + {'label': 'Serbian', 'value': 'sr'}, + {'label': 'Swati', 'value': 'ss'}, + {'label': 'Sundanese', 'value': 'su'}, + {'label': 'Swahili (macrolanguage)', 'value': 'sw'}, + 
{'label': 'Swedish', 'value': 'sv'},
+    {'label': 'Tahitian', 'value': 'ty'},
+    {'label': 'Tamil', 'value': 'ta'},
+    {'label': 'Tatar', 'value': 'tt'},
+    {'label': 'Telugu', 'value': 'te'},
+    {'label': 'Tajik', 'value': 'tg'},
+    {'label': 'Tagalog', 'value': 'tl'},
+    {'label': 'Thai', 'value': 'th'},
+    {'label': 'Tigrinya', 'value': 'ti'},
+    {'label': 'Tonga (Tonga Islands)', 'value': 'to'},
+    {'label': 'Tswana', 'value': 'tn'},
+    {'label': 'Tsonga', 'value': 'ts'},
+    {'label': 'Turkmen', 'value': 'tk'},
+    {'label': 'Turkish', 'value': 'tr'},
+    {'label': 'Twi', 'value': 'tw'},
+    {'label': 'Uighur', 'value': 'ug'},
+    {'label': 'Ukrainian', 'value': 'uk'},
+    {'label': 'Urdu', 'value': 'ur'},
+    {'label': 'Uzbek', 'value': 'uz'},
+    {'label': 'Venda', 'value': 've'},
+    {'label': 'Vietnamese', 'value': 'vi'},
+    {'label': 'Volapük', 'value': 'vo'},
+    {'label': 'Walloon', 'value': 'wa'},
+    {'label': 'Wolof', 'value': 'wo'},
+    {'label': 'Xhosa', 'value': 'xh'},
+    {'label': 'Yiddish', 'value': 'yi'},
+    {'label': 'Yoruba', 'value': 'yo'},
+    {'label': 'Zhuang', 'value': 'za'},
+    {'label': 'Chinese', 'value': 'zh'},
+    {'label': 'Zulu', 'value': 'zu'},
+    {'label': 'Ancient Greek', 'value': 'grc'},
+    {'label': 'Multilingual', 'value': 'multilingual'}
+]
\ No newline at end of file
diff --git a/python-lib/macro/macro_utils.py b/python-lib/macro/macro_utils.py
new file mode 100644
index 0000000..36a5c5b
--- /dev/null
+++ b/python-lib/macro/macro_utils.py
@@ -0,0 +1,51 @@
+from macro.model_configurations import MODEL_CONFIFURATIONS
+from macro.language_dict import SUPPORTED_LANGUAGES
+
+
+def read_model_inputs(config):
+    macro_inputs = {}
+    language_label = config.get("language", None)
+    macro_inputs["language"] = lang_label_to_iso(language_label)
+
+    model_name = config.get("modelName", None)
+    model_id = [x["id"] for x in MODEL_CONFIFURATIONS.values() if x["family"] == model_name][0]
+    macro_inputs["embedding_model"] = model_id
+    macro_inputs["embedding_family"] = model_name
+
+    macro_inputs["output_folder_name"] = config.get("outputFolder", None)
+    macro_inputs["transformer_shortcut_name"] = config.get("transformersModelVersion", None)
+
+    return macro_inputs
+
+
+def is_folder_exist(project, output_folder_name):
+    managed_folders_list = [x["name"] for x in project.list_managed_folders()]
+    return output_folder_name in managed_folders_list
+
+
+def manage_model_folder(output_folder_name, project_key, client):
+    project = client.get_project(project_key)
+
+    # Reuse the managed folder if it already exists, otherwise create it
+    # (returning in both branches avoids an unbound variable when it exists)
+    for folder in project.list_managed_folders():
+        if folder["name"] == output_folder_name:
+            return project.get_managed_folder(folder["id"])
+    return project.create_managed_folder(output_folder_name)
+
+
+def lang_iso_to_label(languages_iso):
+    languages_labels = []
+    for language in languages_iso:
+        search = [x for x in SUPPORTED_LANGUAGES if x["value"] == language]
+        if search:
+            languages_labels.append(search[0]["label"])
+        else:
+            languages_labels.append(language)
+    return languages_labels
+
+
+def lang_label_to_iso(language_label):
+    search = [x for x in SUPPORTED_LANGUAGES if x["label"] == language_label]
+    if search:
+        return search[0]["value"]
+    else:
+        return language_label
\ No newline at end of file
diff --git a/python-lib/macro/model_configurations.py b/python-lib/macro/model_configurations.py
new file mode 100644
index 0000000..057d5a8
--- /dev/null
+++ b/python-lib/macro/model_configurations.py
@@ -0,0 +1,1079 @@
+MODEL_CONFIFURATIONS = {
+    "word2vec": {
+        "id": "word2vec",
+        "family": "Word2Vec",
+        "language_list": ['en','grc', 'ar', 'eu', 
'bg', 'ca', 'zh', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'gl', 'de', 'el', 'he', 'hi', 'hu', 'id', 'ga', 'it', 'ja', 'kk', 'ko', 'la', 'lv', 'nb', 'nn', 'cu', 'fa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'es', 'sv', 'tr', 'uk', 'ur', 'ug', 'vi'], + "download_info": { + "en": { + "model_link": "https://docs.google.com/uc?export=download", + "id_gdrive": "0B7XkCwpI5KDYNlNUTTlSS21pQmM" + }, + 'grc': {'model_id': 30}, + 'ar': {'model_id': 31}, + 'eu': {'model_id': 32}, + 'bg': {'model_id': 33}, + 'ca': {'model_id': 34}, + 'zh': {'model_id': 35}, + 'hr': {'model_id': 36}, + 'cs': {'model_id': 37}, + 'da': {'model_id': 38}, + 'nl': {'model_id': 39}, + 'et': {'model_id': 41}, + 'fi': {'model_id': 42}, + 'fr': {'model_id': 43}, + 'gl': {'model_id': 44}, + 'de': {'model_id': 45}, + 'el': {'model_id': 46}, + 'he': {'model_id': 47}, + 'hi': {'model_id': 48}, + 'hu': {'model_id': 49}, + 'id': {'model_id': 50}, + 'ga': {'model_id': 51}, + 'it': {'model_id': 52}, + 'ja': {'model_id': 53}, + 'kk': {'model_id': 54}, + 'ko': {'model_id': 55}, + 'la': {'model_id': 56}, + 'lv': {'model_id': 57}, + 'nb': {'model_id': 58}, + 'nn': {'model_id': 59}, + 'cu': {'model_id': 60}, + 'fa': {'model_id': 61}, + 'pl': {'model_id': 62}, + 'pt': {'model_id': 63}, + 'ro': {'model_id': 64}, + 'ru': {'model_id': 65}, + 'sk': {'model_id': 66}, + 'sl': {'model_id': 67}, + 'es': {'model_id': 68}, + 'sv': {'model_id': 69}, + 'tr': {'model_id': 70}, + 'uk': {'model_id': 71}, + 'ur': {'model_id': 72}, + 'ug': {'model_id': 73}, + 'vi': {'model_id': 74} + } + }, + + "fasttext": { + "id": "fasttext", + "family": "FastText", + "language_list": ['af', 'sq', 'am', 'ar', 'an', 'hy', 'as', 'az', 'ba', 'eu', 'be', 'bn', 'bs', 'br', 'bg', 'my', 'ca', 'ce', 'zh', 'cv', 'co', 'hr', 'cs', 'da', 'dv', 'nl', 'pa', 'en', 'eo', 'et', 'fi', 'fr', 'gl', 'ka', 'de', 'el', 'gu', 'ht', 'he', 'hi', 'hu', 'is', 'io', 'id', 'ia', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ky', 'ko', 'ku', 'la', 'lv', 'li', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'gv', 'mr', 'mn', 'ne', 'no', 'nn', 'oc', 'or', 'os', 'ps', 'fa', 'pl', 'pt', 'qu', 'ro', 'rm', 'ru', 'sa', 'sc', 'gd', 'sr', 'sh', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'su', 'sw', 'sv', 'tl', 'tg', 'ta', 'tt', 'te', 'th', 'bo', 'tr', 'tk', 'uk', 'ur', 'ug', 'uz', 'vi', 'vo', 'wa', 'cy', 'fy', 'yi', 'yo'], + "download_info": { + 'af': 'af', + 'sq': 'sq', + 'am': 'am', + 'ar': 'ar', + 'an': 'an', + 'hy': 'hy', + 'as': 'as', + 'az': 'az', + 'ba': 'ba', + 'eu': 'eu', + 'be': 'be', + 'bn': 'bn', + 'bs': 'bs', + 'br': 'br', + 'bg': 'bg', + 'my': 'my', + 'ca': 'ca', + 'ce': 'ce', + 'zh': 'zh', + 'cv': 'cv', + 'co': 'co', + 'hr': 'hr', + 'cs': 'cs', + 'da': 'da', + 'dv': 'dv', + 'nl': 'nl', + 'pa': 'pa', + 'en': 'en', + 'eo': 'eo', + 'et': 'et', + 'fi': 'fi', + 'fr': 'fr', + 'gl': 'gl', + 'ka': 'ka', + 'de': 'de', + 'el': 'el', + 'gu': 'gu', + 'ht': 'ht', + 'he': 'he', + 'hi': 'hi', + 'hu': 'hu', + 'is': 'is', + 'io': 'io', + 'id': 'id', + 'ia': 'ia', + 'ga': 'ga', + 'it': 'it', + 'ja': 'ja', + 'jv': 'jv', + 'kn': 'kn', + 'kk': 'kk', + 'km': 'km', + 'ky': 'ky', + 'ko': 'ko', + 'ku': 'ku', + 'la': 'la', + 'lv': 'lv', + 'li': 'li', + 'lt': 'lt', + 'lb': 'lb', + 'mk': 'mk', + 'mg': 'mg', + 'ms': 'ms', + 'ml': 'ml', + 'mt': 'mt', + 'gv': 'gv', + 'mr': 'mr', + 'mn': 'mn', + 'ne': 'ne', + 'no': 'no', + 'nn': 'nn', + 'oc': 'oc', + 'or': 'or', + 'os': 'os', + 'ps': 'ps', + 'fa': 'fa', + 'pl': 'pl', + 'pt': 'pt', + 'qu': 'qu', + 'ro': 'ro', + 'rm': 'rm', + 'ru': 'ru', + 'sa': 'sa', + 'sc': 'sc', + 'gd': 'gd', + 'sr': 'sr', + 
'sh': 'sh', + 'sd': 'sd', + 'si': 'si', + 'sk': 'sk', + 'sl': 'sl', + 'so': 'so', + 'es': 'es', + 'su': 'su', + 'sw': 'sw', + 'sv': 'sv', + 'tl': 'tl', + 'tg': 'tg', + 'ta': 'ta', + 'tt': 'tt', + 'te': 'te', + 'th': 'th', + 'bo': 'bo', + 'tr': 'tr', + 'tk': 'tk', + 'uk': 'uk', + 'ur': 'ur', + 'ug': 'ug', + 'uz': 'uz', + 'vi': 'vi', + 'vo': 'vo', + 'wa': 'wa', + 'cy': 'cy', + 'fy': 'fy', + 'yi': 'yi', + 'yo': 'yo' + } + }, + + "glove": { + "id": "glove", + "family": "Glove", + "language_list": ["en"], + "download_info": { + "en": "http://nlp.stanford.edu/data/glove.42B.300d.zip" + } + }, + + "elmo": { + "id": "elmo", + "family": "ELMo", + "language_list": ["en"], + "download_info": { + "en": "https://tfhub.dev/google/elmo/3?tf-hub-format=compressed" + } + }, + + "use": { + "id": "use", + "family": "USE", + "language_list": ["en", "multilingual"], + "download_info": { + "en": "https://tfhub.dev/google/universal-sentence-encoder/4?tf-hub-format=compressed", + "multilingual": "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed" + } + }, + + 'TurkuNLP/bert-base-finnish-cased-v1': { 'description': '12-layer, ' + '768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on cased ' + 'Finnish text.\n' + '\n' + '(see details on ' + 'turkunlp.org).', + 'download_info': {'fi': 'fi'}, + 'family': 'BERT', + 'id': 'TurkuNLP/bert-base-finnish-cased-v1', + 'language_list': ['fi']}, + 'TurkuNLP/bert-base-finnish-uncased-v1': { 'description': '12-layer, ' + '768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on ' + 'uncased Finnish ' + 'text.\n' + '\n' + '(see details on ' + 'turkunlp.org).', + 'download_info': {'fi': 'fi'}, + 'family': 'BERT', + 'id': 'TurkuNLP/bert-base-finnish-uncased-v1', + 'language_list': ['fi']}, + 'albert-base-v1': { 'description': '12 repeating layers, 128 embedding, ' + '768-hidden, 12-heads, 11M parameters\n' + 'ALBERT base model\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-base-v1', + 'language_list': ['en']}, + 'albert-base-v2': { 'description': '12 repeating layers, 128 embedding, ' + '768-hidden, 12-heads, 11M parameters\n' + 'ALBERT base model with no dropout, ' + 'additional training data and longer ' + 'training\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-base-v2', + 'language_list': ['en']}, + 'albert-large-v1': { 'description': '24 repeating layers, 128 embedding, ' + '1024-hidden, 16-heads, 17M ' + 'parameters\n' + 'ALBERT large model\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-large-v1', + 'language_list': ['en']}, + 'albert-large-v2': { 'description': '24 repeating layers, 128 embedding, ' + '1024-hidden, 16-heads, 17M ' + 'parameters\n' + 'ALBERT large model with no dropout, ' + 'additional training data and longer ' + 'training\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-large-v2', + 'language_list': ['en']}, + 'albert-xlarge-v1': { 'description': '24 repeating layers, 128 embedding, ' + '2048-hidden, 16-heads, 58M ' + 'parameters\n' + 'ALBERT xlarge model\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-xlarge-v1', + 'language_list': ['en']}, + 'albert-xlarge-v2': { 'description': '24 repeating layers, 128 embedding, ' + '2048-hidden, 16-heads, 58M ' + 'parameters\n' + 'ALBERT xlarge model with no dropout, ' + 'additional training data and longer ' + 'training\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 
'id': 'albert-xlarge-v2', + 'language_list': ['en']}, + 'albert-xxlarge-v1': { 'description': '12 repeating layer, 128 embedding, ' + '4096-hidden, 64-heads, 223M ' + 'parameters\n' + 'ALBERT xxlarge model\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-xxlarge-v1', + 'language_list': ['en']}, + 'albert-xxlarge-v2': { 'description': '12 repeating layer, 128 embedding, ' + '4096-hidden, 64-heads, 223M ' + 'parameters\n' + 'ALBERT xxlarge model with no ' + 'dropout, additional training data ' + 'and longer training\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'ALBERT', + 'id': 'albert-xxlarge-v2', + 'language_list': ['en']}, + 'allenai/longformer-base-4096': { 'description': '12-layer, 768-hidden, ' + '12-heads, ~149M ' + 'parameters\n' + 'Starting from ' + 'RoBERTa-base checkpoint, ' + 'trained on documents of ' + 'max length 4,096', + 'download_info': {'en': 'en'}, + 'family': 'Longformer', + 'id': 'allenai/longformer-base-4096', + 'language_list': ['en']}, + 'allenai/longformer-large-4096': { 'description': '24-layer, 1024-hidden, ' + '16-heads, ~435M ' + 'parameters\n' + 'Starting from ' + 'RoBERTa-large ' + 'checkpoint, trained on ' + 'documents of max length ' + '4,096', + 'download_info': {'en': 'en'}, + 'family': 'Longformer', + 'id': 'allenai/longformer-large-4096', + 'language_list': ['en']}, + 'bert-base-cased': { 'description': '12-layer, 768-hidden, 12-heads, 110M ' + 'parameters.\n' + 'Trained on cased English text.', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-base-cased', + 'language_list': ['en']}, + 'bert-base-cased-finetuned-mrpc': { 'description': '12-layer, 768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'The bert-base-cased ' + 'model fine-tuned on ' + 'MRPC\n' + '\n' + '(see details of ' + 'fine-tuning in the ' + 'example section)', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-base-cased-finetuned-mrpc', + 'language_list': ['en']}, + 'bert-base-chinese': { 'description': '12-layer, 768-hidden, 12-heads, ' + '110M parameters.\n' + 'Trained on cased Chinese Simplified ' + 'and Traditional text.', + 'download_info': {'zh': 'zh'}, + 'family': 'BERT', + 'id': 'bert-base-chinese', + 'language_list': ['zh']}, + 'bert-base-german-cased': { 'description': '12-layer, 768-hidden, ' + '12-heads, 110M parameters.\n' + 'Trained on cased German text ' + 'by Deepset.ai\n' + '\n' + '(see details on deepset.ai ' + 'website).', + 'download_info': {'de': 'de'}, + 'family': 'BERT', + 'id': 'bert-base-german-cased', + 'language_list': ['de']}, + 'bert-base-german-dbmdz-cased': { 'description': '12-layer, 768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on cased German ' + 'text by DBMDZ\n' + '\n' + '(see details on dbmdz ' + 'repository).', + 'download_info': {'de': 'de'}, + 'family': 'BERT', + 'id': 'bert-base-german-dbmdz-cased', + 'language_list': ['de']}, + 'bert-base-german-dbmdz-uncased': { 'description': '12-layer, 768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on uncased ' + 'German text by DBMDZ\n' + '\n' + '(see details on dbmdz ' + 'repository).', + 'download_info': {'de': 'de'}, + 'family': 'BERT', + 'id': 'bert-base-german-dbmdz-uncased', + 'language_list': ['de']}, + 'bert-base-multilingual-cased': { 'description': '(New, recommended) ' + '12-layer, 768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on cased text in ' + 'the top 104 languages ' + 'with the largest ' + 'Wikipedias\n' + '\n' + '.', + 'download_info': { 'multilingual': 
'multilingual'}, + 'family': 'BERT', + 'id': 'bert-base-multilingual-cased', + 'language_list': ['multilingual']}, + 'bert-base-multilingual-uncased': { 'description': '(Original, not ' + 'recommended) 12-layer, ' + '768-hidden, 12-heads, ' + '110M parameters.\n' + 'Trained on lower-cased ' + 'text in the top 102 ' + 'languages with the ' + 'largest Wikipedias\n' + '\n' + '.', + 'download_info': { 'multilingual': 'multilingual'}, + 'family': 'BERT', + 'id': 'bert-base-multilingual-uncased', + 'language_list': ['multilingual']}, + 'bert-base-uncased': { 'description': '12-layer, 768-hidden, 12-heads, ' + '110M parameters.\n' + 'Trained on lower-cased English ' + 'text.', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-base-uncased', + 'language_list': ['en']}, + 'bert-large-cased': { 'description': '24-layer, 1024-hidden, 16-heads, ' + '340M parameters.\n' + 'Trained on cased English text.', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-large-cased', + 'language_list': ['en']}, + 'bert-large-cased-whole-word-masking': { 'description': '24-layer, ' + '1024-hidden, ' + '16-heads, 340M ' + 'parameters.\n' + 'Trained on cased ' + 'English text ' + 'using ' + 'Whole-Word-Masking\n' + '\n' + '.', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-large-cased-whole-word-masking', + 'language_list': ['en']}, + 'bert-large-cased-whole-word-masking-finetuned-squad': { 'description': '24-layer, ' + '1024-hidden, ' + '16-heads, ' + '340M ' + 'parameters\n' + 'The ' + 'bert-large-cased-whole-word-masking ' + 'model ' + 'fine-tuned ' + 'on ' + 'SQuAD\n' + '\n' + '(see ' + 'details ' + 'of ' + 'fine-tuning ' + 'in ' + 'the ' + 'example ' + 'section)', + 'download_info': { 'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-large-cased-whole-word-masking-finetuned-squad', + 'language_list': [ 'en']}, + 'bert-large-uncased': { 'description': '24-layer, 1024-hidden, 16-heads, ' + '340M parameters.\n' + 'Trained on lower-cased English ' + 'text.', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-large-uncased', + 'language_list': ['en']}, + 'bert-large-uncased-whole-word-masking': { 'description': '24-layer, ' + '1024-hidden, ' + '16-heads, 340M ' + 'parameters.\n' + 'Trained on ' + 'lower-cased ' + 'English text ' + 'using ' + 'Whole-Word-Masking\n' + '\n' + '.', + 'download_info': {'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-large-uncased-whole-word-masking', + 'language_list': ['en']}, + 'bert-large-uncased-whole-word-masking-finetuned-squad': { 'description': '24-layer, ' + '1024-hidden, ' + '16-heads, ' + '340M ' + 'parameters.\n' + 'The ' + 'bert-large-uncased-whole-word-masking ' + 'model ' + 'fine-tuned ' + 'on ' + 'SQuAD\n' + '\n' + '(see ' + 'details ' + 'of ' + 'fine-tuning ' + 'in ' + 'the ' + 'example ' + 'section).', + 'download_info': { 'en': 'en'}, + 'family': 'BERT', + 'id': 'bert-large-uncased-whole-word-masking-finetuned-squad', + 'language_list': [ 'en']}, + 'camembert-base': { 'description': '12-layer, 768-hidden, 12-heads, 110M ' + 'parameters\n' + 'CamemBERT using the BERT-base ' + 'architecture\n' + '\n', + 'download_info': {'fr': 'fr'}, + 'family': 'CamemBERT', + 'id': 'camembert-base', + 'language_list': ['fr']}, + 'cl-tohoku/bert-base-japanese': { 'description': '12-layer, 768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on Japanese ' + 'text. 
Text is tokenized with MeCab and WordPiece.\nMeCab is required for tokenization.\n\n(see details on cl-tohoku repository).',
+                                      'download_info': {'ja': 'ja'},
+                                      'family': 'BERT',
+                                      'id': 'cl-tohoku/bert-base-japanese',
+                                      'language_list': ['ja']},
+    'cl-tohoku/bert-base-japanese-char': {
+        'description': '12-layer, 768-hidden, 12-heads, 110M parameters.\n'
+                       'Trained on Japanese text. Text is tokenized into characters.\n\n'
+                       '(see details on cl-tohoku repository).',
+        'download_info': {'ja': 'ja'},
+        'family': 'BERT',
+        'id': 'cl-tohoku/bert-base-japanese-char',
+        'language_list': ['ja']},
+    'cl-tohoku/bert-base-japanese-char-whole-word-masking': {
+        'description': '12-layer, 768-hidden, 12-heads, 110M parameters.\n'
+                       'Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.\n\n'
+                       '(see details on cl-tohoku repository).',
+        'download_info': {'ja': 'ja'},
+        'family': 'BERT',
+        'id': 'cl-tohoku/bert-base-japanese-char-whole-word-masking',
+        'language_list': ['ja']},
+    'cl-tohoku/bert-base-japanese-whole-word-masking': {
+        'description': '12-layer, 768-hidden, 12-heads, 110M parameters.\n'
+                       'Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.\n'
+                       'MeCab is required for tokenization.\n\n'
+                       '(see details on cl-tohoku repository).',
+        'download_info': {'ja': 'ja'},
+        'family': 'BERT',
+        'id': 'cl-tohoku/bert-base-japanese-whole-word-masking',
+        'language_list': ['ja']},
+    'ctrl': {
+        'description': '48-layer, 1280-hidden, 16-heads, 1.6B parameters\n'
+                       "Salesforce's Large-sized CTRL English model",
+        'download_info': {'en': 'en'},
+        'family': 'CTRL',
+        'id': 'ctrl',
+        'language_list': ['en']},
+    'distilbert-base-cased': {
+        'description': '6-layer, 768-hidden, 12-heads, 65M parameters\n'
+                       'The DistilBERT model distilled from the BERT model bert-base-cased checkpoint\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'DistilBERT',
+        'id': 'distilbert-base-cased',
+        'language_list': ['en']},
+    'distilbert-base-cased-distilled-squad': {
+        'description': '6-layer, 768-hidden, 12-heads, 65M parameters\n'
+                       'The DistilBERT model distilled from the BERT model bert-base-cased checkpoint, '
+                       'with an additional question answering layer.\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'DistilBERT',
+        'id': 'distilbert-base-cased-distilled-squad',
+        'language_list': ['en']},
+    'distilbert-base-german-cased': {
+        'description': '6-layer, 768-hidden, 12-heads, 66M parameters\n'
+                       'The German DistilBERT model distilled from the German DBMDZ BERT model '
+                       'bert-base-german-dbmdz-cased checkpoint.\n\n',
+        'download_info': {'de': 'de'},
+        'family': 'DistilBERT',
+        'id': 'distilbert-base-german-cased',
+        'language_list': ['de']},
+    'distilbert-base-multilingual-cased': {
+        'description': '6-layer, 768-hidden, 12-heads, 134M parameters\n'
+                       'The multilingual DistilBERT model distilled from the Multilingual BERT model '
+                       'bert-base-multilingual-cased checkpoint.\n\n',
+        'download_info': {'multilingual': 'multilingual'},
+        'family': 'DistilBERT',
+        'id': 
'distilbert-base-multilingual-cased', + 'language_list': ['multilingual']}, + 'distilbert-base-uncased': { 'description': '6-layer, 768-hidden, ' + '12-heads, 66M parameters\n' + 'The DistilBERT model ' + 'distilled from the BERT model ' + 'bert-base-uncased checkpoint\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'DistilBERT', + 'id': 'distilbert-base-uncased', + 'language_list': ['en']}, + 'distilbert-base-uncased-distilled-squad': { 'description': '6-layer, ' + '768-hidden, ' + '12-heads, 66M ' + 'parameters\n' + 'The ' + 'DistilBERT ' + 'model ' + 'distilled ' + 'from the BERT ' + 'model ' + 'bert-base-uncased ' + 'checkpoint, ' + 'with an ' + 'additional ' + 'linear ' + 'layer.\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'DistilBERT', + 'id': 'distilbert-base-uncased-distilled-squad', + 'language_list': ['en']}, + 'distilgpt2': { 'description': '6-layer, 768-hidden, 12-heads, 82M ' + 'parameters\n' + 'The DistilGPT2 model distilled from the ' + 'GPT2 model gpt2 checkpoint.\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'DistilBERT', + 'id': 'distilgpt2', + 'language_list': ['en']}, + 'distilroberta-base': { 'description': '6-layer, 768-hidden, 12-heads, 82M ' + 'parameters\n' + 'The DistilRoBERTa model distilled ' + 'from the RoBERTa model ' + 'roberta-base checkpoint.\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'RoBERTa', + 'id': 'distilroberta-base', + 'language_list': ['en']}, + 'facebook/bart-large': { 'description': '24-layer, 1024-hidden, 16-heads, ' + '406M parameters\n' + '\n', + 'download_info': {'en': 'en'}, + 'family': 'Bart', + 'id': 'facebook/bart-large', + 'language_list': ['en']}, + 'facebook/bart-large-cnn': { 'description': '12-layer, 1024-hidden, ' + '16-heads, 406M ' + 'parameters (same as ' + 'base)\n' + 'bart-large base architecture ' + 'finetuned on cnn ' + 'summarization task', + 'download_info': {'en': 'en'}, + 'family': 'Bart', + 'id': 'facebook/bart-large-cnn', + 'language_list': ['en']}, + 'facebook/bart-large-mnli': { 'description': 'Adds a 2 layer ' + 'classification head with 1 ' + 'million parameters\n' + 'bart-large base architecture ' + 'with a classification head, ' + 'finetuned on MNLI', + 'download_info': {'en': 'en'}, + 'family': 'Bart', + 'id': 'facebook/bart-large-mnli', + 'language_list': ['en']}, + 'facebook/mbart-large-en-ro': { 'description': '12-layer, 1024-hidden, ' + '16-heads, 880M parameters\n' + 'bart-large architecture ' + 'pretrained on cc25 ' + 'multilingual data , ' + 'finetuned on WMT english ' + 'romanian translation.', + 'download_info': { 'multilingual': 'multilingual'}, + 'family': 'Bart', + 'id': 'facebook/mbart-large-en-ro', + 'language_list': ['multilingual']}, + 'flaubert/flaubert_base_cased': { 'description': '12-layer, 768-hidden, ' + '12-heads, 138M ' + 'parameters\n' + 'FlauBERT base ' + 'architecture with cased ' + 'vocabulary\n' + '\n', + 'download_info': {'fr': 'fr'}, + 'family': 'FlauBERT', + 'id': 'flaubert/flaubert_base_cased', + 'language_list': ['fr']}, + 'flaubert/flaubert_base_uncased': { 'description': '12-layer, 768-hidden, ' + '12-heads, 137M ' + 'parameters\n' + 'FlauBERT base ' + 'architecture with ' + 'uncased vocabulary\n' + '\n', + 'download_info': {'fr': 'fr'}, + 'family': 'FlauBERT', + 'id': 'flaubert/flaubert_base_uncased', + 'language_list': ['fr']}, + 'flaubert/flaubert_large_cased': { 'description': '24-layer, 1024-hidden, ' + '16-heads, 373M ' + 'parameters\n' + 'FlauBERT large ' + 'architecture\n' + '\n', + 'download_info': {'fr': 'fr'}, + 'family': 
'FlauBERT',
+                                       'id': 'flaubert/flaubert_large_cased',
+                                       'language_list': ['fr']},
+    'flaubert/flaubert_small_cased': {
+        'description': '6-layer, 512-hidden, 8-heads, 54M parameters\n'
+                       'FlauBERT small architecture\n\n',
+        'download_info': {'fr': 'fr'},
+        'family': 'FlauBERT',
+        'id': 'flaubert/flaubert_small_cased',
+        'language_list': ['fr']},
+    'gpt2': {
+        'description': '12-layer, 768-hidden, 12-heads, 117M parameters.\n'
+                       'OpenAI GPT-2 English model',
+        'download_info': {'en': 'en'},
+        'family': 'GPT-2',
+        'id': 'gpt2',
+        'language_list': ['en']},
+    'gpt2-large': {
+        'description': '36-layer, 1280-hidden, 20-heads, 774M parameters.\n'
+                       "OpenAI's Large-sized GPT-2 English model",
+        'download_info': {'en': 'en'},
+        'family': 'GPT-2',
+        'id': 'gpt2-large',
+        'language_list': ['en']},
+    'gpt2-medium': {
+        'description': '24-layer, 1024-hidden, 16-heads, 345M parameters.\n'
+                       "OpenAI's Medium-sized GPT-2 English model",
+        'download_info': {'en': 'en'},
+        'family': 'GPT-2',
+        'id': 'gpt2-medium',
+        'language_list': ['en']},
+    'gpt2-xl': {
+        'description': '48-layer, 1600-hidden, 25-heads, 1558M parameters.\n'
+                       "OpenAI's XL-sized GPT-2 English model",
+        'download_info': {'en': 'en'},
+        'family': 'GPT-2',
+        'id': 'gpt2-xl',
+        'language_list': ['en']},
+    'openai-gpt': {
+        'description': '12-layer, 768-hidden, 12-heads, 110M parameters.\n'
+                       'OpenAI GPT English model',
+        'download_info': {'en': 'en'},
+        'family': 'GPT',
+        'id': 'openai-gpt',
+        'language_list': ['en']},
+    'roberta-base': {
+        'description': '12-layer, 768-hidden, 12-heads, 125M parameters\n'
+                       'RoBERTa using the BERT-base architecture\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'RoBERTa',
+        'id': 'roberta-base',
+        'language_list': ['en']},
+    'roberta-base-openai-detector': {
+        'description': '12-layer, 768-hidden, 12-heads, 125M parameters\n'
+                       'roberta-base fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'RoBERTa',
+        'id': 'roberta-base-openai-detector',
+        'language_list': ['en']},
+    'roberta-large': {
+        'description': '24-layer, 1024-hidden, 16-heads, 355M parameters\n'
+                       'RoBERTa using the BERT-large architecture\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'RoBERTa',
+        'id': 'roberta-large',
+        'language_list': ['en']},
+    'roberta-large-mnli': {
+        'description': '24-layer, 1024-hidden, 16-heads, 355M parameters\n'
+                       'roberta-large fine-tuned on MNLI.\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'RoBERTa',
+        'id': 'roberta-large-mnli',
+        'language_list': ['en']},
+    'roberta-large-openai-detector': {
+        'description': '24-layer, 1024-hidden, 16-heads, 355M parameters\n'
+                       'roberta-large fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.\n\n',
+        'download_info': {'en': 'en'},
+        'family': 'RoBERTa',
+        'id': 'roberta-large-openai-detector',
+        'language_list': ['en']},
+    't5-base': {
+        'description': '~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,\n'
+                       'Trained on English text: the Colossal Clean Crawled Corpus (C4)',
+        'download_info': {'en': 'en'},
+        'family': 'T5',
+        'id': 't5-base',
+        'language_list': ['en']},
+    't5-large': {
+        'description': '~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,\n'
+                       'Trained on English text: the Colossal Clean '
+                       
'Crawled Corpus (C4)', + 'download_info': {'en': 'en'}, + 'family': 'T5', + 'id': 't5-large', + 'language_list': ['en']}, + 't5-small': { 'description': '~60M parameters with 6-layers, ' + '512-hidden-state, 2048 feed-forward ' + 'hidden-state, 8-heads,\n' + 'Trained on English text: the Colossal Clean ' + 'Crawled Corpus (C4)', + 'download_info': {'en': 'en'}, + 'family': 'T5', + 'id': 't5-small', + 'language_list': ['en']}, + 'transfo-xl-wt103': { 'description': '18-layer, 1024-hidden, 16-heads, ' + '257M parameters.\n' + 'English model trained on ' + 'wikitext-103', + 'download_info': {'en': 'en'}, + 'family': 'Transformer-XL', + 'id': 'transfo-xl-wt103', + 'language_list': ['en']}, + 'wietsedv/bert-base-dutch-cased': { 'description': '12-layer, 768-hidden, ' + '12-heads, 110M ' + 'parameters.\n' + 'Trained on cased Dutch ' + 'text.\n' + '\n' + '(see details on ' + 'wietsedv repository).', + 'download_info': {'nl': 'nl'}, + 'family': 'BERT', + 'id': 'wietsedv/bert-base-dutch-cased', + 'language_list': ['nl']}, + 'xlm-clm-ende-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n' + 'XLM English-German model trained ' + 'with CLM (Causal Language Modeling) ' + 'on the concatenation of English and ' + 'German wikipedia', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-clm-ende-1024', + 'language_list': ['multilingual']}, + 'xlm-clm-enfr-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n' + 'XLM English-French model trained ' + 'with CLM (Causal Language Modeling) ' + 'on the concatenation of English and ' + 'French wikipedia', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-clm-enfr-1024', + 'language_list': ['multilingual']}, + 'xlm-mlm-100-1280': { 'description': '16-layer, 1280-hidden, 16-heads\n' + 'XLM model trained with MLM (Masked ' + 'Language Modeling) on 100 languages.', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-mlm-100-1280', + 'language_list': ['multilingual']}, + 'xlm-mlm-17-1280': { 'description': '16-layer, 1280-hidden, 16-heads\n' + 'XLM model trained with MLM (Masked ' + 'Language Modeling) on 17 languages.', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-mlm-17-1280', + 'language_list': ['multilingual']}, + 'xlm-mlm-en-2048': { 'description': '12-layer, 2048-hidden, 16-heads\n' + 'XLM English model', + 'download_info': {'en': 'en'}, + 'family': 'XLM', + 'id': 'xlm-mlm-en-2048', + 'language_list': ['en']}, + 'xlm-mlm-ende-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n' + 'XLM English-German model trained on ' + 'the concatenation of English and ' + 'German wikipedia', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-mlm-ende-1024', + 'language_list': ['multilingual']}, + 'xlm-mlm-enfr-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n' + 'XLM English-French model trained on ' + 'the concatenation of English and ' + 'French wikipedia', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-mlm-enfr-1024', + 'language_list': ['multilingual']}, + 'xlm-mlm-enro-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n' + 'XLM English-Romanian Multi-language ' + 'model', + 'download_info': {'multilingual': 'multilingual'}, + 'family': 'XLM', + 'id': 'xlm-mlm-enro-1024', + 'language_list': ['multilingual']}, + 'xlm-mlm-tlm-xnli15-1024': { 'description': '12-layer, 1024-hidden, ' + '8-heads\n' + 'XLM Model pre-trained with ' + 'MLM + TLM 
on the 15 XNLI languages.',
+                                 'download_info': {'multilingual': 'multilingual'},
+                                 'family': 'XLM',
+                                 'id': 'xlm-mlm-tlm-xnli15-1024',
+                                 'language_list': ['multilingual']},
+    'xlm-mlm-xnli15-1024': {
+        'description': '12-layer, 1024-hidden, 8-heads\n'
+                       'XLM Model pre-trained with MLM on the 15 XNLI languages.',
+        'download_info': {'multilingual': 'multilingual'},
+        'family': 'XLM',
+        'id': 'xlm-mlm-xnli15-1024',
+        'language_list': ['multilingual']},
+    'xlm-roberta-base': {
+        'description': '~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,\n'
+                       'Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages',
+        'download_info': {'xlm-roberta-base': 'xlm-roberta-base'},
+        'family': 'XLM-RoBERTa',
+        'id': 'xlm-roberta-base',
+        'language_list': ['xlm-roberta-base']},
+    'xlm-roberta-large': {
+        'description': '~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,\n'
+                       'Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages',
+        'download_info': {'xlm-roberta-large': 'xlm-roberta-large'},
+        'family': 'XLM-RoBERTa',
+        'id': 'xlm-roberta-large',
+        'language_list': ['xlm-roberta-large']},
+    'xlnet-base-cased': {
+        'description': '12-layer, 768-hidden, 12-heads, 110M parameters.\n'
+                       'XLNet English model',
+        'download_info': {'en': 'en'},
+        'family': 'XLNet',
+        'id': 'xlnet-base-cased',
+        'language_list': ['en']},
+    'xlnet-large-cased': {
+        'description': '24-layer, 1024-hidden, 16-heads, 340M parameters.\n'
+                       'XLNet Large English model',
+        'download_info': {'en': 'en'},
+        'family': 'XLNet',
+        'id': 'xlnet-large-cased',
+        'language_list': ['en']}
+}
+
+TRANSFORMERS_LIST = [
+    'bert-base-uncased', 'bert-large-uncased', 'bert-base-cased', 'bert-large-cased',
+    'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-chinese',
+    'bert-base-german-cased', 'bert-large-uncased-whole-word-masking', 'bert-large-cased-whole-word-masking',
+    'bert-large-uncased-whole-word-masking-finetuned-squad', 'bert-large-cased-whole-word-masking-finetuned-squad',
+    'bert-base-cased-finetuned-mrpc', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased',
+    'cl-tohoku/bert-base-japanese', 'cl-tohoku/bert-base-japanese-whole-word-masking',
+    'cl-tohoku/bert-base-japanese-char', 'cl-tohoku/bert-base-japanese-char-whole-word-masking',
+    'TurkuNLP/bert-base-finnish-cased-v1', 'TurkuNLP/bert-base-finnish-uncased-v1', 'wietsedv/bert-base-dutch-cased',
+    'facebook/bart-large', 'facebook/bart-large-mnli', 'facebook/bart-large-cnn', 'facebook/mbart-large-en-ro',
+    'openai-gpt', 'transfo-xl-wt103', 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'distilgpt2', 'ctrl',
+    'xlnet-base-cased', 'xlnet-large-cased', 'xlm-mlm-en-2048', 'xlm-mlm-ende-1024', 'xlm-mlm-enfr-1024',
+    'xlm-mlm-enro-1024', 'xlm-mlm-tlm-xnli15-1024', 'xlm-mlm-xnli15-1024', 'xlm-clm-enfr-1024', 'xlm-clm-ende-1024',
+    'xlm-mlm-17-1280', 'xlm-mlm-100-1280', 'roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base',
+    'roberta-base-openai-detector', 'roberta-large-openai-detector', 'distilbert-base-uncased',
+    'distilbert-base-uncased-distilled-squad', 'distilbert-base-cased', 'distilbert-base-cased-distilled-squad',
+    'distilbert-base-german-cased', 'distilbert-base-multilingual-cased', 'albert-base-v1', 'albert-large-v1',
+    'albert-xlarge-v1', 'albert-xxlarge-v1', 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2',
+    'albert-xxlarge-v2', 'camembert-base', 't5-small', 
't5-base', 't5-large', 'xlm-roberta-base', 'xlm-roberta-large',
+    'flaubert/flaubert_small_cased', 'flaubert/flaubert_base_uncased', 'flaubert/flaubert_base_cased',
+    'flaubert/flaubert_large_cased', 'allenai/longformer-base-4096', 'allenai/longformer-large-4096']
+
diff --git a/python-lib/macro/model_downloaders.py b/python-lib/macro/model_downloaders.py
new file mode 100644
index 0000000..a8f3caa
--- /dev/null
+++ b/python-lib/macro/model_downloaders.py
@@ -0,0 +1,293 @@
+import gzip
+import io
+import shutil
+import tarfile
+import time
+import zipfile
+
+import requests
+from transformers.file_utils import hf_bucket_url
+
+from macro.model_configurations import MODEL_CONFIFURATIONS
+
+
+WORD2VEC_BASE_URL = "http://vectors.nlpl.eu/repository/20/{}.zip"
+FASTTEXT_BASE_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{}.300.vec.gz"
+HG_FILENAMES = ["pytorch_model.bin", "config.json", "vocab.txt"]
+
+
+class BaseDownloader(object):
+    def __init__(self, folder, macro_inputs, proxy, progress_callback):
+        self.folder = folder
+        self.proxy = proxy
+        self.progress_callback = progress_callback
+        self.language = macro_inputs["language"]
+        self.embedding_model = macro_inputs["embedding_model"]
+        self.embedding_family = macro_inputs["embedding_family"]
+        self.model_params = MODEL_CONFIFURATIONS[self.embedding_model]
+        self.model_id = self.embedding_model + '-' + self.language
+        self.archive_name = ''
+
+    def get_stream(self, download_link):
+        response = requests.get(download_link, stream=True, proxies=self.proxy)
+        return response
+
+    def download_plain(self, response, bytes_so_far=0):
+        # Download plain files straight into the managed folder
+        total_size = self.get_file_size(response)
+        update_time = time.time()
+        with self.folder.get_writer(self.archive_name) as w:
+            for chunk in response.iter_content(chunk_size=100000):
+                if chunk:
+                    bytes_so_far += len(chunk)
+                    percent = int(float(bytes_so_far) / total_size * 100)
+                    update_time = self.update_percent(percent, update_time)
+                    w.write(chunk)
+        return bytes_so_far
+
+    def download_gz(self, response, bytes_so_far=0):
+        # Download .gz files, then decompress them in the folder
+        total_size = self.get_file_size(response)
+        update_time = time.time()
+        destination_writer = self.folder.get_writer(self.archive_name)
+
+        # Write the .gz file to the folder (scale to 95% to leave room for extraction)
+        for chunk in response.iter_content(chunk_size=32768):
+            if chunk:  # filter out keep-alive new chunks
+                bytes_so_far += len(chunk)
+                percent = int(float(bytes_so_far) / total_size * 95)
+                update_time = self.update_percent(percent, update_time)
+                destination_writer.write(chunk)
+        destination_writer.close()
+
+        # Decompress the archive
+        write_to_path = self.language + '/' + self.embedding_family + '/' + self.model_id
+        with self.folder.get_writer(write_to_path) as f_out, self.folder.get_download_stream(self.archive_name) as f_in:
+            shutil.copyfileobj(gzip.open(f_in), f_out)
+
+        # Remove the .gz file
+        self.folder.delete_path(self.archive_name)
+        return bytes_so_far
+
+    def download_tar_gz(self, response, bytes_so_far=0):
+        # Download .tar.gz files, then extract them in the folder
+        total_size = self.get_file_size(response)
+        update_time = time.time()
+        with self.folder.get_writer(self.archive_name) as w:
+            for chunk in response.iter_content(chunk_size=100000):
+                if chunk:
+                    bytes_so_far += len(chunk)
+                    percent = int(float(bytes_so_far) / total_size * 95)
+                    update_time = self.update_percent(percent, update_time)
+                    w.write(chunk)
+        # Untar the archive
+        with 
self.folder.get_download_stream(self.archive_name) as f_in: + with tarfile.open(fileobj=io.BytesIO(f_in.read())) as tar: + members = tar.getmembers() + for member in members: + if member.isfile(): + write_to_path = self.language + '/' + self.embedding_family + '/' + member.name + with self.folder.get_writer(write_to_path) as f_out: + shutil.copyfileobj(tar.extractfile(member),f_out) + self.folder.delete_path(self.archive_name) + return bytes_so_far + + def download_zip(self, response, bytes_so_far = 0): + #Download .zip files + total_size = self.get_file_size(response) + update_time = time.time() + with self.folder.get_writer(self.archive_name) as w: + for chunk in response.iter_content(chunk_size=100000): + if chunk: + bytes_so_far += len(chunk) + percent = int(float(bytes_so_far) / total_size * 95) + update_time = self.update_percent(percent, update_time) + w.write(chunk) + #Unzip file + with self.folder.get_download_stream(self.archive_name) as f_in: + with zipfile.ZipFile(io.BytesIO(f_in.read())) as fzip: + if self.embedding_model == "word2vec": + archive_name = "model.bin" + elif self.embedding_model == "glove": + archive_name = fzip.namelist()[0] + else: + raise NotImplementedError() + write_to_path = self.language + '/' + self.embedding_family + '/' + self.model_id + with fzip.open(archive_name) as fzip_file, self.folder.get_writer(write_to_path) as f_out: + shutil.copyfileobj(fzip_file, f_out) + self.folder.delete_path(self.archive_name) + return bytes_so_far + + def get_file_size(self,response): + if self.model_id == "word2vec-en": + total_size = 3390000000 #3.39GB + else: + total_size = int(response.headers.get('content-length')) + + return total_size if total_size>0 else 300000000 #300MB + + def update_percent(self,percent, last_update_time): + new_time = time.time() + if (new_time - last_update_time) > 5: + self.progress_callback(percent) + return new_time + else: + return last_update_time + + def get_download_link(self): + raise NotImplementedError() + + def run(self): + raise NotImplementedError() + + + + +class Word2vecDownloader(BaseDownloader): + def __init__(self,folder,macro_inputs,proxy,progress_callback): + BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback) + self.archive_name = self.language + '/' + self.embedding_family + '/' + if self.language == "en": + self.archive_name += self.model_id + ".bin.gz" + else: + self.archive_name += self.model_id + ".zip" + + def get_gdrive_stream(self, download_link): + id_gdrive = self.model_params["download_info"][self.language]["id_gdrive"] + session = requests.Session() + response = session.get(download_link, params={'id': id_gdrive} , stream=True, proxies=self.proxy) + token = self.__get_confirm_token(response) + + if token: + params = {'id': id_gdrive, 'confirm': token} + response = session.get(download_link, params=params, stream=True, proxies=self.proxy) + else: + raise RuntimeError("Google Drive Token could not be verified.") + + return response + + def __get_confirm_token(self,response): + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + return value + return None + + def get_download_link(self): + if self.language == "en": + return self.model_params["download_info"][self.language]["model_link"] + else: + model_id = self.model_params["download_info"][self.language]["model_id"] + return WORD2VEC_BASE_URL.format(model_id) + + def run(self): + if self.language == "en": + download_link = self.get_download_link() + response = self.get_gdrive_stream(download_link) + 
self.download_gz(response) + else: + download_link = self.get_download_link() + response = self.get_stream(download_link) + self.download_zip(response) + + + + +class FasttextDownloader(BaseDownloader): + def __init__(self,folder,macro_inputs,proxy,progress_callback): + BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback) + self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".gz" + + def get_download_link(self): + return FASTTEXT_BASE_URL.format(self.model_params["download_info"][self.language]) + + def run(self): + download_link = self.get_download_link() + response = self.get_stream(download_link) + self.download_gz(response) + + + + + +class GloveDownloader(BaseDownloader): + def __init__(self,folder,macro_inputs,proxy,progress_callback): + BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback) + self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".zip" + + def get_download_link(self): + return self.model_params["download_info"][self.language] + + def run(self): + download_link = self.get_download_link() + response = self.get_stream(download_link) + self.download_zip(response) + + +class ElmoDownloader(BaseDownloader): + def __init__(self,folder,macro_inputs,proxy,progress_callback): + BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback) + self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".tar.gz" + + def get_download_link(self): + return self.model_params["download_info"][self.language] + + def run(self): + download_link = self.get_download_link() + response = self.get_stream(download_link) + self.download_tar_gz(response) + + + + +class UseDownloader(BaseDownloader): + def __init__(self,folder,macro_inputs,proxy,progress_callback): + BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback) + self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".tar.gz" + + def get_download_link(self): + return self.model_params["download_info"][self.language] + + def run(self): + download_link = self.get_download_link() + response = self.get_stream(download_link) + self.download_tar_gz(response) + + + +class HuggingFaceDownloader(BaseDownloader): + def __init__(self,folder,macro_inputs,proxy,progress_callback): + BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback) + self.macro_inputs = macro_inputs + self.model_shortcut_name = self.macro_inputs["transformer_shortcut_name"] + + + def run(self): + bytes_so_far = 0 + for filename in HG_FILENAMES: + self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_shortcut_name.replace("/","_") + '/' + filename + download_link = self.get_download_link(filename) + response = self.get_stream(download_link) + if response.status_code == 200: + bytes_so_far = self.download_plain(response, bytes_so_far) + elif response.status_code == 404: + pass + + def get_file_size(self, response=None): + total_size = 0 + for filename in HG_FILENAMES: + download_link = self.get_download_link(filename) + response = self.get_stream(download_link) + if response.status_code == 200: + total_size += int(response.headers.get('content-length')) + elif response.status_code == 404: + total_size += 0 + return total_size + + + def get_download_link(self,filename): + return hf_bucket_url(self.model_shortcut_name,filename) \ No newline at end of file diff --git a/python-runnables/download-pretrained-embedding/runnable.json 
b/python-runnables/download-pretrained-embedding/runnable.json index 3d95597..85eea75 100755 --- a/python-runnables/download-pretrained-embedding/runnable.json +++ b/python-runnables/download-pretrained-embedding/runnable.json @@ -1,82 +1,18 @@ { - "meta" : { - "label" : "Pre-trained Embeddings", - "description" : "Downloads pre-trained word embeddings. Available models are: Word2vec, GloVe, fastText and ELMo.", - "icon" : "icon-cloud-download" + "meta": { + "label": "Pre-trained Embeddings", + "description": "Downloads pre-trained word embeddings. Available models are: Word2vec, GloVe, fastText and ELMo.", + "icon": "icon-cloud-download" }, - "impersonate" : false, - - "permissions" : ["WRITE_CONF"], - - "resultType" : "HTML", - - "resultLabel" : "model download output", - - "extension" : "txt", - - "mimeType" : "text/plain", - - "params": [ - { - "name": "source", - "label": "Source", - "type": "SELECT", - "selectChoices": [ - { - "label": "Word2vec", - "value": "word2vec" - }, - { - "label": "GloVe", - "value": "glove" - }, - { - "label": "FastText", - "value": "fasttext" - }, - { - "label": "ELMo", - "value": "elmo" - } - ], - "mandatory": true - }, - { - "visibilityCondition": "model.source == 'fasttext'", - "name": "text_language_fasttext", - "label": "Text language", - "type": "SELECT", - "selectChoices": [ - { - "label": "English", - "value": "english" - }, - { - "label": "French", - "value": "french" - } - ], - "mandatory": true - }, - { - "visibilityCondition": "model.source != 'fasttext'", - "name": "text_language_other", - "label": "Text language", - "type": "SELECT", - "selectChoices": [ - { - "label": "English", - "value": "english" - } - ], - "mandatory": true - }, - { - "name": "outputName", - "label" : "Output folder name", - "type": "STRING", - "description":"Use a different folder for each downloaded embeddings.", - "mandatory" : true - } - ] -} + "impersonate": true, + "permissions": [ + "WRITE_CONF" + ], + "resultType": "HTML", + "resultLabel": "model download output", + "extension": "txt", + "mimeType": "text/plain", + "paramsTemplate" : "index.html", + "paramsModule" : "modelDownloader.build", + "paramsPythonSetup": "recipe-helper.py" +} \ No newline at end of file diff --git a/python-runnables/download-pretrained-embedding/runnable.py b/python-runnables/download-pretrained-embedding/runnable.py index 9c10ff7..4bb266c 100755 --- a/python-runnables/download-pretrained-embedding/runnable.py +++ b/python-runnables/download-pretrained-embedding/runnable.py @@ -2,41 +2,18 @@ import dataiku from dataiku.runnables import Runnable - -import os -import gzip +from macro.model_downloaders import (Word2vecDownloader, + FasttextDownloader, + GloveDownloader, + ElmoDownloader, + UseDownloader, + HuggingFaceDownloader + ) +from macro.macro_utils import read_model_inputs +from macro.model_configurations import TRANSFORMERS_LIST import zipfile -import requests -import shutil - - -def download_file_from_google_drive(id, destination): - URL = "https://docs.google.com/uc?export=download" - - session = requests.Session() - response = session.get(URL, params={'id': id}, stream=True) - token = get_confirm_token(response) - - if token: - params = {'id': id, 'confirm': token} - response = session.get(URL, params=params, stream=True) - save_response_content(response, destination) - - -def get_confirm_token(response): - for key, value in response.cookies.items(): - if key.startswith('download_warning'): - return value - return None - - -def save_response_content(response, destination): - 
CHUNK_SIZE = 32768 - - with open(destination, "wb") as f: - for chunk in response.iter_content(CHUNK_SIZE): - if chunk: # filter out keep-alive new chunks - f.write(chunk) +import json +import os class MyRunnable(Runnable): @@ -52,6 +29,7 @@ def __init__(self, project_key, config, plugin_config): self.config = config self.plugin_config = plugin_config self.client = dataiku.api_client() + def get_progress_target(self): """ @@ -63,14 +41,10 @@ def get_progress_target(self): def run(self, progress_callback): # Retrieving parameters - output_folder_name = self.config.get('outputName', '') - source = self.config.get('source', '') - if source == 'fasttext': - text_language = self.config.get('text_language_fasttext', '') - else: - text_language = self.config.get('text_language_other', '') + macro_inputs = read_model_inputs(self.config) # Creating new Managed Folder if needed + output_folder_name = macro_inputs["output_folder_name"] project = self.client.get_project(self.project_key) output_folder_found = False @@ -86,104 +60,32 @@ def run(self, progress_callback): output_folder = dataiku.Folder(output_folder.get_definition()["id"], project_key=self.project_key) - output_folder_path = output_folder.get_path() - ####################################### # Downloading and extracting the data ####################################### - if source == 'word2vec': - if text_language == 'english': - file_id = '0B7XkCwpI5KDYNlNUTTlSS21pQmM' - else: - raise NotImplementedError("Word2vec vectors are only available for English. Use fastText for other languages.") - - # Download from Google Drive - archive_fname = os.path.join(output_folder_path, "GoogleNews-vectors-negative300.bin.gz") - download_file_from_google_drive(file_id, archive_fname) - - # Decompress in managed folder and rename - """ - decompressed_file = gzip.GzipFile(archive_fname) - with open(os.path.join(output_folder_path, "Word2vec_embeddings"), 'wb') as outfile: - print('))))))))))) WRITING FILE') - outfile.write(decompressed_file.read()) - """ - outfile_path = os.path.join(output_folder_path, "Word2vec_embeddings") - with open(outfile_path, 'wb') as f_out, gzip.open(archive_fname, 'rb') as f_in: - shutil.copyfileobj(f_in, f_out) - - os.remove(archive_fname) - - - elif source == 'fasttext': - if text_language == 'english': - url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec' - elif text_language == 'french': - url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.vec' - elif text_language == 'german': - url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.vec' - else: - raise NotImplementedError( - "Only English, French and German languages are supported.") - r = requests.get(url, stream=True) - with output_folder.get_writer("fastText_embeddings") as w: - for chunk in r.iter_content(chunk_size=100000): - if chunk: - w.write(chunk) - - - elif source == 'glove': - if text_language == 'english': - url = 'http://nlp.stanford.edu/data/glove.42B.300d.zip' - else: - raise NotImplementedError("GloVe vectors are only available for English. 
Use fastText for other languages.") - - archive_name = os.path.basename(url) - - # Download archive - r = requests.get(url, stream=True) - with output_folder.get_writer(archive_name) as w: - for chunk in r.iter_content(chunk_size=100000): - if chunk: - w.write(chunk) - - file_basename = os.path.splitext(archive_name)[0] - file_name = file_basename + '.txt' - file_rename = "GloVe_embeddings" - - # Unzip archive into same directory - zip_ref = zipfile.ZipFile(os.path.join( - output_folder_path, archive_name), 'r') - zip_ref.extractall(output_folder_path) - zip_ref.close() - - # remove archive - os.remove(os.path.join(output_folder_path, archive_name)) - # rename embedding file - os.rename(os.path.join(output_folder_path, file_name), os.path.join(output_folder_path, file_rename)) - - - elif source == 'elmo': - if text_language == 'english': - import tensorflow as tf - import tensorflow_hub as hub - - elmo_model_dir = os.path.join(output_folder_path, "ELMo") - - if not os.path.exists(elmo_model_dir): - os.makedirs(elmo_model_dir) - - # Path for saving ELMo - os.environ["TFHUB_CACHE_DIR"] = elmo_model_dir - - # Download ELMo - elmo_model = hub.Module( - "https://tfhub.dev/google/elmo/2", trainable=False) - else: - raise NotImplementedError( - "ELMo is only available for English. Use fastText for other languages.") + embedding_model = macro_inputs["embedding_model"] + proxy = self.plugin_config["proxy"] + if embedding_model == 'word2vec': + Word2vecDownloader(output_folder, macro_inputs, proxy, progress_callback).run() + + + elif embedding_model == 'fasttext': + FasttextDownloader(output_folder, macro_inputs, proxy, progress_callback).run() + + + elif embedding_model == 'glove': + GloveDownloader(output_folder, macro_inputs, proxy, progress_callback).run() + + elif embedding_model == 'elmo': + ElmoDownloader(output_folder, macro_inputs, proxy, progress_callback).run() + + elif embedding_model == 'use': + UseDownloader(output_folder, macro_inputs, proxy, progress_callback).run() + + elif embedding_model in TRANSFORMERS_LIST: + HuggingFaceDownloader(output_folder, macro_inputs, proxy, progress_callback).run() else: - raise NotImplementedError( - "Only Word2vec, GloVe and FastText embeddings are supported.") + raise ValueError("Model not found.") + return "
The model was downloaded successfully!" diff --git a/resource/index.css new file mode 100644 index 0000000..614344e --- /dev/null +++ b/resource/index.css @@ -0,0 +1,11 @@ +.error-message { + color: red; + font-size: 10px; + margin-left: 2px; +} + +.desc-message { + color: gray; + font-size: 10px; + margin-left: 2px; +} diff --git a/resource/index.html new file mode 100644 index 0000000..0b88c1a --- /dev/null +++ b/resource/index.html @@ -0,0 +1,69 @@
+ <!-- The markup of this new AngularJS form template did not survive extraction into this view; only its user-facing strings remain. Recoverable content: -->
+ <!--   validation message: "Please select a language" -->
+ <!--   validation message: "Please select a model" -->
+ <!--   validation message: "Please select a model version" -->
+ <!--   help text: "Please define an output Folder. If the folder doesn't exist, it will be automatically created." -->
\ No newline at end of file diff --git a/resource/recipe-helper.py new file mode 100644 index 0000000..e8108cd --- /dev/null +++ b/resource/recipe-helper.py @@ -0,0 +1,48 @@ +import dataiku +from macro.model_configurations import MODEL_CONFIFURATIONS +from macro.macro_utils import lang_iso_to_label, lang_label_to_iso + +def do(payload, config, plugin_config, inputs): + if payload["method"] == "get_languages": + return get_languages() + + if payload["method"] == "get_models": + return get_models(config) + + if payload["method"] == "get_transformer_model_versions": + return get_transformer_model_versions(config) + + if payload["method"] == "get_model_description": + return get_model_description(config) + + +def get_languages(): + languages = [m["language_list"] for m in MODEL_CONFIFURATIONS.values()] + languages = list(set([item for sublist in languages for item in sublist])) + languages_labels = lang_iso_to_label(languages) + return {'languages': sorted(languages_labels)} + + +def get_models(config): + language_label = config.get("language") + language = lang_label_to_iso(language_label) + models = [m["family"] for m in MODEL_CONFIFURATIONS.values() if language in m["language_list"]] + return {'models': list(set(models))} + +def get_transformer_model_versions(config): + model = config.get("modelName") + language_label = config.get("language") + language = lang_label_to_iso(language_label) + transformer_model_versions = [x["id"] for x in MODEL_CONFIFURATIONS.values() if x["family"] == model and language in x["language_list"]] + return {"transformer_model_versions": transformer_model_versions, + "model_name": model} + +def get_model_description(config): + model = config.get("transformersModelVersion") + if model is None: + model_description = "" + else: + model_description = MODEL_CONFIFURATIONS[model]["description"] + return {'model_description': model_description} + + diff --git a/state.json new file mode 100644 index 0000000..1615e60 --- /dev/null +++ b/state.json @@ -0,0 +1,17 @@ +{ + "actions": [ + { + "ts": 1583162079832, + "action": "INSTALL", + "user": "admin" + }, + { + "ts": 1588847033731, + "action": "UPGRADE", + "user": "admin" + } + ], + "gitState": { + "enabled": false + } +} \ No newline at end of file
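A note on the data contract behind resource/recipe-helper.py: do() dispatches on payload["method"], and every helper reads MODEL_CONFIFURATIONS, which is defined in macro/model_configurations.py and therefore not visible in this diff. The sketch below is a minimal, self-contained illustration of the entry shape those lookups imply; the entry key, the FastText example, and all values are assumptions, not the plugin's real configuration.

    # Illustrative only: a hypothetical MODEL_CONFIFURATIONS entry, inferred from
    # the lookups in recipe-helper.py ("family", "language_list", "id",
    # "description") and model_downloaders.py ("download_info"). The real
    # dictionary lives in macro/model_configurations.py, not shown in this diff.
    MODEL_CONFIFURATIONS = {
        "fasttext-wiki": {                                        # hypothetical entry key
            "id": "fasttext-wiki",                                # surfaced as a model version
            "family": "FastText",                                 # shown in the model dropdown
            "language_list": ["en", "fr"],                        # ISO codes this model supports
            "description": "Wikipedia-trained fastText vectors.", # placeholder text
            "download_info": {"en": "wiki.en", "fr": "wiki.fr"},  # per-language download payload
        },
    }

    # The same filtering get_models() applies once the language label has been
    # converted to an ISO code by lang_label_to_iso():
    language = "en"
    models = [m["family"] for m in MODEL_CONFIFURATIONS.values() if language in m["language_list"]]
    print({"models": list(set(models))})  # -> {'models': ['FastText']}

Storing ISO codes in language_list is why get_languages() flattens and deduplicates across entries and then relies on lang_iso_to_label()/lang_label_to_iso() to translate between codes and the labels shown in the form.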