diff --git a/.gitignore b/.gitignore
index b6e4761..dff3c38 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,129 +1,10 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
+.journal
+.mainlock
+.wlock
+.ts
+.git
+*.pyc
.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
+.htmlcov
+.DS_Store
+.idea
diff --git a/code-env/python/desc.json b/code-env/python/desc.json
index bc25f36..c687218 100755
--- a/code-env/python/desc.json
+++ b/code-env/python/desc.json
@@ -1,6 +1,8 @@
{
- "acceptedPythonInterpreters": ["PYTHON27"],
- "forceConda": false,
- "installCorePackages": true,
- "installJupyterSupport": false
+ "acceptedPythonInterpreters": [
+ "PYTHON36"
+ ],
+ "forceConda": false,
+ "installCorePackages": true,
+ "installJupyterSupport": false
}
\ No newline at end of file
diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
index 60dd177..622bccd 100644
--- a/code-env/python/spec/requirements.txt
+++ b/code-env/python/spec/requirements.txt
@@ -4,3 +4,5 @@ gensim==3.8.2
scikit-learn==0.20.4
tensorflow==1.15.2
tensorflow-hub==0.5.0
+torch==1.3.1
+transformers
\ No newline at end of file
diff --git a/custom-recipes/sentence-embedding-compute/recipe.json b/custom-recipes/sentence-embedding-compute/recipe.json
index 7749711..428656d 100755
--- a/custom-recipes/sentence-embedding-compute/recipe.json
+++ b/custom-recipes/sentence-embedding-compute/recipe.json
@@ -42,7 +42,7 @@
"type": "COLUMNS",
"description": "",
"mandatory": true,
- "columnRole":"input_dataset"
+ "columnRole": "input_dataset"
},
{
"name": "aggregation_method",
@@ -84,14 +84,6 @@
"description": "Used for computing SIF weights.",
"type": "DOUBLE",
"defaultValue": 0.001
- },
- {
- "visibilityCondition": "model.advanced_settings",
- "name": "n_principal_components",
- "label": "[SIF] Principal Components",
- "description": "Number of components to remove in SIF computation.",
- "type": "INT",
- "defaultValue": 1
}
]
-}
+}
\ No newline at end of file
diff --git a/dist/dss-plugin-sentence-embedding-1.2.2.zip b/dist/dss-plugin-sentence-embedding-1.2.2.zip
new file mode 100644
index 0000000..8d0c82c
Binary files /dev/null and b/dist/dss-plugin-sentence-embedding-1.2.2.zip differ
diff --git a/js/script.js b/js/script.js
new file mode 100644
index 0000000..9f18307
--- /dev/null
+++ b/js/script.js
@@ -0,0 +1,74 @@
+var app = angular.module('modelDownloader.build', []);
+var non_transformer_models = ["Word2Vec", "FastText", "Glove", "ELMo", "USE"];
+
+app.controller('modelDownloaderController', function($scope) {
+
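+    // With no language selected yet, show only the language and output-folder fields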
+ $scope.$watch('config', function(nv) {
+ if(nv && nv.language){
+ return;
+ }
+ $scope.showLanguageList=true;
+ $scope.showModelList=false;
+ $scope.showTransformersModelversion=false;
+ $scope.showOutputFolder=true;
+ $scope.showModelDescription=false;
+ });
+
+ $scope.getModels = function(){
+ $scope.callPythonDo({method: "get_models"}).then(function(data){
+            $scope.models = data['models'];
+ });
+ $scope.showLanguageList=true;
+ $scope.showModelList=true;
+ $scope.showTransformersModelversion=false;
+ $scope.showOutputFolder=true;
+ $scope.showModelDescription=false;
+ };
+
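+    // Hide the version dropdown for model families that are not Hugging Face transformers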
+ $scope.getTransformerModelVersions = function(){
+ $scope.callPythonDo({method:"get_transformer_model_versions"}).then(function(data){
+ $scope.transformersModelVersions = data['transformer_model_versions'];
+ var model_name = data['model_name'];
+ if(non_transformer_models.includes(model_name)){
+ $scope.showTransformersModelversion=false;
+ }
+ else{
+ $scope.showTransformersModelversion=true;
+ }
+
+ });
+ $scope.showLanguageList=true;
+ $scope.showModelList=true;
+ $scope.showOutputFolder=true;
+ $scope.showModelDescription=false;
+
+ };
+
+ $scope.getModelDescription = function(){
+ $scope.callPythonDo({method: "get_model_description"}).then(function(data){
+            $scope.modelDescription = data['model_description'];
+ });
+ $scope.showLanguageList=true;
+ $scope.showModelList=true;
+ $scope.showTransformersModelversion=true;
+ $scope.showOutputFolder=true;
+ $scope.showModelDescription=true;
+ };
+
+ var init = function(){
+ $scope.callPythonDo({method: "get_languages"}).then(function(data){
+            $scope.languages = data['languages'];
+ });
+ $scope.showLanguageList=true;
+ $scope.showModelList=false;
+ $scope.showTransformersModelversion=false;
+ $scope.showOutputFolder=true;
+ $scope.showModelDescription=false;
+
+ };
+
+ init();
+
+});
\ No newline at end of file
diff --git a/parameter-sets/custom_proxy_config/parameter-set.json b/parameter-sets/custom_proxy_config/parameter-set.json
new file mode 100644
index 0000000..914d979
--- /dev/null
+++ b/parameter-sets/custom_proxy_config/parameter-set.json
@@ -0,0 +1,20 @@
+{
+ "meta": {
+ "label": "Custom Proxy Configuration",
+ "description": "",
+ "icon": "icon-powerbi"
+ },
+ "defaultDefinableInline": true,
+ "defaultDefinableAtProjectLevel": true,
+ "pluginParams": [
+
+ ],
+ "params": [
+ {
+ "name": "custom_proxy_config",
+ "label": "Custom proxy configuration",
+ "description": "Needs to follow Python's requests specifications.\nExample:\nhttp --> http://10.10.1.10:3128\nhttps --> http://10.10.1.10:1080",
+ "type": "MAP"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/plugin.json b/plugin.json
index 2f7002f..cb769cc 100755
--- a/plugin.json
+++ b/plugin.json
@@ -8,7 +8,21 @@
"author": "Dataiku (Hicham El Boukkouri)",
"icon": "icon-list-alt",
"licenseInfo": "Apache Software License",
- "url": "https://www.dataiku.com/product/plugins/sentence-embedding/",
- "tags": ["NLP", "Machine Learning", "AutoML"]
- }
+ "url": "https://www.dataiku.com/dss/plugins/info/sentence-embedding.html",
+ "tags": [
+ "NLP",
+ "Machine Learning",
+ "AutoML"
+ ]
+ },
+ "params": [
+ {
+ "name": "proxy",
+ "label": "Proxy settings",
+ "description": "Needs to follow Python's requests specifications.\nExample:\nhttp --> http://10.10.1.10:3128\nhttps --> http://10.10.1.10:1080",
+ "type": "MAP",
+ "mandatory": true
+ }
+ ]
}
+
diff --git a/python-lib/macro/language_dict.py b/python-lib/macro/language_dict.py
new file mode 100644
index 0000000..ef66e10
--- /dev/null
+++ b/python-lib/macro/language_dict.py
@@ -0,0 +1,189 @@
+SUPPORTED_LANGUAGES = [
+ {'label': 'Afar', 'value': 'aa'},
+ {'label': 'Abkhazian', 'value': 'ab'},
+ {'label': 'Afrikaans', 'value': 'af'},
+ {'label': 'Akan', 'value': 'ak'},
+ {'label': 'Amharic', 'value': 'am'},
+ {'label': 'Arabic', 'value': 'ar'},
+ {'label': 'Aragonese', 'value': 'an'},
+ {'label': 'Assamese', 'value': 'as'},
+ {'label': 'Avaric', 'value': 'av'},
+ {'label': 'Avestan', 'value': 'ae'},
+ {'label': 'Aymara', 'value': 'ay'},
+ {'label': 'Azerbaijani', 'value': 'az'},
+ {'label': 'Bashkir', 'value': 'ba'},
+ {'label': 'Bambara', 'value': 'bm'},
+ {'label': 'Belarusian', 'value': 'be'},
+ {'label': 'Bengali', 'value': 'bn'},
+ {'label': 'Bislama', 'value': 'bi'},
+ {'label': 'Tibetan', 'value': 'bo'},
+ {'label': 'Bosnian', 'value': 'bs'},
+ {'label': 'Breton', 'value': 'br'},
+ {'label': 'Bulgarian', 'value': 'bg'},
+ {'label': 'Catalan', 'value': 'ca'},
+ {'label': 'Czech', 'value': 'cs'},
+ {'label': 'Chamorro', 'value': 'ch'},
+ {'label': 'Chechen', 'value': 'ce'},
+ {'label': 'Church Slavic', 'value': 'cu'},
+ {'label': 'Chuvash', 'value': 'cv'},
+ {'label': 'Cornish', 'value': 'kw'},
+ {'label': 'Corsican', 'value': 'co'},
+ {'label': 'Cree', 'value': 'cr'},
+ {'label': 'Welsh', 'value': 'cy'},
+ {'label': 'Danish', 'value': 'da'},
+ {'label': 'German', 'value': 'de'},
+ {'label': 'Dhivehi', 'value': 'dv'},
+ {'label': 'Dzongkha', 'value': 'dz'},
+ {'label': 'Modern Greek (1453-)', 'value': 'el'},
+ {'label': 'English', 'value': 'en'},
+ {'label': 'Esperanto', 'value': 'eo'},
+ {'label': 'Estonian', 'value': 'et'},
+ {'label': 'Basque', 'value': 'eu'},
+ {'label': 'Ewe', 'value': 'ee'},
+ {'label': 'Faroese', 'value': 'fo'},
+ {'label': 'Persian', 'value': 'fa'},
+ {'label': 'Fijian', 'value': 'fj'},
+ {'label': 'Finnish', 'value': 'fi'},
+ {'label': 'French', 'value': 'fr'},
+ {'label': 'Western Frisian', 'value': 'fy'},
+ {'label': 'Fulah', 'value': 'ff'},
+ {'label': 'Scottish Gaelic', 'value': 'gd'},
+ {'label': 'Irish', 'value': 'ga'},
+ {'label': 'Galician', 'value': 'gl'},
+ {'label': 'Manx', 'value': 'gv'},
+ {'label': 'Guarani', 'value': 'gn'},
+ {'label': 'Gujarati', 'value': 'gu'},
+ {'label': 'Haitian', 'value': 'ht'},
+ {'label': 'Hausa', 'value': 'ha'},
+ {'label': 'Serbo-Croatian', 'value': 'sh'},
+ {'label': 'Hebrew', 'value': 'he'},
+ {'label': 'Herero', 'value': 'hz'},
+ {'label': 'Hindi', 'value': 'hi'},
+ {'label': 'Hiri Motu', 'value': 'ho'},
+ {'label': 'Croatian', 'value': 'hr'},
+ {'label': 'Hungarian', 'value': 'hu'},
+ {'label': 'Armenian', 'value': 'hy'},
+ {'label': 'Igbo', 'value': 'ig'},
+ {'label': 'Ido', 'value': 'io'},
+ {'label': 'Sichuan Yi', 'value': 'ii'},
+ {'label': 'Inuktitut', 'value': 'iu'},
+ {'label': 'Interlingue', 'value': 'ie'},
+ {'label': 'Interlingua (International Auxiliary Language Association)',
+ 'value': 'ia'},
+ {'label': 'Indonesian', 'value': 'id'},
+ {'label': 'Inupiaq', 'value': 'ik'},
+ {'label': 'Icelandic', 'value': 'is'},
+ {'label': 'Italian', 'value': 'it'},
+ {'label': 'Javanese', 'value': 'jv'},
+ {'label': 'Japanese', 'value': 'ja'},
+ {'label': 'Kalaallisut', 'value': 'kl'},
+ {'label': 'Kannada', 'value': 'kn'},
+ {'label': 'Kashmiri', 'value': 'ks'},
+ {'label': 'Georgian', 'value': 'ka'},
+ {'label': 'Kanuri', 'value': 'kr'},
+ {'label': 'Kazakh', 'value': 'kk'},
+ {'label': 'Khmer', 'value': 'km'},
+ {'label': 'Kikuyu', 'value': 'ki'},
+ {'label': 'Kinyarwanda', 'value': 'rw'},
+ {'label': 'Kirghiz', 'value': 'ky'},
+ {'label': 'Komi', 'value': 'kv'},
+ {'label': 'Kongo', 'value': 'kg'},
+ {'label': 'Korean', 'value': 'ko'},
+ {'label': 'Kuanyama', 'value': 'kj'},
+ {'label': 'Kurdish', 'value': 'ku'},
+ {'label': 'Lao', 'value': 'lo'},
+ {'label': 'Latin', 'value': 'la'},
+ {'label': 'Latvian', 'value': 'lv'},
+ {'label': 'Limburgan', 'value': 'li'},
+ {'label': 'Lingala', 'value': 'ln'},
+ {'label': 'Lithuanian', 'value': 'lt'},
+ {'label': 'Luxembourgish', 'value': 'lb'},
+ {'label': 'Luba-Katanga', 'value': 'lu'},
+ {'label': 'Ganda', 'value': 'lg'},
+ {'label': 'Marshallese', 'value': 'mh'},
+ {'label': 'Malayalam', 'value': 'ml'},
+ {'label': 'Marathi', 'value': 'mr'},
+ {'label': 'Macedonian', 'value': 'mk'},
+ {'label': 'Malagasy', 'value': 'mg'},
+ {'label': 'Maltese', 'value': 'mt'},
+ {'label': 'Mongolian', 'value': 'mn'},
+ {'label': 'Maori', 'value': 'mi'},
+ {'label': 'Malay (macrolanguage)', 'value': 'ms'},
+ {'label': 'Burmese', 'value': 'my'},
+ {'label': 'Nauru', 'value': 'na'},
+ {'label': 'Navajo', 'value': 'nv'},
+ {'label': 'South Ndebele', 'value': 'nr'},
+ {'label': 'North Ndebele', 'value': 'nd'},
+ {'label': 'Ndonga', 'value': 'ng'},
+ {'label': 'Nepali (macrolanguage)', 'value': 'ne'},
+ {'label': 'Dutch', 'value': 'nl'},
+ {'label': 'Norwegian Nynorsk', 'value': 'nn'},
+ {'label': 'Norwegian Bokmål', 'value': 'nb'},
+ {'label': 'Norwegian', 'value': 'no'},
+ {'label': 'Nyanja', 'value': 'ny'},
+ {'label': 'Occitan (post 1500)', 'value': 'oc'},
+ {'label': 'Ojibwa', 'value': 'oj'},
+ {'label': 'Oriya (macrolanguage)', 'value': 'or'},
+ {'label': 'Oromo', 'value': 'om'},
+ {'label': 'Ossetian', 'value': 'os'},
+ {'label': 'Panjabi', 'value': 'pa'},
+ {'label': 'Pali', 'value': 'pi'},
+ {'label': 'Polish', 'value': 'pl'},
+ {'label': 'Portuguese', 'value': 'pt'},
+ {'label': 'Pushto', 'value': 'ps'},
+ {'label': 'Quechua', 'value': 'qu'},
+ {'label': 'Romansh', 'value': 'rm'},
+ {'label': 'Romanian', 'value': 'ro'},
+ {'label': 'Rundi', 'value': 'rn'},
+ {'label': 'Russian', 'value': 'ru'},
+ {'label': 'Sango', 'value': 'sg'},
+ {'label': 'Sanskrit', 'value': 'sa'},
+ {'label': 'Sinhala', 'value': 'si'},
+ {'label': 'Slovak', 'value': 'sk'},
+ {'label': 'Slovenian', 'value': 'sl'},
+ {'label': 'Northern Sami', 'value': 'se'},
+ {'label': 'Samoan', 'value': 'sm'},
+ {'label': 'Shona', 'value': 'sn'},
+ {'label': 'Sindhi', 'value': 'sd'},
+ {'label': 'Somali', 'value': 'so'},
+ {'label': 'Southern Sotho', 'value': 'st'},
+ {'label': 'Spanish', 'value': 'es'},
+ {'label': 'Albanian', 'value': 'sq'},
+ {'label': 'Sardinian', 'value': 'sc'},
+ {'label': 'Serbian', 'value': 'sr'},
+ {'label': 'Swati', 'value': 'ss'},
+ {'label': 'Sundanese', 'value': 'su'},
+ {'label': 'Swahili (macrolanguage)', 'value': 'sw'},
+ {'label': 'Swedish', 'value': 'sv'},
+ {'label': 'Tahitian', 'value': 'ty'},
+ {'label': 'Tamil', 'value': 'ta'},
+ {'label': 'Tatar', 'value': 'tt'},
+ {'label': 'Telugu', 'value': 'te'},
+ {'label': 'Tajik', 'value': 'tg'},
+ {'label': 'Tagalog', 'value': 'tl'},
+ {'label': 'Thai', 'value': 'th'},
+ {'label': 'Tigrinya', 'value': 'ti'},
+ {'label': 'Tonga (Tonga Islands)', 'value': 'to'},
+ {'label': 'Tswana', 'value': 'tn'},
+ {'label': 'Tsonga', 'value': 'ts'},
+ {'label': 'Turkmen', 'value': 'tk'},
+ {'label': 'Turkish', 'value': 'tr'},
+ {'label': 'Twi', 'value': 'tw'},
+ {'label': 'Uighur', 'value': 'ug'},
+ {'label': 'Ukrainian', 'value': 'uk'},
+ {'label': 'Urdu', 'value': 'ur'},
+ {'label': 'Uzbek', 'value': 'uz'},
+ {'label': 'Venda', 'value': 've'},
+ {'label': 'Vietnamese', 'value': 'vi'},
+ {'label': 'Volapük', 'value': 'vo'},
+ {'label': 'Walloon', 'value': 'wa'},
+ {'label': 'Wolof', 'value': 'wo'},
+ {'label': 'Xhosa', 'value': 'xh'},
+ {'label': 'Yiddish', 'value': 'yi'},
+ {'label': 'Yoruba', 'value': 'yo'},
+ {'label': 'Zhuang', 'value': 'za'},
+ {'label': 'Chinese', 'value': 'zh'},
+ {'label': 'Zulu', 'value': 'zu'},
+ {'label': 'Ancient Greek', 'value': 'grc'},
+ {'label': 'Multilingual', 'value': 'multilingual'}
+ ]
\ No newline at end of file
diff --git a/python-lib/macro/macro_utils.py b/python-lib/macro/macro_utils.py
new file mode 100644
index 0000000..36a5c5b
--- /dev/null
+++ b/python-lib/macro/macro_utils.py
@@ -0,0 +1,55 @@
+from macro.model_configurations import MODEL_CONFIFURATIONS
+from macro.language_dict import SUPPORTED_LANGUAGES
+
+
+
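+#Map the macro form fields to the inputs the downloaders expect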
+def read_model_inputs(config):
+ macro_inputs = {}
+ language_label = config.get("language",None)
+ macro_inputs["language"] = lang_label_to_iso(language_label)
+
+
+ model_name = config.get("modelName",None)
+    model_id = [x["id"] for x in MODEL_CONFIFURATIONS.values() if x["family"] == model_name][0]
+ macro_inputs["embedding_model"] = model_id
+ macro_inputs["embedding_family"] = model_name
+
+ macro_inputs["output_folder_name"] = config.get("outputFolder",None)
+ macro_inputs["transformer_shortcut_name"] = config.get("transformersModelVersion",None)
+
+ return macro_inputs
+
+def is_folder_exist(project,output_folder_name):
+    managed_folders_list = [x["name"] for x in project.list_managed_folders()]
+    return output_folder_name in managed_folders_list
+
+def manage_model_folder(output_folder_name,project_key,client):
+    project = client.get_project(project_key)
+
+    #Create the managed folder if needed, otherwise fetch the existing one
+    if not is_folder_exist(project,output_folder_name):
+        output_folder = project.create_managed_folder(output_folder_name)
+    else:
+        folder_id = [x["id"] for x in project.list_managed_folders() if x["name"] == output_folder_name][0]
+        output_folder = project.get_managed_folder(folder_id)
+
+    return output_folder
+
+
+def lang_iso_to_label(languages_iso):
+ languages_labels = []
+ for language in languages_iso:
+ search = [x for x in SUPPORTED_LANGUAGES if x["value"] == language]
+ if search:
+ languages_labels.append(search[0]["label"])
+ else:
+ languages_labels.append(language)
+ return languages_labels
+
+def lang_label_to_iso(language_label):
+ search = [x for x in SUPPORTED_LANGUAGES if x["label"] == language_label]
+ if search:
+ return search[0]["value"]
+ else:
+ return language_label
\ No newline at end of file
diff --git a/python-lib/macro/model_configurations.py b/python-lib/macro/model_configurations.py
new file mode 100644
index 0000000..057d5a8
--- /dev/null
+++ b/python-lib/macro/model_configurations.py
@@ -0,0 +1,1080 @@
+MODEL_CONFIFURATIONS = {
+ "word2vec": {
+ "id": "word2vec",
+ "family": "Word2Vec",
+ "language_list": ['en','grc', 'ar', 'eu', 'bg', 'ca', 'zh', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'gl', 'de', 'el', 'he', 'hi', 'hu', 'id', 'ga', 'it', 'ja', 'kk', 'ko', 'la', 'lv', 'nb', 'nn', 'cu', 'fa', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'es', 'sv', 'tr', 'uk', 'ur', 'ug', 'vi'],
+ "download_info": {
+ "en": {
+ "model_link": "https://docs.google.com/uc?export=download",
+ "id_gdrive": "0B7XkCwpI5KDYNlNUTTlSS21pQmM"
+ },
+ 'grc': {'model_id': 30},
+ 'ar': {'model_id': 31},
+ 'eu': {'model_id': 32},
+ 'bg': {'model_id': 33},
+ 'ca': {'model_id': 34},
+ 'zh': {'model_id': 35},
+ 'hr': {'model_id': 36},
+ 'cs': {'model_id': 37},
+ 'da': {'model_id': 38},
+ 'nl': {'model_id': 39},
+ 'et': {'model_id': 41},
+ 'fi': {'model_id': 42},
+ 'fr': {'model_id': 43},
+ 'gl': {'model_id': 44},
+ 'de': {'model_id': 45},
+ 'el': {'model_id': 46},
+ 'he': {'model_id': 47},
+ 'hi': {'model_id': 48},
+ 'hu': {'model_id': 49},
+ 'id': {'model_id': 50},
+ 'ga': {'model_id': 51},
+ 'it': {'model_id': 52},
+ 'ja': {'model_id': 53},
+ 'kk': {'model_id': 54},
+ 'ko': {'model_id': 55},
+ 'la': {'model_id': 56},
+ 'lv': {'model_id': 57},
+ 'nb': {'model_id': 58},
+ 'nn': {'model_id': 59},
+ 'cu': {'model_id': 60},
+ 'fa': {'model_id': 61},
+ 'pl': {'model_id': 62},
+ 'pt': {'model_id': 63},
+ 'ro': {'model_id': 64},
+ 'ru': {'model_id': 65},
+ 'sk': {'model_id': 66},
+ 'sl': {'model_id': 67},
+ 'es': {'model_id': 68},
+ 'sv': {'model_id': 69},
+ 'tr': {'model_id': 70},
+ 'uk': {'model_id': 71},
+ 'ur': {'model_id': 72},
+ 'ug': {'model_id': 73},
+ 'vi': {'model_id': 74}
+ }
+ },
+
+ "fasttext": {
+ "id": "fasttext",
+ "family": "FastText",
+ "language_list": ['af', 'sq', 'am', 'ar', 'an', 'hy', 'as', 'az', 'ba', 'eu', 'be', 'bn', 'bs', 'br', 'bg', 'my', 'ca', 'ce', 'zh', 'cv', 'co', 'hr', 'cs', 'da', 'dv', 'nl', 'pa', 'en', 'eo', 'et', 'fi', 'fr', 'gl', 'ka', 'de', 'el', 'gu', 'ht', 'he', 'hi', 'hu', 'is', 'io', 'id', 'ia', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ky', 'ko', 'ku', 'la', 'lv', 'li', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'gv', 'mr', 'mn', 'ne', 'no', 'nn', 'oc', 'or', 'os', 'ps', 'fa', 'pl', 'pt', 'qu', 'ro', 'rm', 'ru', 'sa', 'sc', 'gd', 'sr', 'sh', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'su', 'sw', 'sv', 'tl', 'tg', 'ta', 'tt', 'te', 'th', 'bo', 'tr', 'tk', 'uk', 'ur', 'ug', 'uz', 'vi', 'vo', 'wa', 'cy', 'fy', 'yi', 'yo'],
+ "download_info": {
+ 'af': 'af',
+ 'sq': 'sq',
+ 'am': 'am',
+ 'ar': 'ar',
+ 'an': 'an',
+ 'hy': 'hy',
+ 'as': 'as',
+ 'az': 'az',
+ 'ba': 'ba',
+ 'eu': 'eu',
+ 'be': 'be',
+ 'bn': 'bn',
+ 'bs': 'bs',
+ 'br': 'br',
+ 'bg': 'bg',
+ 'my': 'my',
+ 'ca': 'ca',
+ 'ce': 'ce',
+ 'zh': 'zh',
+ 'cv': 'cv',
+ 'co': 'co',
+ 'hr': 'hr',
+ 'cs': 'cs',
+ 'da': 'da',
+ 'dv': 'dv',
+ 'nl': 'nl',
+ 'pa': 'pa',
+ 'en': 'en',
+ 'eo': 'eo',
+ 'et': 'et',
+ 'fi': 'fi',
+ 'fr': 'fr',
+ 'gl': 'gl',
+ 'ka': 'ka',
+ 'de': 'de',
+ 'el': 'el',
+ 'gu': 'gu',
+ 'ht': 'ht',
+ 'he': 'he',
+ 'hi': 'hi',
+ 'hu': 'hu',
+ 'is': 'is',
+ 'io': 'io',
+ 'id': 'id',
+ 'ia': 'ia',
+ 'ga': 'ga',
+ 'it': 'it',
+ 'ja': 'ja',
+ 'jv': 'jv',
+ 'kn': 'kn',
+ 'kk': 'kk',
+ 'km': 'km',
+ 'ky': 'ky',
+ 'ko': 'ko',
+ 'ku': 'ku',
+ 'la': 'la',
+ 'lv': 'lv',
+ 'li': 'li',
+ 'lt': 'lt',
+ 'lb': 'lb',
+ 'mk': 'mk',
+ 'mg': 'mg',
+ 'ms': 'ms',
+ 'ml': 'ml',
+ 'mt': 'mt',
+ 'gv': 'gv',
+ 'mr': 'mr',
+ 'mn': 'mn',
+ 'ne': 'ne',
+ 'no': 'no',
+ 'nn': 'nn',
+ 'oc': 'oc',
+ 'or': 'or',
+ 'os': 'os',
+ 'ps': 'ps',
+ 'fa': 'fa',
+ 'pl': 'pl',
+ 'pt': 'pt',
+ 'qu': 'qu',
+ 'ro': 'ro',
+ 'rm': 'rm',
+ 'ru': 'ru',
+ 'sa': 'sa',
+ 'sc': 'sc',
+ 'gd': 'gd',
+ 'sr': 'sr',
+ 'sh': 'sh',
+ 'sd': 'sd',
+ 'si': 'si',
+ 'sk': 'sk',
+ 'sl': 'sl',
+ 'so': 'so',
+ 'es': 'es',
+ 'su': 'su',
+ 'sw': 'sw',
+ 'sv': 'sv',
+ 'tl': 'tl',
+ 'tg': 'tg',
+ 'ta': 'ta',
+ 'tt': 'tt',
+ 'te': 'te',
+ 'th': 'th',
+ 'bo': 'bo',
+ 'tr': 'tr',
+ 'tk': 'tk',
+ 'uk': 'uk',
+ 'ur': 'ur',
+ 'ug': 'ug',
+ 'uz': 'uz',
+ 'vi': 'vi',
+ 'vo': 'vo',
+ 'wa': 'wa',
+ 'cy': 'cy',
+ 'fy': 'fy',
+ 'yi': 'yi',
+ 'yo': 'yo'
+ }
+ },
+
+ "glove": {
+ "id": "glove",
+ "family": "Glove",
+ "language_list": ["en"],
+ "download_info": {
+ "en": "http://nlp.stanford.edu/data/glove.42B.300d.zip"
+ }
+ },
+
+ "elmo": {
+ "id": "elmo",
+ "family": "ELMo",
+ "language_list": ["en"],
+ "download_info": {
+ "en": "https://tfhub.dev/google/elmo/3?tf-hub-format=compressed"
+ }
+ },
+
+ "use": {
+ "id": "use",
+ "family": "USE",
+ "language_list": ["en", "multilingual"],
+ "download_info": {
+ "en": "https://tfhub.dev/google/universal-sentence-encoder/4?tf-hub-format=compressed",
+ "multilingual": "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed"
+ }
+ },
+
+ 'TurkuNLP/bert-base-finnish-cased-v1': { 'description': '12-layer, '
+ '768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on cased '
+ 'Finnish text.\n'
+ '\n'
+ '(see details on '
+ 'turkunlp.org).',
+ 'download_info': {'fi': 'fi'},
+ 'family': 'BERT',
+ 'id': 'TurkuNLP/bert-base-finnish-cased-v1',
+ 'language_list': ['fi']},
+ 'TurkuNLP/bert-base-finnish-uncased-v1': { 'description': '12-layer, '
+ '768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on '
+ 'uncased Finnish '
+ 'text.\n'
+ '\n'
+ '(see details on '
+ 'turkunlp.org).',
+ 'download_info': {'fi': 'fi'},
+ 'family': 'BERT',
+ 'id': 'TurkuNLP/bert-base-finnish-uncased-v1',
+ 'language_list': ['fi']},
+ 'albert-base-v1': { 'description': '12 repeating layers, 128 embedding, '
+ '768-hidden, 12-heads, 11M parameters\n'
+ 'ALBERT base model\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-base-v1',
+ 'language_list': ['en']},
+ 'albert-base-v2': { 'description': '12 repeating layers, 128 embedding, '
+ '768-hidden, 12-heads, 11M parameters\n'
+ 'ALBERT base model with no dropout, '
+ 'additional training data and longer '
+ 'training\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-base-v2',
+ 'language_list': ['en']},
+ 'albert-large-v1': { 'description': '24 repeating layers, 128 embedding, '
+ '1024-hidden, 16-heads, 17M '
+ 'parameters\n'
+ 'ALBERT large model\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-large-v1',
+ 'language_list': ['en']},
+ 'albert-large-v2': { 'description': '24 repeating layers, 128 embedding, '
+ '1024-hidden, 16-heads, 17M '
+ 'parameters\n'
+ 'ALBERT large model with no dropout, '
+ 'additional training data and longer '
+ 'training\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-large-v2',
+ 'language_list': ['en']},
+ 'albert-xlarge-v1': { 'description': '24 repeating layers, 128 embedding, '
+ '2048-hidden, 16-heads, 58M '
+ 'parameters\n'
+ 'ALBERT xlarge model\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-xlarge-v1',
+ 'language_list': ['en']},
+ 'albert-xlarge-v2': { 'description': '24 repeating layers, 128 embedding, '
+ '2048-hidden, 16-heads, 58M '
+ 'parameters\n'
+ 'ALBERT xlarge model with no dropout, '
+ 'additional training data and longer '
+ 'training\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-xlarge-v2',
+ 'language_list': ['en']},
+ 'albert-xxlarge-v1': { 'description': '12 repeating layer, 128 embedding, '
+ '4096-hidden, 64-heads, 223M '
+ 'parameters\n'
+ 'ALBERT xxlarge model\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-xxlarge-v1',
+ 'language_list': ['en']},
+ 'albert-xxlarge-v2': { 'description': '12 repeating layer, 128 embedding, '
+ '4096-hidden, 64-heads, 223M '
+ 'parameters\n'
+ 'ALBERT xxlarge model with no '
+ 'dropout, additional training data '
+ 'and longer training\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'ALBERT',
+ 'id': 'albert-xxlarge-v2',
+ 'language_list': ['en']},
+ 'allenai/longformer-base-4096': { 'description': '12-layer, 768-hidden, '
+ '12-heads, ~149M '
+ 'parameters\n'
+ 'Starting from '
+ 'RoBERTa-base checkpoint, '
+ 'trained on documents of '
+ 'max length 4,096',
+ 'download_info': {'en': 'en'},
+ 'family': 'Longformer',
+ 'id': 'allenai/longformer-base-4096',
+ 'language_list': ['en']},
+ 'allenai/longformer-large-4096': { 'description': '24-layer, 1024-hidden, '
+ '16-heads, ~435M '
+ 'parameters\n'
+ 'Starting from '
+ 'RoBERTa-large '
+ 'checkpoint, trained on '
+ 'documents of max length '
+ '4,096',
+ 'download_info': {'en': 'en'},
+ 'family': 'Longformer',
+ 'id': 'allenai/longformer-large-4096',
+ 'language_list': ['en']},
+ 'bert-base-cased': { 'description': '12-layer, 768-hidden, 12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on cased English text.',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-base-cased',
+ 'language_list': ['en']},
+ 'bert-base-cased-finetuned-mrpc': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'The bert-base-cased '
+ 'model fine-tuned on '
+ 'MRPC\n'
+ '\n'
+ '(see details of '
+ 'fine-tuning in the '
+ 'example section)',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-base-cased-finetuned-mrpc',
+ 'language_list': ['en']},
+ 'bert-base-chinese': { 'description': '12-layer, 768-hidden, 12-heads, '
+ '110M parameters.\n'
+ 'Trained on cased Chinese Simplified '
+ 'and Traditional text.',
+ 'download_info': {'zh': 'zh'},
+ 'family': 'BERT',
+ 'id': 'bert-base-chinese',
+ 'language_list': ['zh']},
+ 'bert-base-german-cased': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 110M parameters.\n'
+ 'Trained on cased German text '
+ 'by Deepset.ai\n'
+ '\n'
+ '(see details on deepset.ai '
+ 'website).',
+ 'download_info': {'de': 'de'},
+ 'family': 'BERT',
+ 'id': 'bert-base-german-cased',
+ 'language_list': ['de']},
+ 'bert-base-german-dbmdz-cased': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on cased German '
+ 'text by DBMDZ\n'
+ '\n'
+ '(see details on dbmdz '
+ 'repository).',
+ 'download_info': {'de': 'de'},
+ 'family': 'BERT',
+ 'id': 'bert-base-german-dbmdz-cased',
+ 'language_list': ['de']},
+ 'bert-base-german-dbmdz-uncased': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on uncased '
+ 'German text by DBMDZ\n'
+ '\n'
+ '(see details on dbmdz '
+ 'repository).',
+ 'download_info': {'de': 'de'},
+ 'family': 'BERT',
+ 'id': 'bert-base-german-dbmdz-uncased',
+ 'language_list': ['de']},
+ 'bert-base-multilingual-cased': { 'description': '(New, recommended) '
+ '12-layer, 768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on cased text in '
+ 'the top 104 languages '
+ 'with the largest '
+ 'Wikipedias\n'
+ '\n'
+ '.',
+ 'download_info': { 'multilingual': 'multilingual'},
+ 'family': 'BERT',
+ 'id': 'bert-base-multilingual-cased',
+ 'language_list': ['multilingual']},
+ 'bert-base-multilingual-uncased': { 'description': '(Original, not '
+ 'recommended) 12-layer, '
+ '768-hidden, 12-heads, '
+ '110M parameters.\n'
+ 'Trained on lower-cased '
+ 'text in the top 102 '
+ 'languages with the '
+ 'largest Wikipedias\n'
+ '\n'
+ '.',
+ 'download_info': { 'multilingual': 'multilingual'},
+ 'family': 'BERT',
+ 'id': 'bert-base-multilingual-uncased',
+ 'language_list': ['multilingual']},
+ 'bert-base-uncased': { 'description': '12-layer, 768-hidden, 12-heads, '
+ '110M parameters.\n'
+ 'Trained on lower-cased English '
+ 'text.',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-base-uncased',
+ 'language_list': ['en']},
+ 'bert-large-cased': { 'description': '24-layer, 1024-hidden, 16-heads, '
+ '340M parameters.\n'
+ 'Trained on cased English text.',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-large-cased',
+ 'language_list': ['en']},
+ 'bert-large-cased-whole-word-masking': { 'description': '24-layer, '
+ '1024-hidden, '
+ '16-heads, 340M '
+ 'parameters.\n'
+ 'Trained on cased '
+ 'English text '
+ 'using '
+ 'Whole-Word-Masking\n'
+ '\n'
+ '.',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-large-cased-whole-word-masking',
+ 'language_list': ['en']},
+ 'bert-large-cased-whole-word-masking-finetuned-squad': { 'description': '24-layer, '
+ '1024-hidden, '
+ '16-heads, '
+ '340M '
+ 'parameters\n'
+ 'The '
+ 'bert-large-cased-whole-word-masking '
+ 'model '
+ 'fine-tuned '
+ 'on '
+ 'SQuAD\n'
+ '\n'
+ '(see '
+ 'details '
+ 'of '
+ 'fine-tuning '
+ 'in '
+ 'the '
+ 'example '
+ 'section)',
+ 'download_info': { 'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-large-cased-whole-word-masking-finetuned-squad',
+ 'language_list': [ 'en']},
+ 'bert-large-uncased': { 'description': '24-layer, 1024-hidden, 16-heads, '
+ '340M parameters.\n'
+ 'Trained on lower-cased English '
+ 'text.',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-large-uncased',
+ 'language_list': ['en']},
+ 'bert-large-uncased-whole-word-masking': { 'description': '24-layer, '
+ '1024-hidden, '
+ '16-heads, 340M '
+ 'parameters.\n'
+ 'Trained on '
+ 'lower-cased '
+ 'English text '
+ 'using '
+ 'Whole-Word-Masking\n'
+ '\n'
+ '.',
+ 'download_info': {'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-large-uncased-whole-word-masking',
+ 'language_list': ['en']},
+ 'bert-large-uncased-whole-word-masking-finetuned-squad': { 'description': '24-layer, '
+ '1024-hidden, '
+ '16-heads, '
+ '340M '
+ 'parameters.\n'
+ 'The '
+ 'bert-large-uncased-whole-word-masking '
+ 'model '
+ 'fine-tuned '
+ 'on '
+ 'SQuAD\n'
+ '\n'
+ '(see '
+ 'details '
+ 'of '
+ 'fine-tuning '
+ 'in '
+ 'the '
+ 'example '
+ 'section).',
+ 'download_info': { 'en': 'en'},
+ 'family': 'BERT',
+ 'id': 'bert-large-uncased-whole-word-masking-finetuned-squad',
+ 'language_list': [ 'en']},
+ 'camembert-base': { 'description': '12-layer, 768-hidden, 12-heads, 110M '
+ 'parameters\n'
+ 'CamemBERT using the BERT-base '
+ 'architecture\n'
+ '\n',
+ 'download_info': {'fr': 'fr'},
+ 'family': 'CamemBERT',
+ 'id': 'camembert-base',
+ 'language_list': ['fr']},
+ 'cl-tohoku/bert-base-japanese': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on Japanese '
+ 'text. Text is tokenized '
+ 'with MeCab and '
+ 'WordPiece.\n'
+ 'MeCab is required for '
+ 'tokenization.\n'
+ '\n'
+ '(see details on '
+ 'cl-tohoku repository).',
+ 'download_info': {'ja': 'ja'},
+ 'family': 'BERT',
+ 'id': 'cl-tohoku/bert-base-japanese',
+ 'language_list': ['ja']},
+ 'cl-tohoku/bert-base-japanese-char': { 'description': '12-layer, '
+ '768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on Japanese '
+ 'text. Text is '
+ 'tokenized into '
+ 'characters.\n'
+ '\n'
+ '(see details on '
+ 'cl-tohoku '
+ 'repository).',
+ 'download_info': {'ja': 'ja'},
+ 'family': 'BERT',
+ 'id': 'cl-tohoku/bert-base-japanese-char',
+ 'language_list': ['ja']},
+ 'cl-tohoku/bert-base-japanese-char-whole-word-masking': { 'description': '12-layer, '
+ '768-hidden, '
+ '12-heads, '
+ '110M '
+ 'parameters.\n'
+ 'Trained '
+ 'on '
+ 'Japanese '
+ 'text '
+ 'using '
+ 'Whole-Word-Masking. '
+ 'Text '
+ 'is '
+ 'tokenized '
+ 'into '
+ 'characters.\n'
+ '\n'
+ '(see '
+ 'details '
+ 'on '
+ 'cl-tohoku '
+ 'repository).',
+ 'download_info': { 'ja': 'ja'},
+ 'family': 'BERT',
+ 'id': 'cl-tohoku/bert-base-japanese-char-whole-word-masking',
+ 'language_list': [ 'ja']},
+ 'cl-tohoku/bert-base-japanese-whole-word-masking': { 'description': '12-layer, '
+ '768-hidden, '
+ '12-heads, '
+ '110M '
+ 'parameters.\n'
+ 'Trained '
+ 'on '
+ 'Japanese '
+ 'text '
+ 'using '
+ 'Whole-Word-Masking. '
+ 'Text '
+ 'is '
+ 'tokenized '
+ 'with '
+ 'MeCab '
+ 'and '
+ 'WordPiece.\n'
+ 'MeCab '
+ 'is '
+ 'required '
+ 'for '
+ 'tokenization.\n'
+ '\n'
+ '(see '
+ 'details '
+ 'on '
+ 'cl-tohoku '
+ 'repository).',
+ 'download_info': { 'ja': 'ja'},
+ 'family': 'BERT',
+ 'id': 'cl-tohoku/bert-base-japanese-whole-word-masking',
+ 'language_list': [ 'ja']},
+ 'ctrl': { 'description': '48-layer, 1280-hidden, 16-heads, 1.6B '
+ 'parameters\n'
+          'Salesforce’s Large-sized CTRL English '
+ 'model',
+ 'download_info': {'en': 'en'},
+ 'family': 'CTRL',
+ 'id': 'ctrl',
+ 'language_list': ['en']},
+ 'distilbert-base-cased': { 'description': '6-layer, 768-hidden, 12-heads, '
+ '65M parameters\n'
+ 'The DistilBERT model distilled '
+ 'from the BERT model '
+ 'bert-base-cased checkpoint\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'DistilBERT',
+ 'id': 'distilbert-base-cased',
+ 'language_list': ['en']},
+ 'distilbert-base-cased-distilled-squad': { 'description': '6-layer, '
+ '768-hidden, '
+ '12-heads, 65M '
+ 'parameters\n'
+ 'The DistilBERT '
+ 'model distilled '
+ 'from the BERT '
+ 'model '
+ 'bert-base-cased '
+ 'checkpoint, '
+ 'with an '
+ 'additional '
+ 'question '
+ 'answering '
+ 'layer.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'DistilBERT',
+ 'id': 'distilbert-base-cased-distilled-squad',
+ 'language_list': ['en']},
+ 'distilbert-base-german-cased': { 'description': '6-layer, 768-hidden, '
+ '12-heads, 66M '
+ 'parameters\n'
+ 'The German DistilBERT '
+ 'model distilled from the '
+ 'German DBMDZ BERT model '
+ 'bert-base-german-dbmdz-cased '
+ 'checkpoint.\n'
+ '\n',
+ 'download_info': {'de': 'de'},
+ 'family': 'DistilBERT',
+ 'id': 'distilbert-base-german-cased',
+ 'language_list': ['de']},
+ 'distilbert-base-multilingual-cased': { 'description': '6-layer, '
+ '768-hidden, '
+ '12-heads, 134M '
+ 'parameters\n'
+ 'The multilingual '
+ 'DistilBERT model '
+ 'distilled from the '
+ 'Multilingual BERT '
+ 'model '
+ 'bert-base-multilingual-cased '
+ 'checkpoint.\n'
+ '\n',
+ 'download_info': { 'multilingual': 'multilingual'},
+ 'family': 'DistilBERT',
+ 'id': 'distilbert-base-multilingual-cased',
+ 'language_list': ['multilingual']},
+ 'distilbert-base-uncased': { 'description': '6-layer, 768-hidden, '
+ '12-heads, 66M parameters\n'
+ 'The DistilBERT model '
+ 'distilled from the BERT model '
+ 'bert-base-uncased checkpoint\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'DistilBERT',
+ 'id': 'distilbert-base-uncased',
+ 'language_list': ['en']},
+ 'distilbert-base-uncased-distilled-squad': { 'description': '6-layer, '
+ '768-hidden, '
+ '12-heads, 66M '
+ 'parameters\n'
+ 'The '
+ 'DistilBERT '
+ 'model '
+ 'distilled '
+ 'from the BERT '
+ 'model '
+ 'bert-base-uncased '
+ 'checkpoint, '
+ 'with an '
+ 'additional '
+ 'linear '
+ 'layer.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'DistilBERT',
+ 'id': 'distilbert-base-uncased-distilled-squad',
+ 'language_list': ['en']},
+ 'distilgpt2': { 'description': '6-layer, 768-hidden, 12-heads, 82M '
+ 'parameters\n'
+ 'The DistilGPT2 model distilled from the '
+ 'GPT2 model gpt2 checkpoint.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'DistilBERT',
+ 'id': 'distilgpt2',
+ 'language_list': ['en']},
+ 'distilroberta-base': { 'description': '6-layer, 768-hidden, 12-heads, 82M '
+ 'parameters\n'
+ 'The DistilRoBERTa model distilled '
+ 'from the RoBERTa model '
+ 'roberta-base checkpoint.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'RoBERTa',
+ 'id': 'distilroberta-base',
+ 'language_list': ['en']},
+ 'facebook/bart-large': { 'description': '24-layer, 1024-hidden, 16-heads, '
+ '406M parameters\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'Bart',
+ 'id': 'facebook/bart-large',
+ 'language_list': ['en']},
+ 'facebook/bart-large-cnn': { 'description': '12-layer, 1024-hidden, '
+ '16-heads, 406M '
+ 'parameters (same as '
+ 'base)\n'
+ 'bart-large base architecture '
+ 'finetuned on cnn '
+ 'summarization task',
+ 'download_info': {'en': 'en'},
+ 'family': 'Bart',
+ 'id': 'facebook/bart-large-cnn',
+ 'language_list': ['en']},
+ 'facebook/bart-large-mnli': { 'description': 'Adds a 2 layer '
+ 'classification head with 1 '
+ 'million parameters\n'
+ 'bart-large base architecture '
+ 'with a classification head, '
+ 'finetuned on MNLI',
+ 'download_info': {'en': 'en'},
+ 'family': 'Bart',
+ 'id': 'facebook/bart-large-mnli',
+ 'language_list': ['en']},
+ 'facebook/mbart-large-en-ro': { 'description': '12-layer, 1024-hidden, '
+ '16-heads, 880M parameters\n'
+ 'bart-large architecture '
+ 'pretrained on cc25 '
+ 'multilingual data , '
+ 'finetuned on WMT english '
+ 'romanian translation.',
+ 'download_info': { 'multilingual': 'multilingual'},
+ 'family': 'Bart',
+ 'id': 'facebook/mbart-large-en-ro',
+ 'language_list': ['multilingual']},
+ 'flaubert/flaubert_base_cased': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 138M '
+ 'parameters\n'
+ 'FlauBERT base '
+ 'architecture with cased '
+ 'vocabulary\n'
+ '\n',
+ 'download_info': {'fr': 'fr'},
+ 'family': 'FlauBERT',
+ 'id': 'flaubert/flaubert_base_cased',
+ 'language_list': ['fr']},
+ 'flaubert/flaubert_base_uncased': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 137M '
+ 'parameters\n'
+ 'FlauBERT base '
+ 'architecture with '
+ 'uncased vocabulary\n'
+ '\n',
+ 'download_info': {'fr': 'fr'},
+ 'family': 'FlauBERT',
+ 'id': 'flaubert/flaubert_base_uncased',
+ 'language_list': ['fr']},
+ 'flaubert/flaubert_large_cased': { 'description': '24-layer, 1024-hidden, '
+ '16-heads, 373M '
+ 'parameters\n'
+ 'FlauBERT large '
+ 'architecture\n'
+ '\n',
+ 'download_info': {'fr': 'fr'},
+ 'family': 'FlauBERT',
+ 'id': 'flaubert/flaubert_large_cased',
+ 'language_list': ['fr']},
+ 'flaubert/flaubert_small_cased': { 'description': '6-layer, 512-hidden, '
+ '8-heads, 54M '
+ 'parameters\n'
+ 'FlauBERT small '
+ 'architecture\n'
+ '\n',
+ 'download_info': {'fr': 'fr'},
+ 'family': 'FlauBERT',
+ 'id': 'flaubert/flaubert_small_cased',
+ 'language_list': ['fr']},
+ 'gpt2': { 'description': '12-layer, 768-hidden, 12-heads, 117M '
+ 'parameters.\n'
+ 'OpenAI GPT-2 English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'GPT-2',
+ 'id': 'gpt2',
+ 'language_list': ['en']},
+ 'gpt2-large': { 'description': '36-layer, 1280-hidden, 20-heads, 774M '
+ 'parameters.\n'
+                  'OpenAI’s Large-sized GPT-2 English '
+ 'model',
+ 'download_info': {'en': 'en'},
+ 'family': 'GPT-2',
+ 'id': 'gpt2-large',
+ 'language_list': ['en']},
+ 'gpt2-medium': { 'description': '24-layer, 1024-hidden, 16-heads, 345M '
+ 'parameters.\n'
+                   'OpenAI’s Medium-sized GPT-2 '
+ 'English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'GPT-2',
+ 'id': 'gpt2-medium',
+ 'language_list': ['en']},
+ 'gpt2-xl': { 'description': '48-layer, 1600-hidden, 25-heads, 1558M '
+ 'parameters.\n'
+                             'OpenAI’s XL-sized GPT-2 English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'GPT-2',
+ 'id': 'gpt2-xl',
+ 'language_list': ['en']},
+ 'openai-gpt': { 'description': '12-layer, 768-hidden, 12-heads, 110M '
+ 'parameters.\n'
+ 'OpenAI GPT English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'GPT',
+ 'id': 'openai-gpt',
+ 'language_list': ['en']},
+ 'roberta-base': { 'description': '12-layer, 768-hidden, 12-heads, 125M '
+ 'parameters\n'
+ 'RoBERTa using the BERT-base '
+ 'architecture\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'RoBERTa',
+ 'id': 'roberta-base',
+ 'language_list': ['en']},
+ 'roberta-base-openai-detector': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 125M '
+ 'parameters\n'
+ 'roberta-base fine-tuned '
+ 'by OpenAI on the outputs '
+ 'of the 1.5B-parameter '
+ 'GPT-2 model.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'RoBERTa',
+ 'id': 'roberta-base-openai-detector',
+ 'language_list': ['en']},
+ 'roberta-large': { 'description': '24-layer, 1024-hidden, 16-heads, 355M '
+ 'parameters\n'
+ 'RoBERTa using the BERT-large '
+ 'architecture\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'RoBERTa',
+ 'id': 'roberta-large',
+ 'language_list': ['en']},
+ 'roberta-large-mnli': { 'description': '24-layer, 1024-hidden, 16-heads, '
+ '355M parameters\n'
+ 'roberta-large fine-tuned on MNLI.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'RoBERTa',
+ 'id': 'roberta-large-mnli',
+ 'language_list': ['en']},
+ 'roberta-large-openai-detector': { 'description': '24-layer, 1024-hidden, '
+ '16-heads, 355M '
+ 'parameters\n'
+ 'roberta-large '
+ 'fine-tuned by OpenAI on '
+ 'the outputs of the '
+ '1.5B-parameter GPT-2 '
+ 'model.\n'
+ '\n',
+ 'download_info': {'en': 'en'},
+ 'family': 'RoBERTa',
+ 'id': 'roberta-large-openai-detector',
+ 'language_list': ['en']},
+ 't5-base': { 'description': '~220M parameters with 12-layers, '
+ '768-hidden-state, 3072 feed-forward '
+ 'hidden-state, 12-heads,\n'
+ 'Trained on English text: the Colossal Clean '
+ 'Crawled Corpus (C4)',
+ 'download_info': {'en': 'en'},
+ 'family': 'T5',
+ 'id': 't5-base',
+ 'language_list': ['en']},
+ 't5-large': { 'description': '~770M parameters with 24-layers, '
+ '1024-hidden-state, 4096 feed-forward '
+ 'hidden-state, 16-heads,\n'
+ 'Trained on English text: the Colossal Clean '
+ 'Crawled Corpus (C4)',
+ 'download_info': {'en': 'en'},
+ 'family': 'T5',
+ 'id': 't5-large',
+ 'language_list': ['en']},
+ 't5-small': { 'description': '~60M parameters with 6-layers, '
+ '512-hidden-state, 2048 feed-forward '
+ 'hidden-state, 8-heads,\n'
+ 'Trained on English text: the Colossal Clean '
+ 'Crawled Corpus (C4)',
+ 'download_info': {'en': 'en'},
+ 'family': 'T5',
+ 'id': 't5-small',
+ 'language_list': ['en']},
+ 'transfo-xl-wt103': { 'description': '18-layer, 1024-hidden, 16-heads, '
+ '257M parameters.\n'
+ 'English model trained on '
+ 'wikitext-103',
+ 'download_info': {'en': 'en'},
+ 'family': 'Transformer-XL',
+ 'id': 'transfo-xl-wt103',
+ 'language_list': ['en']},
+ 'wietsedv/bert-base-dutch-cased': { 'description': '12-layer, 768-hidden, '
+ '12-heads, 110M '
+ 'parameters.\n'
+ 'Trained on cased Dutch '
+ 'text.\n'
+ '\n'
+ '(see details on '
+ 'wietsedv repository).',
+ 'download_info': {'nl': 'nl'},
+ 'family': 'BERT',
+ 'id': 'wietsedv/bert-base-dutch-cased',
+ 'language_list': ['nl']},
+ 'xlm-clm-ende-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n'
+ 'XLM English-German model trained '
+ 'with CLM (Causal Language Modeling) '
+ 'on the concatenation of English and '
+ 'German wikipedia',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-clm-ende-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-clm-enfr-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n'
+ 'XLM English-French model trained '
+ 'with CLM (Causal Language Modeling) '
+ 'on the concatenation of English and '
+ 'French wikipedia',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-clm-enfr-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-100-1280': { 'description': '16-layer, 1280-hidden, 16-heads\n'
+ 'XLM model trained with MLM (Masked '
+ 'Language Modeling) on 100 languages.',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-100-1280',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-17-1280': { 'description': '16-layer, 1280-hidden, 16-heads\n'
+ 'XLM model trained with MLM (Masked '
+ 'Language Modeling) on 17 languages.',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-17-1280',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-en-2048': { 'description': '12-layer, 2048-hidden, 16-heads\n'
+ 'XLM English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-en-2048',
+ 'language_list': ['en']},
+ 'xlm-mlm-ende-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n'
+ 'XLM English-German model trained on '
+ 'the concatenation of English and '
+ 'German wikipedia',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-ende-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-enfr-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n'
+ 'XLM English-French model trained on '
+ 'the concatenation of English and '
+ 'French wikipedia',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-enfr-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-enro-1024': { 'description': '6-layer, 1024-hidden, 8-heads\n'
+ 'XLM English-Romanian Multi-language '
+ 'model',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-enro-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-tlm-xnli15-1024': { 'description': '12-layer, 1024-hidden, '
+ '8-heads\n'
+ 'XLM Model pre-trained with '
+ 'MLM + TLM on the 15 XNLI '
+ 'languages.',
+ 'download_info': { 'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-tlm-xnli15-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-mlm-xnli15-1024': { 'description': '12-layer, 1024-hidden, 8-heads\n'
+ 'XLM Model pre-trained with MLM on '
+ 'the 15 XNLI languages.',
+ 'download_info': {'multilingual': 'multilingual'},
+ 'family': 'XLM',
+ 'id': 'xlm-mlm-xnli15-1024',
+ 'language_list': ['multilingual']},
+ 'xlm-roberta-base': { 'description': '~125M parameters with 12-layers, '
+ '768-hidden-state, 3072 feed-forward '
+ 'hidden-state, 8-heads,\n'
+                                      'Trained on 2.5 TB of newly '
+ 'created clean CommonCrawl data in '
+ '100 languages',
+ 'download_info': { 'xlm-roberta-base': 'xlm-roberta-base'},
+ 'family': 'XLM-RoBERTa',
+ 'id': 'xlm-roberta-base',
+ 'language_list': ['xlm-roberta-base']},
+ 'xlm-roberta-large': { 'description': '~355M parameters with 24-layers, '
+                                       '1024-hidden-state, 4096 '
+ 'feed-forward hidden-state, '
+ '16-heads,\n'
+ 'Trained on 2.5 TB of newly created '
+ 'clean CommonCrawl data in 100 '
+ 'languages',
+ 'download_info': { 'xlm-roberta-large': 'xlm-roberta-large'},
+ 'family': 'XLM-RoBERTa',
+ 'id': 'xlm-roberta-large',
+ 'language_list': ['xlm-roberta-large']},
+ 'xlnet-base-cased': { 'description': '12-layer, 768-hidden, 12-heads, 110M '
+ 'parameters.\n'
+ 'XLNet English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'XLNet',
+ 'id': 'xlnet-base-cased',
+ 'language_list': ['en']},
+ 'xlnet-large-cased': { 'description': '24-layer, 1024-hidden, 16-heads, '
+ '340M parameters.\n'
+ 'XLNet Large English model',
+ 'download_info': {'en': 'en'},
+ 'family': 'XLNet',
+ 'id': 'xlnet-large-cased',
+ 'language_list': ['en']}
+
+ }
+
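+#Hugging Face shortcut names offered in the "Transformers model version" dropdown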
+TRANSFORMERS_LIST = ['bert-base-uncased', 'bert-large-uncased', 'bert-base-cased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-chinese', 'bert-base-german-cased', 'bert-large-uncased-whole-word-masking', 'bert-large-cased-whole-word-masking', 'bert-large-uncased-whole-word-masking-finetuned-squad', 'bert-large-cased-whole-word-masking-finetuned-squad', 'bert-base-cased-finetuned-mrpc', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'cl-tohoku/bert-base-japanese', 'cl-tohoku/bert-base-japanese-whole-word-masking', 'cl-tohoku/bert-base-japanese-char', 'cl-tohoku/bert-base-japanese-char-whole-word-masking', 'TurkuNLP/bert-base-finnish-cased-v1', 'TurkuNLP/bert-base-finnish-uncased-v1', 'wietsedv/bert-base-dutch-cased', 'facebook/bart-large', 'facebook/bart-large-mnli', 'facebook/bart-large-cnn', 'facebook/mbart-large-en-ro', 'openai-gpt', 'transfo-xl-wt103', 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'distilgpt2', 'ctrl', 'xlnet-base-cased', 'xlnet-large-cased', 'xlm-mlm-en-2048', 'xlm-mlm-ende-1024', 'xlm-mlm-enfr-1024', 'xlm-mlm-enro-1024', 'xlm-mlm-tlm-xnli15-1024', 'xlm-mlm-xnli15-1024', 'xlm-clm-enfr-1024', 'xlm-clm-ende-1024', 'xlm-mlm-17-1280', 'xlm-mlm-100-1280', 'roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector', 'distilbert-base-uncased', 'distilbert-base-uncased-distilled-squad', 'distilbert-base-cased', 'distilbert-base-cased-distilled-squad', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased', 'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', 'albert-xxlarge-v1', 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'camembert-base', 't5-small', 't5-base', 't5-large', 'xlm-roberta-base', 'xlm-roberta-large', 'flaubert/flaubert_small_cased', 'flaubert/flaubert_base_uncased', 'flaubert/flaubert_base_cased', 'flaubert/flaubert_large_cased', 'allenai/longformer-base-4096', 'allenai/longformer-large-4096']
+
diff --git a/python-lib/macro/model_downloaders.py b/python-lib/macro/model_downloaders.py
new file mode 100644
index 0000000..a8f3caa
--- /dev/null
+++ b/python-lib/macro/model_downloaders.py
@@ -0,0 +1,292 @@
+import requests
+import shutil
+import gzip
+import tarfile
+import io
+import zipfile
+import time
+from transformers.file_utils import hf_bucket_url
+from macro.model_configurations import MODEL_CONFIFURATIONS
+
+
+WORD2VEC_BASE_URL = "http://vectors.nlpl.eu/repository/20/{}.zip"
+FASTTEXT_BASE_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{}.300.vec.gz"
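+#Files that make up a Hugging Face PyTorch checkpoint (weights, config, vocabulary)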
+HG_FILENAMES = ["pytorch_model.bin","config.json","vocab.txt"]
+
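+#Shared streaming and extraction helpers; subclasses provide the download link and archive layout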
+class BaseDownloader(object):
+ def __init__(self,folder,macro_inputs,proxy,progress_callback):
+ self.folder = folder
+ self.proxy = proxy
+ self.progress_callback = progress_callback
+ self.language = macro_inputs["language"]
+ self.embedding_model = macro_inputs["embedding_model"]
+ self.embedding_family = macro_inputs["embedding_family"]
+ self.model_params = MODEL_CONFIFURATIONS[self.embedding_model]
+ self.model_id = self.embedding_model + '-' + self.language
+ self.archive_name = ''
+
+
+
+ def get_stream(self, download_link):
+ response = requests.get(download_link, stream=True, proxies=self.proxy)
+ return response
+
+ def download_plain(self, response, bytes_so_far=0):
+ #Download plain files
+ total_size = self.get_file_size(response)
+ update_time = time.time()
+ with self.folder.get_writer(self.archive_name) as w:
+ for chunk in response.iter_content(chunk_size=100000):
+ if chunk:
+ bytes_so_far += len(chunk)
+ percent = int(float(bytes_so_far) / total_size * 100)
+ update_time = self.update_percent(percent, update_time)
+ w.write(chunk)
+ return bytes_so_far
+
+ def download_gz(self, response, bytes_so_far=0):
+        #Download .gz files
+ total_size = self.get_file_size(response)
+ update_time = time.time()
+ destination_writer = self.folder.get_writer(self.archive_name)
+
+ #Write .gz file to folder
+ for chunk in response.iter_content(chunk_size=32768):
+ if chunk: # filter out keep-alive new chunks
+ bytes_so_far += len(chunk)
+ percent = int(float(bytes_so_far) / total_size * 95)
+ update_time = self.update_percent(percent, update_time)
+ destination_writer.write(chunk)
+ destination_writer.close()
+
+ #Unzip file
+ write_to_path = self.language + '/' + self.embedding_family + '/' + self.model_id
+ with self.folder.get_writer(write_to_path) as f_out, self.folder.get_download_stream(self.archive_name) as f_in:
+ shutil.copyfileobj(gzip.open(f_in), f_out)
+
+ #Remove the .gz file
+ self.folder.delete_path(self.archive_name)
+ return bytes_so_far
+
+ def download_tar_gz(self, response, bytes_so_far=0):
+        #Download .tar.gz files
+ total_size = self.get_file_size(response)
+ update_time = time.time()
+ with self.folder.get_writer(self.archive_name) as w:
+ for chunk in response.iter_content(chunk_size=100000):
+ if chunk:
+ bytes_so_far += len(chunk)
+ percent = int(float(bytes_so_far) / total_size * 95)
+ update_time = self.update_percent(percent, update_time)
+ w.write(chunk)
+ #Untar file
+ with self.folder.get_download_stream(self.archive_name) as f_in:
+ with tarfile.open(fileobj=io.BytesIO(f_in.read())) as tar:
+ members = tar.getmembers()
+ for member in members:
+ if member.isfile():
+ write_to_path = self.language + '/' + self.embedding_family + '/' + member.name
+ with self.folder.get_writer(write_to_path) as f_out:
+ shutil.copyfileobj(tar.extractfile(member),f_out)
+ self.folder.delete_path(self.archive_name)
+ return bytes_so_far
+
+ def download_zip(self, response, bytes_so_far = 0):
+ #Download .zip files
+ total_size = self.get_file_size(response)
+ update_time = time.time()
+ with self.folder.get_writer(self.archive_name) as w:
+ for chunk in response.iter_content(chunk_size=100000):
+ if chunk:
+ bytes_so_far += len(chunk)
+ percent = int(float(bytes_so_far) / total_size * 95)
+ update_time = self.update_percent(percent, update_time)
+ w.write(chunk)
+ #Unzip file
+ with self.folder.get_download_stream(self.archive_name) as f_in:
+ with zipfile.ZipFile(io.BytesIO(f_in.read())) as fzip:
+ if self.embedding_model == "word2vec":
+ archive_name = "model.bin"
+ elif self.embedding_model == "glove":
+ archive_name = fzip.namelist()[0]
+ else:
+ raise NotImplementedError()
+ write_to_path = self.language + '/' + self.embedding_family + '/' + self.model_id
+ with fzip.open(archive_name) as fzip_file, self.folder.get_writer(write_to_path) as f_out:
+ shutil.copyfileobj(fzip_file, f_out)
+ self.folder.delete_path(self.archive_name)
+ return bytes_so_far
+
+ def get_file_size(self,response):
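+        #word2vec-en is streamed from Google Drive, which may not report a usable content-length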
+ if self.model_id == "word2vec-en":
+ total_size = 3390000000 #3.39GB
+ else:
+            total_size = int(response.headers.get('content-length', 0))
+
+ return total_size if total_size>0 else 300000000 #300MB
+
+ def update_percent(self,percent, last_update_time):
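+        #Report progress at most once every 5 seconds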
+ new_time = time.time()
+ if (new_time - last_update_time) > 5:
+ self.progress_callback(percent)
+ return new_time
+ else:
+ return last_update_time
+
+ def get_download_link(self):
+ raise NotImplementedError()
+
+ def run(self):
+ raise NotImplementedError()
+
+
+
+
+class Word2vecDownloader(BaseDownloader):
+ def __init__(self,folder,macro_inputs,proxy,progress_callback):
+ BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback)
+ self.archive_name = self.language + '/' + self.embedding_family + '/'
+ if self.language == "en":
+ self.archive_name += self.model_id + ".bin.gz"
+ else:
+ self.archive_name += self.model_id + ".zip"
+
+ def get_gdrive_stream(self, download_link):
+ id_gdrive = self.model_params["download_info"][self.language]["id_gdrive"]
+ session = requests.Session()
+ response = session.get(download_link, params={'id': id_gdrive} , stream=True, proxies=self.proxy)
+ token = self.__get_confirm_token(response)
+
+ if token:
+ params = {'id': id_gdrive, 'confirm': token}
+ response = session.get(download_link, params=params, stream=True, proxies=self.proxy)
+ else:
+ raise RuntimeError("Google Drive Token could not be verified.")
+
+ return response
+
+ def __get_confirm_token(self,response):
+ for key, value in response.cookies.items():
+ if key.startswith('download_warning'):
+ return value
+ return None
+
+ def get_download_link(self):
+ if self.language == "en":
+ return self.model_params["download_info"][self.language]["model_link"]
+ else:
+ model_id = self.model_params["download_info"][self.language]["model_id"]
+ return WORD2VEC_BASE_URL.format(model_id)
+
+ def run(self):
+ if self.language == "en":
+ download_link = self.get_download_link()
+ response = self.get_gdrive_stream(download_link)
+ self.download_gz(response)
+ else:
+ download_link = self.get_download_link()
+ response = self.get_stream(download_link)
+ self.download_zip(response)
+
+
+
+
+class FasttextDownloader(BaseDownloader):
+ def __init__(self,folder,macro_inputs,proxy,progress_callback):
+ BaseDownloader.__init__(self,folder,macro_inputs,proxy,progress_callback)
+ self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".gz"
+
+ def get_download_link(self):
+ return FASTTEXT_BASE_URL.format(self.model_params["download_info"][self.language])
+
+ def run(self):
+ download_link = self.get_download_link()
+ response = self.get_stream(download_link)
+ self.download_gz(response)
+
+
+class GloveDownloader(BaseDownloader):
+    def __init__(self, folder, macro_inputs, proxy, progress_callback):
+        BaseDownloader.__init__(self, folder, macro_inputs, proxy, progress_callback)
+ self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".zip"
+
+ def get_download_link(self):
+ return self.model_params["download_info"][self.language]
+
+ def run(self):
+ download_link = self.get_download_link()
+ response = self.get_stream(download_link)
+ self.download_zip(response)
+
+
+class ElmoDownloader(BaseDownloader):
+    def __init__(self, folder, macro_inputs, proxy, progress_callback):
+        BaseDownloader.__init__(self, folder, macro_inputs, proxy, progress_callback)
+ self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".tar.gz"
+
+ def get_download_link(self):
+ return self.model_params["download_info"][self.language]
+
+ def run(self):
+ download_link = self.get_download_link()
+ response = self.get_stream(download_link)
+ self.download_tar_gz(response)
+
+
+class UseDownloader(BaseDownloader):
+    def __init__(self, folder, macro_inputs, proxy, progress_callback):
+        BaseDownloader.__init__(self, folder, macro_inputs, proxy, progress_callback)
+ self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_id + ".tar.gz"
+
+ def get_download_link(self):
+ return self.model_params["download_info"][self.language]
+
+ def run(self):
+ download_link = self.get_download_link()
+ response = self.get_stream(download_link)
+ self.download_tar_gz(response)
+
+
+class HuggingFaceDownloader(BaseDownloader):
+    def __init__(self, folder, macro_inputs, proxy, progress_callback):
+        BaseDownloader.__init__(self, folder, macro_inputs, proxy, progress_callback)
+ self.macro_inputs = macro_inputs
+ self.model_shortcut_name = self.macro_inputs["transformer_shortcut_name"]
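+        # Shortcut names may include an organisation prefix ("org/model"); the
+        # slash is replaced with "_" when building the folder path in run()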
+
+ def run(self):
+ bytes_so_far = 0
+ for filename in HG_FILENAMES:
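+            # HG_FILENAMES enumerates the files that make up a transformer
+            # model (presumably configuration, vocabulary and weights)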
+            self.archive_name = self.language + '/' + self.embedding_family + '/' + self.model_shortcut_name.replace("/", "_") + '/' + filename
+ download_link = self.get_download_link(filename)
+ response = self.get_stream(download_link)
+ if response.status_code == 200:
+ bytes_so_far = self.download_plain(response, bytes_so_far)
+            elif response.status_code == 404:
+                # Not every model ships every file; missing ones are skipped
+                pass
+
+    def get_file_size(self, response=None):
+        # Total size across all files that exist for this model; files that
+        # return 404 contribute nothing
+        total_size = 0
+        for filename in HG_FILENAMES:
+            download_link = self.get_download_link(filename)
+            response = self.get_stream(download_link)
+            if response.status_code == 200:
+                total_size += int(response.headers.get('content-length', 0))
+        return total_size
+
+    def get_download_link(self, filename):
+        return hf_bucket_url(self.model_shortcut_name, filename)
\ No newline at end of file
diff --git a/python-runnables/download-pretrained-embedding/runnable.json b/python-runnables/download-pretrained-embedding/runnable.json
index 3d95597..85eea75 100755
--- a/python-runnables/download-pretrained-embedding/runnable.json
+++ b/python-runnables/download-pretrained-embedding/runnable.json
@@ -1,82 +1,18 @@
{
- "meta" : {
- "label" : "Pre-trained Embeddings",
- "description" : "Downloads pre-trained word embeddings. Available models are: Word2vec, GloVe, fastText and ELMo.",
- "icon" : "icon-cloud-download"
+ "meta": {
+ "label": "Pre-trained Embeddings",
+ "description": "Downloads pre-trained word embeddings. Available models are: Word2vec, GloVe, fastText and ELMo.",
+ "icon": "icon-cloud-download"
},
- "impersonate" : false,
-
- "permissions" : ["WRITE_CONF"],
-
- "resultType" : "HTML",
-
- "resultLabel" : "model download output",
-
- "extension" : "txt",
-
- "mimeType" : "text/plain",
-
- "params": [
- {
- "name": "source",
- "label": "Source",
- "type": "SELECT",
- "selectChoices": [
- {
- "label": "Word2vec",
- "value": "word2vec"
- },
- {
- "label": "GloVe",
- "value": "glove"
- },
- {
- "label": "FastText",
- "value": "fasttext"
- },
- {
- "label": "ELMo",
- "value": "elmo"
- }
- ],
- "mandatory": true
- },
- {
- "visibilityCondition": "model.source == 'fasttext'",
- "name": "text_language_fasttext",
- "label": "Text language",
- "type": "SELECT",
- "selectChoices": [
- {
- "label": "English",
- "value": "english"
- },
- {
- "label": "French",
- "value": "french"
- }
- ],
- "mandatory": true
- },
- {
- "visibilityCondition": "model.source != 'fasttext'",
- "name": "text_language_other",
- "label": "Text language",
- "type": "SELECT",
- "selectChoices": [
- {
- "label": "English",
- "value": "english"
- }
- ],
- "mandatory": true
- },
- {
- "name": "outputName",
- "label" : "Output folder name",
- "type": "STRING",
- "description":"Use a different folder for each downloaded embeddings.",
- "mandatory" : true
- }
- ]
-}
+ "impersonate": true,
+ "permissions": [
+ "WRITE_CONF"
+ ],
+ "resultType": "HTML",
+ "resultLabel": "model download output",
+ "extension": "txt",
+ "mimeType": "text/plain",
+    "paramsTemplate": "index.html",
+    "paramsModule": "modelDownloader.build",
+    "paramsPythonSetup": "recipe-helper.py"
+}
\ No newline at end of file
diff --git a/python-runnables/download-pretrained-embedding/runnable.py b/python-runnables/download-pretrained-embedding/runnable.py
index 9c10ff7..4bb266c 100755
--- a/python-runnables/download-pretrained-embedding/runnable.py
+++ b/python-runnables/download-pretrained-embedding/runnable.py
@@ -2,41 +2,18 @@
import dataiku
from dataiku.runnables import Runnable
-
-import os
-import gzip
+from macro.model_downloaders import (Word2vecDownloader,
+ FasttextDownloader,
+ GloveDownloader,
+ ElmoDownloader,
+ UseDownloader,
+ HuggingFaceDownloader
+ )
+from macro.macro_utils import read_model_inputs
+from macro.model_configurations import TRANSFORMERS_LIST
import zipfile
-import requests
-import shutil
-
-
-def download_file_from_google_drive(id, destination):
- URL = "https://docs.google.com/uc?export=download"
-
- session = requests.Session()
- response = session.get(URL, params={'id': id}, stream=True)
- token = get_confirm_token(response)
-
- if token:
- params = {'id': id, 'confirm': token}
- response = session.get(URL, params=params, stream=True)
- save_response_content(response, destination)
-
-
-def get_confirm_token(response):
- for key, value in response.cookies.items():
- if key.startswith('download_warning'):
- return value
- return None
-
-
-def save_response_content(response, destination):
- CHUNK_SIZE = 32768
-
- with open(destination, "wb") as f:
- for chunk in response.iter_content(CHUNK_SIZE):
- if chunk: # filter out keep-alive new chunks
- f.write(chunk)
+import json
+import os
class MyRunnable(Runnable):
@@ -52,6 +29,7 @@ def __init__(self, project_key, config, plugin_config):
self.config = config
self.plugin_config = plugin_config
self.client = dataiku.api_client()
+
def get_progress_target(self):
"""
@@ -63,14 +41,10 @@ def get_progress_target(self):
def run(self, progress_callback):
# Retrieving parameters
- output_folder_name = self.config.get('outputName', '')
- source = self.config.get('source', '')
- if source == 'fasttext':
- text_language = self.config.get('text_language_fasttext', '')
- else:
- text_language = self.config.get('text_language_other', '')
+ macro_inputs = read_model_inputs(self.config)
# Creating new Managed Folder if needed
+ output_folder_name = macro_inputs["output_folder_name"]
project = self.client.get_project(self.project_key)
output_folder_found = False
@@ -86,104 +60,32 @@ def run(self, progress_callback):
output_folder = dataiku.Folder(output_folder.get_definition()["id"],
project_key=self.project_key)
- output_folder_path = output_folder.get_path()
-
#######################################
# Downloading and extracting the data
#######################################
- if source == 'word2vec':
- if text_language == 'english':
- file_id = '0B7XkCwpI5KDYNlNUTTlSS21pQmM'
- else:
- raise NotImplementedError("Word2vec vectors are only available for English. Use fastText for other languages.")
-
- # Download from Google Drive
- archive_fname = os.path.join(output_folder_path, "GoogleNews-vectors-negative300.bin.gz")
- download_file_from_google_drive(file_id, archive_fname)
-
- # Decompress in managed folder and rename
- """
- decompressed_file = gzip.GzipFile(archive_fname)
- with open(os.path.join(output_folder_path, "Word2vec_embeddings"), 'wb') as outfile:
- print('))))))))))) WRITING FILE')
- outfile.write(decompressed_file.read())
- """
- outfile_path = os.path.join(output_folder_path, "Word2vec_embeddings")
- with open(outfile_path, 'wb') as f_out, gzip.open(archive_fname, 'rb') as f_in:
- shutil.copyfileobj(f_in, f_out)
-
- os.remove(archive_fname)
-
-
- elif source == 'fasttext':
- if text_language == 'english':
- url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec'
- elif text_language == 'french':
- url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.vec'
- elif text_language == 'german':
- url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.vec'
- else:
- raise NotImplementedError(
- "Only English, French and German languages are supported.")
- r = requests.get(url, stream=True)
- with output_folder.get_writer("fastText_embeddings") as w:
- for chunk in r.iter_content(chunk_size=100000):
- if chunk:
- w.write(chunk)
-
-
- elif source == 'glove':
- if text_language == 'english':
- url = 'http://nlp.stanford.edu/data/glove.42B.300d.zip'
- else:
- raise NotImplementedError("GloVe vectors are only available for English. Use fastText for other languages.")
-
- archive_name = os.path.basename(url)
-
- # Download archive
- r = requests.get(url, stream=True)
- with output_folder.get_writer(archive_name) as w:
- for chunk in r.iter_content(chunk_size=100000):
- if chunk:
- w.write(chunk)
-
- file_basename = os.path.splitext(archive_name)[0]
- file_name = file_basename + '.txt'
- file_rename = "GloVe_embeddings"
-
- # Unzip archive into same directory
- zip_ref = zipfile.ZipFile(os.path.join(
- output_folder_path, archive_name), 'r')
- zip_ref.extractall(output_folder_path)
- zip_ref.close()
-
- # remove archive
- os.remove(os.path.join(output_folder_path, archive_name))
- # rename embedding file
- os.rename(os.path.join(output_folder_path, file_name), os.path.join(output_folder_path, file_rename))
-
-
- elif source == 'elmo':
- if text_language == 'english':
- import tensorflow as tf
- import tensorflow_hub as hub
-
- elmo_model_dir = os.path.join(output_folder_path, "ELMo")
-
- if not os.path.exists(elmo_model_dir):
- os.makedirs(elmo_model_dir)
-
- # Path for saving ELMo
- os.environ["TFHUB_CACHE_DIR"] = elmo_model_dir
-
- # Download ELMo
- elmo_model = hub.Module(
- "https://tfhub.dev/google/elmo/2", trainable=False)
- else:
- raise NotImplementedError(
- "ELMo is only available for English. Use fastText for other languages.")
+ embedding_model = macro_inputs["embedding_model"]
+ proxy = self.plugin_config["proxy"]
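+        # Dispatch to the downloader matching the selected embedding family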
+        if embedding_model == 'word2vec':
+            Word2vecDownloader(output_folder, macro_inputs, proxy, progress_callback).run()
+
+        elif embedding_model == 'fasttext':
+            FasttextDownloader(output_folder, macro_inputs, proxy, progress_callback).run()
+
+        elif embedding_model == 'glove':
+            GloveDownloader(output_folder, macro_inputs, proxy, progress_callback).run()
+
+        elif embedding_model == 'elmo':
+            ElmoDownloader(output_folder, macro_inputs, proxy, progress_callback).run()
+
+        elif embedding_model == 'use':
+            UseDownloader(output_folder, macro_inputs, proxy, progress_callback).run()
+
+        elif embedding_model in TRANSFORMERS_LIST:
+            HuggingFaceDownloader(output_folder, macro_inputs, proxy, progress_callback).run()
else:
- raise NotImplementedError(
- "Only Word2vec, GloVe and FastText embeddings are supported.")
+            raise ValueError("Unknown embedding model: {}".format(embedding_model))
+
return "
The model was downloaded successfuly !"
diff --git a/resource/index.css b/resource/index.css
new file mode 100644
index 0000000..614344e
--- /dev/null
+++ b/resource/index.css
@@ -0,0 +1,11 @@
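+/* Styles for the macro's parameter form (resource/index.html, added below) */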
+.error-message {
+    color: red;
+    font-size: 10px;
+    margin-left: 2px;
+}
+
+.desc-message {
+    color: gray;
+    font-size: 10px;
+    margin-left: 2px;
+}
diff --git a/resource/index.html b/resource/index.html
new file mode 100644
index 0000000..0b88c1a
--- /dev/null
+++ b/resource/index.html
@@ -0,0 +1,69 @@
+
+
+