def list_spaces(timeout=30):
    """List the Clowder spaces the configured API key may edit.

    Issues ``GET {clowder_base_uri}/spaces/canEdit`` using the notebook-level
    ``headers`` and loads the JSON payload into a DataFrame.

    Args:
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one a stalled connection would block the
            notebook cell forever.

    Returns:
        pandas.DataFrame built from the JSON response.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    response = requests.get(
        f"{clowder_base_uri}/spaces/canEdit", headers=headers, timeout=timeout
    )
    response.raise_for_status()
    return pd.DataFrame.from_dict(response.json())
"r8glZflpmIoX", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "outputId": "a0ec7c51-59a0-48ce-8d27-397adc58d7ec" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id name description created\n", + "0 63c8414ce4b083a8c2580991 dt1 trial Wed Jan 18 18:58:20 UTC 2023" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedescriptioncreated
063c8414ce4b083a8c2580991dt1trialWed Jan 18 18:58:20 UTC 2023
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
def list_datasets(timeout=30):
    """List the Clowder datasets returned for the configured API key.

    Issues ``GET {clowder_base_uri}/datasets`` using the notebook-level
    ``headers`` and loads the JSON payload into a DataFrame.

    Args:
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one a stalled connection would block the
            notebook cell forever.

    Returns:
        pandas.DataFrame built from the JSON response.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    response = requests.get(
        f"{clowder_base_uri}/datasets", headers=headers, timeout=timeout
    )
    response.raise_for_status()
    return pd.DataFrame.from_dict(response.json())
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedescriptioncreatedthumbnailauthorIdspacesresource_type
063c84b8ae4b083a8c2580a41dataset 2Wed Jan 18 19:42:02 UTC 2023None635bfd4db7c7f35aca68df66[63c8414ce4b083a8c2580991]dataset
163c84b5be4b083a8c2580a26dataset1Wed Jan 18 19:41:15 UTC 2023None635bfd4db7c7f35aca68df66[63c8414ce4b083a8c2580991]dataset
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
def list_files_in_dataset(datasetId, timeout=30):
    """List every file in one Clowder dataset.

    Issues ``GET {clowder_base_uri}/datasets/{datasetId}/listAllFiles`` using
    the notebook-level ``headers`` and loads the JSON payload into a DataFrame.

    Args:
        datasetId: Identifier of the dataset whose files should be listed.
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one a stalled connection would block the
            notebook cell forever.

    Returns:
        pandas.DataFrame built from the JSON response.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    # An f-string (rather than ``+`` concatenation) matches the other cells
    # and does not raise TypeError when the id is not already a ``str``.
    response = requests.get(
        f"{clowder_base_uri}/datasets/{datasetId}/listAllFiles",
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    return pd.DataFrame.from_dict(response.json())
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sizedate-createdidcontentTypefilename
057Wed Jan 18 19:42:12 UTC 202363c84b94e4b083a8c2580a45text/plaina2.txt
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
def get_dataset_download_link(datasetId, timeout=30):
    """Retrieve the URL for downloading a dataset's contents as a zip.

    Looks up the dataset to learn its name, then builds the download URL.

    Args:
        datasetId: Identifier of the dataset to download.
        timeout: Seconds to wait for the lookup request before giving up.

    Returns:
        dict with two keys:

        * ``"url"`` - URL to download from. It carries no credentials, so the
          caller has to send the API key (e.g. the ``X-API-key`` header) when
          fetching it.
        * ``"filename"`` - the dataset name with ``.zip`` appended.

    Raises:
        requests.HTTPError: The lookup answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    response = requests.get(
        f"{clowder_base_uri}/datasets/{datasetId}", headers=headers, timeout=timeout
    )
    response.raise_for_status()
    dataset = response.json()  # parse the body once and reuse it

    # Single slash before "download": the doubled "//" was an accident of
    # string building and is not guaranteed to be normalised by the server.
    download_url = (
        f"{clowder_base_uri}/datasets/{datasetId}"
        "/download?bagit=false&compression=-1&tracking=true"
    )
    return {"url": download_url, "filename": dataset["name"] + ".zip"}
def get_file_download_link(fileId, timeout=30):
    """Retrieve the URL for downloading a single file.

    Fetches the file's metadata to learn its original name and size, then
    builds the blob URL.

    Args:
        fileId: Identifier of the file to download.
        timeout: Seconds to wait for the metadata request before giving up.

    Returns:
        dict with three keys:

        * ``"url"`` - URL to download from.
        * ``"filename"`` - original filename reported by the metadata.
        * ``"bytes"`` - the ``size`` value from the metadata, passed through
          unchanged (the recorded notebook output shows it as a string).

    Raises:
        requests.HTTPError: The metadata request answered with 4xx/5xx.
        requests.Timeout: The server did not respond within ``timeout``.

    Warning:
        The returned URL embeds the API key as a query parameter so that it
        can be fetched without extra headers. Treat it as a secret: do not
        log it, share it, or commit notebook output that displays it.
    """
    # Authenticate the metadata call through the X-API-key header, like every
    # other request in this notebook, instead of putting the key in the URL.
    response = requests.get(
        f"{clowder_base_uri}/files/{fileId}/metadata", headers=headers, timeout=timeout
    )
    response.raise_for_status()
    metadata = response.json()  # parse the body once and reuse it

    bloburl = f"{clowder_base_uri}/files/{fileId}/blob?key={key}"
    return {"url": bloburl, "filename": metadata["filename"], "bytes": metadata["size"]}
def download_to_disk(url, inputfilename=None, chunk_size=10 * 1024,
                     request_headers=None, timeout=60):
    """Download a dataset or file to local disk.

    Parameters:
        url: URL to download from.
        inputfilename: Path of the file to write. A relative path is resolved
            against the current working directory. When omitted, a temporary
            file with a ``.txt`` suffix is created.
        chunk_size: Number of bytes read from the response per iteration.
            The body is streamed in chunks of this size instead of being
            loaded into memory at once.
        request_headers: Optional headers for the download request, e.g.
            ``{'X-API-key': ...}`` for URLs that do not embed credentials.
            Defaults to sending no extra headers.
        timeout: Seconds to wait for the connection and between received
            chunks; it is not a limit on the total download time.

    Returns:
        Path of the file that was written.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The transfer failed part-way.
        OSError: The destination file could not be created or written.
    """
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, headers=request_headers,
                      timeout=timeout) as result:
        # Fail before touching the disk so that an HTTP error page is never
        # saved as if it were the requested content.
        result.raise_for_status()

        if inputfilename is None:
            (inputfile, inputfilename) = tempfile.mkstemp(suffix='.txt')
        else:
            # O_TRUNC: when overwriting a longer existing file, leftover
            # trailing bytes would otherwise corrupt the new download.
            inputfile = os.open(
                inputfilename, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600
            )
            inputfilename = os.path.join(os.getcwd(), inputfilename)

        try:
            with os.fdopen(inputfile, "wb") as outputfile:
                for chunk in result.iter_content(chunk_size):
                    outputfile.write(chunk)
        except Exception:
            # Do not leave a partial download behind; a failure while cleaning
            # up must not hide the error that actually aborted the transfer.
            try:
                os.remove(inputfilename)
            except OSError:
                pass
            raise

    return inputfilename