Skip to content

Commit 39d6eb9

Browse files
committed
Merge branch 'master' into 72-add-same-v2-visualization-to-pyclowder-dataset-class
2 parents ed2f550 + 4d52fb2 commit 39d6eb9

File tree

16 files changed

+594
-8
lines changed

16 files changed

+594
-8
lines changed

.github/workflows/docker.yaml

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
name: Docker
2+
3+
# This will run when:
4+
# - a new release is created, to make sure the right tags of the
5+
# docker images are pushed (expects tags to be v1.8.4).
6+
# - when new code is pushed to main/develop to push the tags
7+
# latest and develop
8+
# - when a pull request is created and updated to make sure the
9+
# Dockerfile is still valid.
10+
# To be able to push to dockerhub, this expects the following
11+
# secrets to be set in the project:
12+
# - DOCKERHUB_USERNAME : username that can push to the org
13+
# - DOCKERHUB_PASSWORD : password associated with the username
14+
on:
15+
push:
16+
branches:
17+
- master
18+
19+
pull_request:
20+
21+
# Certain actions will only run when this is the main repo.
22+
env:
23+
MAIN_REPO: clowder-framework/pyclowder
24+
DOCKERHUB_ORG: clowder
25+
26+
jobs:
27+
docker:
28+
runs-on: ubuntu-latest
29+
permissions:
30+
packages: write
31+
32+
strategy:
33+
fail-fast: false
34+
matrix:
35+
name:
36+
- wordcount
37+
include:
38+
- name: wordcount
39+
FOLDER: sample-extractors/wordcount
40+
PLATFORM: "linux/amd64,linux/arm64"
41+
steps:
42+
- uses: actions/checkout@v2
43+
44+
# set environment variables
45+
- name: Extractor Version
46+
run: |
47+
# find out what the BRANCH is, in case of a PR we will use the PR-<number>
48+
if [ "${{ github.event.release.target_commitish }}" != "" ]; then
49+
BRANCH="${{ github.event.release.target_commitish }}"
50+
elif [[ $GITHUB_REF =~ pull ]]; then
51+
BRANCH="$(echo $GITHUB_REF | sed 's#refs/pull/\([0-9]*\)/merge#PR-\1#')"
52+
else
53+
BRANCH=${GITHUB_REF##*/}
54+
fi
55+
56+
# should we push to dockerhub, and is there a README
57+
DOCKERHUB_PUSH="false"
58+
DOCKERHUB_README="false"
59+
if [ "$BRANCH" == "main" -a "${{ github.repository }}" == "${{ env.MAIN_REPO }}" ]; then
60+
if [ "${{ secrets.DOCKERHUB_USERNAME }}" != "" -a "${{ secrets.DOCKERHUB_PASSWORD }}" != "" ]; then
61+
DOCKERHUB_PUSH="true"
62+
if [ -e "${{ matrix.FOLDER }}/README.md" ]; then
63+
DOCKERHUB_README="true"
64+
fi
65+
fi
66+
fi
67+
68+
# calculate the version and all tags
69+
if [ "$BRANCH" == "main" ]; then
70+
VERSION="$(awk '/"version":/ { print $2 }' ${{ matrix.FOLDER }}/extractor_info.json | sed 's/^.*"\([0-9\.]*\)".*$/\1/')"
71+
tags="latest"
72+
oldversion=""
73+
tmpversion="${VERSION}"
74+
while [ "${oldversion}" != "${tmpversion}" ]; do
75+
oldversion="${tmpversion}"
76+
tags="${tags} ${tmpversion}"
77+
tmpversion=${tmpversion%.*}
78+
done
79+
else
80+
VERSION="test"
81+
tags="$BRANCH"
82+
fi
83+
84+
# create a list of all images to be pushed
85+
IMAGE="extractors-${{ matrix.name }}"
86+
IMAGES=""
87+
for tag in ${tags}; do
88+
if [ "$DOCKERHUB_PUSH" == "true" ]; then
89+
IMAGES="${IMAGES}${{ env.DOCKERHUB_ORG }}/${IMAGE}:${tag},"
90+
fi
91+
IMAGES="${IMAGES}ghcr.io/${{ github.repository_owner }}/${IMAGE}:${tag},"
92+
done
93+
IMAGES="${IMAGES%,*}"
94+
95+
# save the results in env
96+
echo "BRANCH=${BRANCH}"
97+
echo "VERSION=${VERSION}"
98+
echo "DOCKERHUB_README=${DOCKERHUB_README}"
99+
echo "DOCKERHUB_PUSH=${DOCKERHUB_PUSH}"
100+
echo "IMAGES=${IMAGES}"
101+
102+
echo "BRANCH=${BRANCH}" >> $GITHUB_ENV
103+
echo "VERSION=${VERSION}" >> $GITHUB_ENV
104+
echo "DOCKERHUB_README=${DOCKERHUB_README}" >> $GITHUB_ENV
105+
echo "DOCKERHUB_PUSH=${DOCKERHUB_PUSH}" >> $GITHUB_ENV
106+
echo "IMAGES=${IMAGES}" >> $GITHUB_ENV
107+
108+
# setup docker build
109+
- name: Set up QEMU
110+
uses: docker/setup-qemu-action@v2
111+
112+
- name: Set up Docker Buildx
113+
id: buildx
114+
uses: docker/setup-buildx-action@v2
115+
116+
- name: Inspect Builder
117+
run: |
118+
echo "Name: ${{ steps.buildx.outputs.name }}"
119+
echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
120+
echo "Status: ${{ steps.buildx.outputs.status }}"
121+
echo "Flags: ${{ steps.buildx.outputs.flags }}"
122+
echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
123+
124+
# login to registries
125+
- name: Login to DockerHub
126+
if: env.DOCKERHUB_PUSH == 'true'
127+
uses: docker/login-action@v2
128+
with:
129+
username: ${{ secrets.DOCKERHUB_USERNAME }}
130+
password: ${{ secrets.DOCKERHUB_PASSWORD }}
131+
132+
- name: Login to GitHub Container Registry
133+
uses: docker/login-action@v2
134+
with:
135+
registry: ghcr.io
136+
username: ${{ github.actor }}
137+
password: ${{ secrets.GITHUB_TOKEN }}
138+
139+
# build the docker images
140+
- name: Build and push ${{ matrix.name }}
141+
uses: docker/build-push-action@v2
142+
with:
143+
push: true
144+
context: ${{ matrix.FOLDER }}
145+
platforms: ${{ matrix.PLATFORM }}
146+
cache-from: type=gha,scope=${{ matrix.name }}
147+
cache-to: type=gha,mode=max,scope=${{ matrix.name }}
148+
tags: ${{ env.IMAGES }}
149+
build-args: |
150+
VERSION=${{ env.VERSION }}
151+
BUILDNUMBER=${{ github.run_number }}
152+
GITSHA1=${{ github.sha }}
153+
154+
# this will update the README of the dockerhub repo
155+
- name: Docker Hub Description
156+
if: env.DOCKERHUB_README == 'true'
157+
uses: peter-evans/dockerhub-description@v2
158+
env:
159+
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
160+
DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
161+
DOCKERHUB_REPOSITORY: ${{ env.DOCKERHUB_ORG }}/extractors-${{ matrix.NAME }}
162+
README_FILEPATH: ${{ matrix.FOLDER }}/README.md

pyclowder/api/v1/files.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import requests
1212
from requests_toolbelt.multipart.encoder import MultipartEncoder
1313

14+
from pyclowder.client import ClowderClient
1415
from pyclowder.collections import get_datasets, get_child_collections
1516
from pyclowder.datasets import get_file_list
1617

@@ -95,6 +96,19 @@ def download_info(connector, client, fileid):
9596

9697
return result
9798

99+
def download_summary(connector, host, key, fileid):
100+
"""Download file summary from Clowder. It's the same as download_info. We have different names for the
101+
same functionality for v2. To be consistent, we are keeping this method in v1,
102+
Keyword arguments:
103+
connector -- connector information, used to get missing parameters and send status updates
104+
host -- the clowder host, including http and port, should end with a /
105+
key -- the secret key to login to clowder
106+
fileid -- the file to fetch metadata of
107+
"""
108+
client = ClowderClient(host=host, key=key)
109+
result = download_info(connector, client, fileid)
110+
return result.json()
111+
98112

99113
def download_metadata(connector, client, fileid, extractor=None):
100114
"""Download file JSON-LD metadata from Clowder.

pyclowder/api/v2/files.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,22 @@ def download_info(connector, client, fileid):
9494

9595
return result
9696

97+
def download_summary(connector, client, fileid):
98+
"""Download file summary from Clowder.
99+
100+
Keyword arguments:
101+
connector -- connector information, used to get missing parameters and send status updates
102+
client -- ClowderClient containing authentication credentials
103+
fileid -- the file to fetch metadata of
104+
"""
105+
106+
url = '%s/api/v2/files/%s/summary' % (client.host, fileid)
107+
headers = {"X-API-KEY": client.key}
108+
# fetch data
109+
result = connector.get(url, stream=True, verify=connector.ssl_verify if connector else True, headers=headers)
110+
111+
return result
112+
97113

98114
def download_metadata(connector, client, fileid, extractor=None):
99115
"""Download file JSON-LD metadata from Clowder.
@@ -302,12 +318,12 @@ def upload_to_dataset(connector, client, datasetid, filepath, check_duplicate=Fa
302318

303319
if os.path.exists(filepath):
304320
filename = os.path.basename(filepath)
305-
m = MultipartEncoder(
306-
fields={'File': (filename, open(filepath, 'rb'))}
307-
)
308-
headers = {"X-API-KEY": client.key,
309-
'Content-Type': m.content_type}
310-
result = connector.post(url, data=m, headers=headers,
321+
# m = MultipartEncoder(
322+
# fields={'File': (filename, open(filepath, 'rb'))}
323+
# )
324+
file_data = {"file": open(filepath, 'rb')}
325+
headers = {"X-API-KEY": client.key}
326+
result = connector.post(url, files=file_data, headers=headers,
311327
verify=connector.ssl_verify if connector else True)
312328

313329
uploadedfileid = result.json()['id']

pyclowder/files.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,20 @@ def download_info(connector, host, key, fileid):
6969
return result.json()
7070

7171

72+
def download_summary(connector, host, key, fileid):
73+
"""Download file summary metadata from Clowder.
74+
75+
Keyword arguments:
76+
connector -- connector information, used to get missing parameters and send status updates
77+
host -- the clowder host, including http and port, should end with a /
78+
key -- the secret key to login to clowder
79+
fileid -- the file to fetch metadata of
80+
"""
81+
client = ClowderClient(host=host, key=key)
82+
result = files.download_summary(connector, client, fileid)
83+
return result.json()
84+
85+
7286
def download_metadata(connector, host, key, fileid, extractor=None):
7387
"""Download file JSON-LD metadata from Clowder.
7488
@@ -240,7 +254,7 @@ def upload_to_dataset(connector, host, key, datasetid, filepath, check_duplicate
240254
"""
241255
client = ClowderClient(host=host, key=key)
242256
if clowder_version == 2:
243-
files.upload_to_dataset(connector, client, datasetid, filepath, check_duplicate)
257+
return files.upload_to_dataset(connector, client, datasetid, filepath, check_duplicate)
244258
else:
245259
logger = logging.getLogger(__name__)
246260

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
FROM python:3.8
2+
3+
WORKDIR /extractor
4+
COPY requirements.txt ./
5+
RUN pip install -r requirements.txt
6+
7+
COPY test-dataset-extractor.py extractor_info.json ./
8+
CMD python test-dataset-extractor.py
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
A simple test extractor that verifies the functions of file in pyclowder.
2+
3+
# Docker
4+
5+
This extractor is ready to be run as a docker container, the only dependency is a running Clowder instance. Simply build and run.
6+
7+
1. Start Clowder V2. For help starting Clowder V2, see our [getting started guide](https://github.com/clowder-framework/clowder2/blob/main/README.md).
8+
9+
2. First build the extractor Docker container:
10+
11+
```
12+
# from this directory, run:
13+
14+
docker build -t test-dataset-extractor .
15+
```
16+
17+
3. Finally run the extractor:
18+
19+
```
20+
docker run -t -i --rm --net clowder_clowder -e "RABBITMQ_URI=amqp://guest:guest@rabbitmq:5672/%2f" --name "test-dataset-extractor" test-dataset-extractor
21+
```
22+
23+
Then open the Clowder web app and run the wordcount extractor on a .txt file (or similar)! Done.
24+
25+
### Python and Docker details
26+
27+
You may use any version of Python 3. Simply edit the first line of the `Dockerfile`, by default it uses `FROM python:3.8`.
28+
29+
Docker flags:
30+
31+
- `--net` links the extractor to the Clowder Docker network (run `docker network ls` to identify your own.)
32+
- `-e RABBITMQ_URI=` sets the environment variables that can be used to control what RabbitMQ server and exchange it will bind itself to. Setting the `RABBITMQ_EXCHANGE` may also help.
33+
- You can also use `--link` to link the extractor to a RabbitMQ container.
34+
- `--name` assigns the container a name visible in Docker Desktop.
35+
36+
## Troubleshooting
37+
38+
**If you run into _any_ trouble**, please reach out on our Clowder Slack in the [#pyclowder channel](https://clowder-software.slack.com/archives/CNC2UVBCP).
39+
40+
Alternate methods of running extractors are below.
41+
42+
# Commandline Execution
43+
44+
To execute the extractor from the command line you will need to have the required packages installed. It is highly recommended to use python virtual environment for this. You will need to create a virtual environment first, then activate it and finally install all required packages.
45+
46+
```
47+
Step 1 - Start clowder docker-compose
48+
Step 2 - Starting heartbeat listener
49+
virtualenv clowder2-python (try pipenv)
50+
source clowder2-python/bin/activate
51+
Step 3 - Run heartbeat_listener_sync.py to register new extractor (This step will likely not be needed in future)
52+
cd ~/Git/clowder2/backend
53+
pip install email_validator
54+
copy heartbeat_listener_sync.py to /backend from /backend/app/rabbitmq
55+
python heartbeat_listener_sync.py
56+
57+
Step 4 - Installing pyclowder branch & running extractor
58+
source ~/clowder2-python/bin/activate
59+
pip uninstall pyclowder
60+
61+
# the pyclowder Git repo should have Todd's branch activated (50-clowder20-submit-file-to-extractor)
62+
pip install -e ~/Git/pyclowder
63+
64+
cd ~/Git/pyclowder/sample-extractors/test-dataset-extractor
65+
export CLOWDER_VERSION=2
66+
export CLOWDER_URL=http://localhost:8000/
67+
68+
python test-dataset-extractor.py
69+
70+
71+
Step 5 - # post a particular File ID (text file) to the new extractor
72+
POST http://localhost:3002/api/v2/files/639b31754241665a4fc3e513/extract?extractorName=ncsa.test-dataset-extractor
73+
74+
Or,
75+
Go to Clowder UI and submit a file for extraction
76+
```
77+
78+
# Run the extractor from Pycharm
79+
You can run the heartbeat_listener_sync.py and test_file_extractor.py from pycharm.
80+
Create a pipenv (generally pycharm directs you to create one when you first open the file). To run test_file_extractor.py,
81+
add 'CLOWDER_VERSION=2' to environment variable in run configuration.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
3+
"name": "ncsa.test-dataset-extractor",
4+
"version": "2.0",
5+
"description": "Test Dataset extractor. Test to verify all functionalities of dataset in pyclowder.",
6+
"author": "Dipannita Dey <[email protected]>",
7+
"contributors": [],
8+
"contexts": [
9+
{
10+
"lines": "http://clowder.ncsa.illinois.edu/metadata/sample_metadata#lines",
11+
"words": "http://clowder.ncsa.illinois.edu/metadata/sample_metadata#words",
12+
"characters": "http://clowder.ncsa.illinois.edu/metadata/sample_metadata#characters"
13+
}
14+
],
15+
"repository": [
16+
{
17+
"repType": "git",
18+
"repUrl": "https://opensource.ncsa.illinois.edu/stash/scm/cats/pyclowder.git"
19+
}
20+
],
21+
"process": {
22+
"dataset": [
23+
"*"
24+
]
25+
},
26+
"external_services": [],
27+
"dependencies": [],
28+
"bibtex": []
29+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pyclowder==3.0.2

0 commit comments

Comments
 (0)