Skip to content

Commit 39d6eb9

Browse files
committed
Merge branch 'master' into 72-add-same-v2-visualization-to-pyclowder-dataset-class
2 parents ed2f550 + 4d52fb2 commit 39d6eb9

File tree

16 files changed

+594
-8
lines changed

16 files changed

+594
-8
lines changed

.github/workflows/docker.yaml

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
name: Docker
2+
3+
# This will run when:
4+
# - a new release is created, to make sure the right tags of the
5+
# docker images are pushed (expects tags to be v1.8.4).
6+
# - when new code is pushed to main/develop to push the tags
7+
# latest and develop
8+
# - when a pull request is created and updated to make sure the
9+
# Dockerfile is still valid.
10+
# To be able to push to dockerhub, this expects the following
11+
# secrets to be set in the project:
12+
# - DOCKERHUB_USERNAME : username that can push to the org
13+
# - DOCKERHUB_PASSWORD : password associated with the username
14+
on:
15+
push:
16+
branches:
17+
- master
18+
19+
pull_request:
20+
21+
# Certain actions will only run when this is the main repo.
22+
env:
23+
MAIN_REPO: clowder-framework/pyclowder
24+
DOCKERHUB_ORG: clowder
25+
26+
jobs:
27+
docker:
28+
runs-on: ubuntu-latest
29+
permissions:
30+
packages: write
31+
32+
strategy:
33+
fail-fast: false
34+
matrix:
35+
name:
36+
- wordcount
37+
include:
38+
- name: wordcount
39+
FOLDER: sample-extractors/wordcount
40+
PLATFORM: "linux/amd64,linux/arm64"
41+
steps:
42+
- uses: actions/checkout@v2
43+
44+
# set environment variables
45+
- name: Extractor Version
46+
run: |
47+
# find out what the BRANCH is, in case of a PR we will use the PR-<number>
48+
if [ "${{ github.event.release.target_commitish }}" != "" ]; then
49+
BRANCH="${{ github.event.release.target_commitish }}"
50+
elif [[ $GITHUB_REF =~ pull ]]; then
51+
BRANCH="$(echo $GITHUB_REF | sed 's#refs/pull/\([0-9]*\)/merge#PR-\1#')"
52+
else
53+
BRANCH=${GITHUB_REF##*/}
54+
fi
55+
56+
# should we push to dockerhub, and is there a README
57+
DOCKERHUB_PUSH="false"
58+
DOCKERHUB_README="false"
59+
if [ "$BRANCH" == "main" -a "${{ github.repository }}" == "${{ env.MAIN_REPO }}" ]; then
60+
if [ "${{ secrets.DOCKERHUB_USERNAME }}" != "" -a "${{ secrets.DOCKERHUB_PASSWORD }}" != "" ]; then
61+
DOCKERHUB_PUSH="true"
62+
if [ -e "${{ matrix.FOLDER }}/README.md" ]; then
63+
DOCKERHUB_README="true"
64+
fi
65+
fi
66+
fi
67+
68+
# calculate the version and all tags
69+
if [ "$BRANCH" == "main" ]; then
70+
VERSION="$(awk '/"version":/ { print $2 }' ${{ matrix.FOLDER }}/extractor_info.json | sed 's/^.*"\([0-9\.]*\)".*$/\1/')"
71+
tags="latest"
72+
oldversion=""
73+
tmpversion="${VERSION}"
74+
while [ "${oldversion}" != "${tmpversion}" ]; do
75+
oldversion="${tmpversion}"
76+
tags="${tags} ${tmpversion}"
77+
tmpversion=${tmpversion%.*}
78+
done
79+
else
80+
VERSION="test"
81+
tags="$BRANCH"
82+
fi
83+
84+
# create a list of all images to be pushed
85+
IMAGE="extractors-${{ matrix.name }}"
86+
IMAGES=""
87+
for tag in ${tags}; do
88+
if [ "$DOCKERHUB_PUSH" == "true" ]; then
89+
IMAGES="${IMAGES}${{ env.DOCKERHUB_ORG }}/${IMAGE}:${tag},"
90+
fi
91+
IMAGES="${IMAGES}ghcr.io/${{ github.repository_owner }}/${IMAGE}:${tag},"
92+
done
93+
IMAGES="${IMAGES%,*}"
94+
95+
# save the results in env
96+
echo "BRANCH=${BRANCH}"
97+
echo "VERSION=${VERSION}"
98+
echo "DOCKERHUB_README=${DOCKERHUB_README}"
99+
echo "DOCKERHUB_PUSH=${DOCKERHUB_PUSH}"
100+
echo "IMAGES=${IMAGES}"
101+
102+
echo "BRANCH=${BRANCH}" >> $GITHUB_ENV
103+
echo "VERSION=${VERSION}" >> $GITHUB_ENV
104+
echo "DOCKERHUB_README=${DOCKERHUB_README}" >> $GITHUB_ENV
105+
echo "DOCKERHUB_PUSH=${DOCKERHUB_PUSH}" >> $GITHUB_ENV
106+
echo "IMAGES=${IMAGES}" >> $GITHUB_ENV
107+
108+
# setup docker build
109+
- name: Set up QEMU
110+
uses: docker/setup-qemu-action@v2
111+
112+
- name: Set up Docker Buildx
113+
id: buildx
114+
uses: docker/setup-buildx-action@v2
115+
116+
- name: Inspect Builder
117+
run: |
118+
echo "Name: ${{ steps.buildx.outputs.name }}"
119+
echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}"
120+
echo "Status: ${{ steps.buildx.outputs.status }}"
121+
echo "Flags: ${{ steps.buildx.outputs.flags }}"
122+
echo "Platforms: ${{ steps.buildx.outputs.platforms }}"
123+
124+
# login to registries
125+
- name: Login to DockerHub
126+
if: env.DOCKERHUB_PUSH == 'true'
127+
uses: docker/login-action@v2
128+
with:
129+
username: ${{ secrets.DOCKERHUB_USERNAME }}
130+
password: ${{ secrets.DOCKERHUB_PASSWORD }}
131+
132+
- name: Login to GitHub Container Registry
133+
uses: docker/login-action@v2
134+
with:
135+
registry: ghcr.io
136+
username: ${{ github.actor }}
137+
password: ${{ secrets.GITHUB_TOKEN }}
138+
139+
# build the docker images
140+
- name: Build and push ${{ matrix.name }}
141+
uses: docker/build-push-action@v2
142+
with:
143+
push: true
144+
context: ${{ matrix.FOLDER }}
145+
platforms: ${{ matrix.PLATFORM }}
146+
cache-from: type=gha,scope=${{ matrix.name }}
147+
cache-to: type=gha,mode=max,scope=${{ matrix.name }}
148+
tags: ${{ env.IMAGES }}
149+
build-args: |
150+
VERSION=${{ env.VERSION }}
151+
BUILDNUMBER=${{ github.run_number }}
152+
GITSHA1=${{ github.sha }}
153+
154+
# this will update the README of the dockerhub repo
155+
- name: Docker Hub Description
156+
if: env.DOCKERHUB_README == 'true'
157+
uses: peter-evans/dockerhub-description@v2
158+
env:
159+
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
160+
DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
161+
DOCKERHUB_REPOSITORY: ${{ env.DOCKERHUB_ORG }}/extractors-${{ matrix.NAME }}
162+
README_FILEPATH: ${{ matrix.FOLDER }}/README.md

pyclowder/api/v1/files.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import requests
1212
from requests_toolbelt.multipart.encoder import MultipartEncoder
1313

14+
from pyclowder.client import ClowderClient
1415
from pyclowder.collections import get_datasets, get_child_collections
1516
from pyclowder.datasets import get_file_list
1617

@@ -95,6 +96,19 @@ def download_info(connector, client, fileid):
9596

9697
return result
9798

99+
def download_summary(connector, host, key, fileid):
100+
"""Download file summary from Clowder. It's the same as download_info. We have different names for the
101+
same functionality for v2. To be consistent, we are keeping this method in v1,
102+
Keyword arguments:
103+
connector -- connector information, used to get missing parameters and send status updates
104+
host -- the clowder host, including http and port, should end with a /
105+
key -- the secret key to login to clowder
106+
fileid -- the file to fetch metadata of
107+
"""
108+
client = ClowderClient(host=host, key=key)
109+
result = download_info(connector, client, fileid)
110+
return result.json()
111+
98112

99113
def download_metadata(connector, client, fileid, extractor=None):
100114
"""Download file JSON-LD metadata from Clowder.

pyclowder/api/v2/files.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,22 @@ def download_info(connector, client, fileid):
9494

9595
return result
9696

97+
def download_summary(connector, client, fileid):
98+
"""Download file summary from Clowder.
99+
100+
Keyword arguments:
101+
connector -- connector information, used to get missing parameters and send status updates
102+
client -- ClowderClient containing authentication credentials
103+
fileid -- the file to fetch metadata of
104+
"""
105+
106+
url = '%s/api/v2/files/%s/summary' % (client.host, fileid)
107+
headers = {"X-API-KEY": client.key}
108+
# fetch data
109+
result = connector.get(url, stream=True, verify=connector.ssl_verify if connector else True, headers=headers)
110+
111+
return result
112+
97113

98114
def download_metadata(connector, client, fileid, extractor=None):
99115
"""Download file JSON-LD metadata from Clowder.
@@ -302,12 +318,12 @@ def upload_to_dataset(connector, client, datasetid, filepath, check_duplicate=Fa
302318

303319
if os.path.exists(filepath):
304320
filename = os.path.basename(filepath)
305-
m = MultipartEncoder(
306-
fields={'File': (filename, open(filepath, 'rb'))}
307-
)
308-
headers = {"X-API-KEY": client.key,
309-
'Content-Type': m.content_type}
310-
result = connector.post(url, data=m, headers=headers,
321+
# m = MultipartEncoder(
322+
# fields={'File': (filename, open(filepath, 'rb'))}
323+
# )
324+
file_data = {"file": open(filepath, 'rb')}
325+
headers = {"X-API-KEY": client.key}
326+
result = connector.post(url, files=file_data, headers=headers,
311327
verify=connector.ssl_verify if connector else True)
312328

313329
uploadedfileid = result.json()['id']

pyclowder/files.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,20 @@ def download_info(connector, host, key, fileid):
6969
return result.json()
7070

7171

72+
def download_summary(connector, host, key, fileid):
73+
"""Download file summary metadata from Clowder.
74+
75+
Keyword arguments:
76+
connector -- connector information, used to get missing parameters and send status updates
77+
host -- the clowder host, including http and port, should end with a /
78+
key -- the secret key to login to clowder
79+
fileid -- the file to fetch metadata of
80+
"""
81+
client = ClowderClient(host=host, key=key)
82+
result = files.download_summary(connector, client, fileid)
83+
return result.json()
84+
85+
7286
def download_metadata(connector, host, key, fileid, extractor=None):
7387
"""Download file JSON-LD metadata from Clowder.
7488
@@ -240,7 +254,7 @@ def upload_to_dataset(connector, host, key, datasetid, filepath, check_duplicate
240254
"""
241255
client = ClowderClient(host=host, key=key)
242256
if clowder_version == 2:
243-
files.upload_to_dataset(connector, client, datasetid, filepath, check_duplicate)
257+
return files.upload_to_dataset(connector, client, datasetid, filepath, check_duplicate)
244258
else:
245259
logger = logging.getLogger(__name__)
246260

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
FROM python:3.8
2+
3+
WORKDIR /extractor
4+
COPY requirements.txt ./
5+
RUN pip install -r requirements.txt
6+
7+
COPY test-dataset-extractor.py extractor_info.json ./
8+
CMD python test-dataset-extractor.py
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
A simple test extractor that verifies the functions of file in pyclowder.
2+
3+
# Docker
4+
5+
This extractor is ready to be run as a docker container, the only dependency is a running Clowder instance. Simply build and run.
6+
7+
1. Start Clowder V2. For help starting Clowder V2, see our [getting started guide](https://github.com/clowder-framework/clowder2/blob/main/README.md).
8+
9+
2. First build the extractor Docker container:
10+
11+
```
12+
# from this directory, run:
13+
14+
docker build -t test-dataset-extractor .
15+
```
16+
17+
3. Finally run the extractor:
18+
19+
```
20+
docker run -t -i --rm --net clowder_clowder -e "RABBITMQ_URI=amqp://guest:guest@rabbitmq:5672/%2f" --name "test-dataset-extractor" test-dataset-extractor
21+
```
22+
23+
Then open the Clowder web app and run the wordcount extractor on a .txt file (or similar)! Done.
24+
25+
### Python and Docker details
26+
27+
You may use any version of Python 3. Simply edit the first line of the `Dockerfile`, by default it uses `FROM python:3.8`.
28+
29+
Docker flags:
30+
31+
- `--net` links the extractor to the Clowder Docker network (run `docker network ls` to identify your own.)
32+
- `-e RABBITMQ_URI=` sets the environment variables that can be used to control what RabbitMQ server and exchange it will bind itself to. Setting the `RABBITMQ_EXCHANGE` may also help.
33+
- You can also use `--link` to link the extractor to a RabbitMQ container.
34+
- `--name` assigns the container a name visible in Docker Desktop.
35+
36+
## Troubleshooting
37+
38+
**If you run into _any_ trouble**, please reach out on our Clowder Slack in the [#pyclowder channel](https://clowder-software.slack.com/archives/CNC2UVBCP).
39+
40+
Alternate methods of running extractors are below.
41+
42+
# Commandline Execution
43+
44+
To execute the extractor from the command line you will need to have the required packages installed. It is highly recommended to use python virtual environment for this. You will need to create a virtual environment first, then activate it and finally install all required packages.
45+
46+
```
47+
Step 1 - Start clowder docker-compose
48+
Step 2 - Starting heartbeat listener
49+
virtualenv clowder2-python (try pipenv)
50+
source clowder2-python/bin/activate
51+
Step 3 - Run heartbeat_listener_sync.py to register new extractor (This step will likely not be needed in future)
52+
cd ~/Git/clowder2/backend
53+
pip install email_validator
54+
copy heartbeat_listener_sync.py to /backend from /backend/app/rabbitmq
55+
python heartbeat_listener_sync.py
56+
57+
Step 4 - Installing pyclowder branch & running extractor
58+
source ~/clowder2-python/bin/activate
59+
pip uninstall pyclowder
60+
61+
# the pyclowder Git repo should have Todd's branch activated (50-clowder20-submit-file-to-extractor)
62+
pip install -e ~/Git/pyclowder
63+
64+
cd ~/Git/pyclowder/sample-extractors/test-dataset-extractor
65+
export CLOWDER_VERSION=2
66+
export CLOWDER_URL=http://localhost:8000/
67+
68+
python test-dataset-extractor.py
69+
70+
71+
Step 5 - # post a particular File ID (text file) to the new extractor
72+
POST http://localhost:3002/api/v2/files/639b31754241665a4fc3e513/extract?extractorName=ncsa.test-dataset-extractor
73+
74+
Or,
75+
Go to Clowder UI and submit a file for extraction
76+
```
77+
78+
# Run the extractor from Pycharm
79+
You can run the heartbeat_listener_sync.py and test_file_extractor.py from pycharm.
80+
Create a pipenv (generally pycharm directs you to create one when you first open the file). To run test_file_extractor.py,
81+
add 'CLOWDER_VERSION=2' to environment variable in run configuration.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
3+
"name": "ncsa.test-dataset-extractor",
4+
"version": "2.0",
5+
"description": "Test Dataset extractor. Test to verify all functionalities of dataset in pyclowder.",
6+
"author": "Dipannita Dey <[email protected]>",
7+
"contributors": [],
8+
"contexts": [
9+
{
10+
"lines": "http://clowder.ncsa.illinois.edu/metadata/sample_metadata#lines",
11+
"words": "http://clowder.ncsa.illinois.edu/metadata/sample_metadata#words",
12+
"characters": "http://clowder.ncsa.illinois.edu/metadata/sample_metadata#characters"
13+
}
14+
],
15+
"repository": [
16+
{
17+
"repType": "git",
18+
"repUrl": "https://opensource.ncsa.illinois.edu/stash/scm/cats/pyclowder.git"
19+
}
20+
],
21+
"process": {
22+
"dataset": [
23+
"*"
24+
]
25+
},
26+
"external_services": [],
27+
"dependencies": [],
28+
"bibtex": []
29+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pyclowder==3.0.2

0 commit comments

Comments
 (0)