
Commit da83a34

Merge pull request #8 from moleculemaker/feature/lambert8/chemscraper-k8s-job
feat: proof-of-concept for ChemScraper job in Kubernetes
2 parents 9efcdba + 6e3043c commit da83a34

40 files changed, with 2787 additions and 0 deletions.

.github/workflows/docker.yml

Lines changed: 71 additions & 0 deletions
```yaml
name: Docker

# This will run when:
# - new code is pushed to main/develop, to push the tags
#   latest and develop
# - a pull request is created or updated, to make sure the
#   Dockerfile is still valid.
# To be able to push to dockerhub, this expects the following
# secrets to be set in the project:
# - DOCKERHUB_USERNAME : username that can push to the org
# - DOCKERHUB_PASSWORD : password associated with the username
on:
  push:
    branches:
      - main
      - develop

  pull_request:

  # Trigger the workflow on release activity
  release:
    # Only use the types keyword to narrow down the activity types that will trigger your workflow.
    types:
      - published
      - edited
      - created

# Certain actions will only run when this is the main repo.
env:
  MAIN_REPO: moleculemaker/chemscraper-helm-chart
  DOCKERHUB_ORG: moleculemaker

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - uses: actions/checkout@v3

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: |
            moleculemaker/chemscraper-job
          tags: |
            # set latest tag for default branch
            type=raw,value=latest,enable={{is_default_branch}}
            type=ref,event=branch
            type=ref,event=pr
            type=ref,event=tag
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}

      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and push
        uses: docker/build-push-action@v3
        with:
          context: job
          platforms: linux/amd64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
```
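Going by docker/metadata-action's documented tag rules (illustrative, not taken from an actual workflow run): a push to the default branch should produce `moleculemaker/chemscraper-job:latest` plus the branch tag `:main`, a push to `develop` should produce `:develop`, a pull request `:pr-<number>`, and a published release tagged like `v1.2.3` should produce `:v1.2.3`, `:1.2.3`, and `:1.2`.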

job/.dockerignore

Lines changed: 16 additions & 0 deletions
```
# Git metadata
.git/
.gitignore
.github/

# Docker metadata
.env
docker-compose.yml
Dockerfile

# This stuff doesn't end up in the container
#chemscraper/
README.md
inputs/
outputs/
```

job/.env

Lines changed: 30 additions & 0 deletions
```
# mmli-backend passes our current JOB_ID as an environment variable
JOB_ID='examplejobid'

# Default log level - adjust for more/less verbose logs
# e.g. DEBUG, INFO, WARNING, ERROR
LOG_LEVEL='DEBUG'

# Path to input PDFs for RM+CS
# Files in this path will be automatically downloaded from MinIO before running the AlphaSynthesis job
CHEMSCRAPER_INPUT_FILE='/usr/app/inputs/or100.09.tables.small.pdf'

# Path to output from full RM+CS workflow
# We store to the same path as ReactionMiner outputs, so that this is also uploaded to MinIO
CHEMSCRAPER_OUTPUT_DIR='/usr/app/outputs'

# Base URL to ChemScraper API
# We will override this default in production

# External
CHEMSCRAPER_BASE_URL='https://chemscraper.backend.staging.mmli1.ncsa.illinois.edu' # Staging
#CHEMSCRAPER_BASE_URL='https://chemscraper.backend.mmli1.ncsa.illinois.edu' # Prod


# Internal
#CHEMSCRAPER_BASE_URL='http://chemscraper-services-staging.staging.svc.cluster.local:8000' # Staging
#CHEMSCRAPER_BASE_URL='http://chemscraper-services.alphasynthesis.svc.cluster.local:8000' # Prod


# Local
#CHEMSCRAPER_BASE_URL='http://host.docker.internal:8000'
```
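Since these settings are plain environment variables, the job script only needs `os.environ` to pick them up. The snippet below is a hedged sketch (not part of this diff); its fallback values simply repeat the defaults from `job/.env`:

```python
import os

# Minimal config-loading sketch; defaults mirror job/.env. In Kubernetes these
# variables are expected to be injected by mmli-backend / the Job spec instead.
JOB_ID = os.environ.get("JOB_ID", "examplejobid")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "DEBUG")
CHEMSCRAPER_INPUT_FILE = os.environ.get(
    "CHEMSCRAPER_INPUT_FILE", "/usr/app/inputs/or100.09.tables.small.pdf")
CHEMSCRAPER_OUTPUT_DIR = os.environ.get("CHEMSCRAPER_OUTPUT_DIR", "/usr/app/outputs")
CHEMSCRAPER_BASE_URL = os.environ.get(
    "CHEMSCRAPER_BASE_URL",
    "https://chemscraper.backend.staging.mmli1.ncsa.illinois.edu")
```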

job/.gitignore

Lines changed: 3 additions & 0 deletions
```
.idea/
**/__pycache__/**/*.pyc
outputs/*
```

job/Dockerfile

Lines changed: 19 additions & 0 deletions
```dockerfile
FROM mambaorg/micromamba:alpine
WORKDIR /usr/app/

# Use a drop-in for conda executable
ENV CONDA='micromamba'

# Install dependencies using conda
COPY environment.yml .
RUN ${CONDA} env update -n base -f environment.yml && ${CONDA} clean --all
SHELL [ "${CONDA}", "run", "-n", "base" ]

# Generate ChemScraper API Client
# FIXME: We needed to manually change one line of the generated code
#COPY generate-client.sh .
#RUN ./generate-client.sh

# Entrypoint script
COPY run_chemscraper.py .
CMD [ "python", "./run_chemscraper.py" ]
```

job/README.md

Lines changed: 8 additions & 0 deletions
## ChemScraper Docker Job
A slim container designed to submit a single PDF to ChemScraper

### Usage


### Building

job/chemscraper/.gitignore

Lines changed: 23 additions & 0 deletions
```
__pycache__/
build/
dist/
*.egg-info/
.pytest_cache/

# pyenv
.python-version

# Environments
.env
.venv

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# JetBrains
.idea/

/coverage.xml
/.coverage
```

job/chemscraper/README.md

Lines changed: 124 additions & 0 deletions
# fast-api-client
A client library for accessing FastAPI

## Usage
First, create a client:

```python
from fast_api_client import Client

client = Client(base_url="https://api.example.com")
```

If the endpoints you're going to hit require authentication, use `AuthenticatedClient` instead:

```python
from fast_api_client import AuthenticatedClient

client = AuthenticatedClient(base_url="https://api.example.com", token="SuperSecretToken")
```

Now call your endpoint and use your models:

```python
from fast_api_client.models import MyDataModel
from fast_api_client.api.my_tag import get_my_data_model
from fast_api_client.types import Response

with client as client:
    my_data: MyDataModel = get_my_data_model.sync(client=client)
    # or if you need more info (e.g. status_code)
    response: Response[MyDataModel] = get_my_data_model.sync_detailed(client=client)
```

Or do the same thing with an async version:

```python
from fast_api_client.models import MyDataModel
from fast_api_client.api.my_tag import get_my_data_model
from fast_api_client.types import Response

async with client as client:
    my_data: MyDataModel = await get_my_data_model.asyncio(client=client)
    response: Response[MyDataModel] = await get_my_data_model.asyncio_detailed(client=client)
```

By default, when you're calling an HTTPS API it will attempt to verify that SSL is working correctly. Using certificate verification is highly recommended most of the time, but sometimes you may need to authenticate to a server (especially an internal server) using a custom certificate bundle.

```python
client = AuthenticatedClient(
    base_url="https://internal_api.example.com",
    token="SuperSecretToken",
    verify_ssl="/path/to/certificate_bundle.pem",
)
```

You can also disable certificate validation altogether, but beware that **this is a security risk**.

```python
client = AuthenticatedClient(
    base_url="https://internal_api.example.com",
    token="SuperSecretToken",
    verify_ssl=False
)
```

Things to know:
1. Every path/method combo becomes a Python module with four functions:
    1. `sync`: Blocking request that returns parsed data (if successful) or `None`
    1. `sync_detailed`: Blocking request that always returns a `Response`, optionally with `parsed` set if the request was successful.
    1. `asyncio`: Like `sync` but async instead of blocking
    1. `asyncio_detailed`: Like `sync_detailed` but async instead of blocking

1. All path/query params, and bodies become method arguments.
1. If your endpoint had any tags on it, the first tag will be used as a module name for the function (my_tag above)
1. Any endpoint which did not have a tag will be in `fast_api_client.api.default`

## Advanced customizations

There are more settings on the generated `Client` class which let you control more runtime behavior; check out the docstring on that class for more info. You can also customize the underlying `httpx.Client` or `httpx.AsyncClient` (depending on your use-case):

```python
from fast_api_client import Client

def log_request(request):
    print(f"Request event hook: {request.method} {request.url} - Waiting for response")

def log_response(response):
    request = response.request
    print(f"Response event hook: {request.method} {request.url} - Status {response.status_code}")

client = Client(
    base_url="https://api.example.com",
    httpx_args={"event_hooks": {"request": [log_request], "response": [log_response]}},
)

# Or get the underlying httpx client to modify directly with client.get_httpx_client() or client.get_async_httpx_client()
```

You can even set the httpx client directly, but beware that this will override any existing settings (e.g., base_url):

```python
import httpx
from fast_api_client import Client

client = Client(
    base_url="https://api.example.com",
)
# Note that base_url needs to be re-set, as would any shared cookies, headers, etc.
client.set_httpx_client(httpx.Client(base_url="https://api.example.com", proxies="http://localhost:8030"))
```

## Building / publishing this package
This project uses [Poetry](https://python-poetry.org/) to manage dependencies and packaging. Here are the basics:
1. Update the metadata in pyproject.toml (e.g. authors, version)
1. If you're using a private repository, configure it with Poetry
    1. `poetry config repositories.<your-repository-name> <url-to-your-repository>`
    1. `poetry config http-basic.<your-repository-name> <username> <password>`
1. Publish the client with `poetry publish --build -r <your-repository-name>` or, if for public PyPI, just `poetry publish --build`

If you want to install this client into another project without publishing it (e.g. for development) then:
1. If that project **is using Poetry**, you can simply do `poetry add <path-to-this-client>` from that project
1. If that project is not using Poetry:
    1. Build a wheel with `poetry build -f wheel`
    1. Install that wheel from the other project `pip install <path-to-wheel>`

Lines changed: 8 additions & 0 deletions
```python
"""A client library for accessing FastAPI"""

from .client import AuthenticatedClient, Client

__all__ = (
    "AuthenticatedClient",
    "Client",
)
```
Lines changed: 1 addition & 0 deletions
```python
"""Contains methods for accessing the API"""
```
