Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,5 @@ chart/charts/

*/sql_app.db
.idea/

drafts/
34 changes: 34 additions & 0 deletions app/cfg/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,40 @@ kubernetes_jobs:
subPath: 'uws/jobs/oed-cheminfo/JOB_ID/out'
claimName: 'mmli-shared-job-data'

# Config for running SimpleFold protein structure prediction job
ml-simplefold:
image: "davidbianchi/ml-simplefold:version1"
imagePullPolicy: "IfNotPresent"
# Only schedule onto nodes carrying the nvidia.com/gpu.present=true label
nodeSelector:
nvidia.com/gpu.present: "true"
# Tolerate the nvidia.com/gpu=1 taint with effect NoSchedule
tolerations:
- key: nvidia.com/gpu
operator: Equal
value: "1"
effect: NoSchedule
# All three mounts use the same claim ('mmli-shared-job-data'); only the subPath differs.
# NOTE(review): JOB_ID in a subPath is presumably replaced with the real job id at submission time - confirm
volumes:
# Job input directory
- name: 'shared-storage'
mountPath: '/workspace/input'
subPath: 'uws/jobs/ml-simplefold/JOB_ID/in'
claimName: 'mmli-shared-job-data'
# Job output directory
- name: 'shared-storage'
mountPath: '/workspace/output'
subPath: 'uws/jobs/ml-simplefold/JOB_ID/out'
claimName: 'mmli-shared-job-data'
# Torch hub cache; this subPath has no JOB_ID, so the same directory is mounted for every ml-simplefold job
- name: 'shared-storage'
mountPath: '/root/.cache/torch/hub'
subPath: 'uws/jobs/ml-simplefold/.cache'
claimName: 'mmli-shared-job-data'
# One GPU is both requested and set as the limit; CPU and memory requests are lower than their limits
resources:
limits:
cpu: "16"
memory: "32Gi"
nvidia.com/gpu: "1"
requests:
cpu: "4"
memory: "24Gi"
nvidia.com/gpu: "1"

defaults:
# Kubeconfig (credentials + cluster) used to run this job in Kubernetes
kubeconfig: "/opt/kubeconfig"
Expand Down
1 change: 1 addition & 0 deletions app/models/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class JobType(str, Enum):
OED_CATPRED = 'oed-catpred'
DEFAULT = 'defaults'
EZ_SPECIFICITY = 'ez-specificity'
ML_SIMPLEFOLD = 'ml-simplefold'

def __str__(self) -> str:
return self.value
Expand Down
2 changes: 1 addition & 1 deletion app/postjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

try:
log.info(f'Uploading to MinIO: {job_output_dir}')
upload_local_directory_to_minio(local_path=job_output_dir, bucket_name=bucket_name)
upload_local_directory_to_minio(local_path=job_output_dir, bucket_name=bucket_name, minio_prefix=f"{job_id}/out")
log.info(f'Uploaded successfully to MinIO: {job_output_dir}')
except Exception as ex:
log.error(f'Failed to upload output files from job[{bucket_name}]: {job_id} - {str(ex)}')
5 changes: 5 additions & 0 deletions app/routers/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from services.aceretro_service import ACERetroService
from services.reactionminer_service import ReactionMinerService
from services.oed_service import OEDService
from services.simplefold_service import SimpleFoldService


from typing import Optional, List
Expand Down Expand Up @@ -106,6 +107,10 @@ async def get_results(bucket_name: str, job_id: str, service: MinIOService = Dep
elif bucket_name == JobType.OED_DLKCAT or bucket_name == JobType.OED_UNIKP or bucket_name == JobType.OED_CATPRED:
return await OEDService.propertyPredictionResultPostProcess(bucket_name, job_id, service, db)

elif bucket_name == JobType.ML_SIMPLEFOLD:
print("Getting ML-SIMPLEFOLD job result")
return await SimpleFoldService.resultPostProcess(bucket_name, job_id, service, db)

else:
raise HTTPException(status_code=400, detail="Invalid job type: " + bucket_name)

Expand Down
26 changes: 26 additions & 0 deletions app/routers/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,32 @@ async def create_job(
raise HTTPException(status_code=400, detail="Failed to upload file to MinIO")
command = app_config['kubernetes_jobs'][job_type]['command']

elif job_type == JobType.ML_SIMPLEFOLD:
log.info(f"------------------ STARTING ML-SIMPLEFOLD JOB ------------------ job[{job_type}]: " + job_id)
job_config = json.loads(job_info.replace('\"', '"'))

if 'fasta' not in job_config:
raise HTTPException(status_code=400, detail='"job_info" requires "fasta" for SimpleFold jobs')

# Upload FASTA content to MinIO
if service.ensure_bucket_exists(job_type):
upload_result = service.upload_file(job_type, f"/{job_id}/in/input.fasta", job_config['fasta'].encode('utf-8'))
if not upload_result:
raise HTTPException(status_code=400, detail="Failed to upload FASTA to MinIO")

command = (
"simplefold"
" --simplefold_model simplefold_100M"
" --num_steps 500"
" --tau 0.01"
" --nsample_per_protein 1"
" --plddt"
" --fasta_path ${JOB_INPUT_DIR}/input.fasta"
" --output_dir ${JOB_OUTPUT_DIR}"
" --backend torch"
" && rm -rf ${JOB_OUTPUT_DIR}/cache"
)

elif job_type == JobType.EZ_SPECIFICITY:
#TODO: update command to handle ez-specificity jobs
command = app_config['kubernetes_jobs'][job_type]['command']
Expand Down
19 changes: 11 additions & 8 deletions app/services/kubejob_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def download_remote_directory_from_minio(remote_path: str, bucket_name: str, tar


# Upload a local directory recursively to MinIO
def upload_local_directory_to_minio(local_path: str, bucket_name: str):
def upload_local_directory_to_minio(local_path: str, bucket_name: str, minio_prefix: str = ""):
if not os.path.isdir(local_path):
log.warning('Not a directory: ' + local_path)
return False
Expand All @@ -95,14 +95,13 @@ def upload_local_directory_to_minio(local_path: str, bucket_name: str):
for local_file in glob.glob(local_path + '/**'):
local_file = local_file.replace(os.sep, "/")
if not os.path.isfile(local_file):
upload_local_directory_to_minio(local_file, bucket_name)
dir_name = os.path.basename(local_file)
sub_prefix = os.path.join(minio_prefix, dir_name) if minio_prefix else dir_name
upload_local_directory_to_minio(local_file, bucket_name, sub_prefix)
else:
log.debug(f'Examining {str(local_file)}...')
file_path_head = os.path.split(local_file)[0]
remote_prefix = os.sep.join(file_path_head.split(os.sep)[-2:])

remote_path = os.path.join(remote_prefix, local_file[1 + len(local_path):])
log.info(f'Uploading {local_path} -> {remote_path}...')
file_name = os.path.basename(local_file)
remote_path = os.path.join(minio_prefix, file_name) if minio_prefix else file_name
log.info(f'Uploading {local_file} -> {remote_path}...')
minioClient.fput_object(bucket_name=bucket_name, object_name=remote_path, file_path=local_file)


Expand Down Expand Up @@ -172,6 +171,10 @@ def send_notification_email(self, job_id, job_type, updated_job, new_phase):
results_url = f'{openenzyemdb_frontend_url}/enzyme-recommendation/result/{updated_job.job_id}'
job_type_name = 'OpenEnzymeDB - Enzyme Recommendation'

elif job_type == JobType.ML_SIMPLEFOLD:
# SimpleFold jobs don't have a frontend URL yet - skip email for now
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May need to circle back to fill out the email on success/failure eventually

But this is kind of an internal job, so maybe we don't need to notify at all on this one?

return

# OED & CLEANDB jobs are very fast - no need to send notification email
elif job_type.startswith('oed-') or job_type.startswith('cleandb-'):
#self.logger.warning(f'WARNING: Skipping sending notification email for {job_type} - {job_id}')
Expand Down
23 changes: 23 additions & 0 deletions app/services/simplefold_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from fastapi import HTTPException
from starlette.responses import PlainTextResponse

from config import get_logger
from services.minio_service import MinIOService
from sqlmodel.ext.asyncio.session import AsyncSession

log = get_logger(__name__)


class SimpleFoldService:
    """Result retrieval for SimpleFold protein structure prediction jobs."""

    @staticmethod
    async def resultPostProcess(bucket_name: str, job_id: str, service: MinIOService, db: AsyncSession):
        """
        Return the predicted structure (CIF text) produced by a SimpleFold job.

        MinIO layout (bucket name: ml-simplefold):
            Inputs:  /{job_id}/in/<fasta_file>
            Outputs: /{job_id}/out/

        :param bucket_name: bucket to read the result from
        :param job_id: id of the job whose output is requested
        :param service: MinIO service used to fetch the object
        :param db: database session; accepted for signature parity, not used here
        :return: PlainTextResponse holding the UTF-8 decoded CIF file
        :raises HTTPException: 404 when the CIF object cannot be fetched
        """
        # Object key of the CIF file this service serves for the job
        output_prefix = f"{job_id}/out/predictions_simplefold_100M"
        cif_key = f"{output_prefix}/input_sampled_0.cif"

        raw_cif = service.get_file(bucket_name, cif_key)
        if raw_cif is not None:
            return PlainTextResponse(raw_cif.decode('utf-8'))

        # get_file returned None: report the missing output to the caller
        raise HTTPException(status_code=404, detail=f"Output CIF file not found for job {job_id}")
10 changes: 8 additions & 2 deletions app/services/templates/job.tpl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ spec:
{%- endfor %}
initContainers:
- name: init
image: moleculemaker/mmli-backend:kubejob
image: moleculemaker/mmli-backend:pr-101
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good eye on this! Sorry, I forgot that I had hardcoded these image tags 🙏

We may need to remember to circle back and update these after merging the PR

imagePullPolicy: Always
# securityContext:
# runAsUser: 0
Expand Down Expand Up @@ -120,9 +120,15 @@ spec:
limits:
cpu: "{{ resources.limits.cpu }}"
memory: "{{ resources.limits.memory }}"
{%- if resources.limits['nvidia.com/gpu'] is defined %}
nvidia.com/gpu: "{{ resources.limits['nvidia.com/gpu'] }}"
{%- endif %}
requests:
cpu: "{{ resources.requests.cpu }}"
memory: "{{ resources.requests.memory }}"
{%- if resources.requests['nvidia.com/gpu'] is defined %}
nvidia.com/gpu: "{{ resources.requests['nvidia.com/gpu'] }}"
{%- endif %}
{%- if command %}
command:
- 'bash'
Expand Down Expand Up @@ -181,7 +187,7 @@ spec:
{%- endfor %}
containers:
- name: post-job
image: "moleculemaker/mmli-backend:kubejob"
image: "moleculemaker/mmli-backend:pr-101"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good eye on this! Sorry, I forgot that I had hardcoded these image tags 🙏

We may need to remember to circle back and update these after merging the PR

imagePullPolicy: Always
command:
- 'bash'
Expand Down
18 changes: 13 additions & 5 deletions chart/values.mmli2.prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,20 @@ config:
operator: Exists

oed-cheminfo:
nodeSelector:
ncsa.role: worker-job
tolerations:
- effect: NoSchedule
key: mmli.role
nodeSelector:
ncsa.role: worker-job
tolerations:
- effect: NoSchedule
key: mmli.role
operator: Exists
# Scheduling overrides for SimpleFold jobs in this environment
ml-simplefold:
# NOTE(review): the job template hunk visible in this change does not show where runtimeClassName is rendered - confirm it is consumed
runtimeClassName: nvidia
# Only schedule onto nodes carrying the nvidia.com/gpu.present=true label
nodeSelector:
nvidia.com/gpu.present: "true"
# operator Exists: tolerate the nvidia.com/gpu NoSchedule taint whatever its value
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
external:
chemscraper:
apiBaseUrl: "http://chemscraper-services.alphasynthesis.svc.cluster.local:8000"
Expand Down
18 changes: 13 additions & 5 deletions chart/values.mmli2.staging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,20 @@ config:
key: nvidia.com/gpu
operator: Exists
oed-cheminfo:
nodeSelector:
ncsa.role: worker-job
tolerations:
- effect: NoSchedule
key: mmli.role
nodeSelector:
ncsa.role: worker-job
tolerations:
- effect: NoSchedule
key: mmli.role
operator: Exists
# Scheduling overrides for SimpleFold jobs in this environment
ml-simplefold:
# NOTE(review): the job template hunk visible in this change does not show where runtimeClassName is rendered - confirm it is consumed
runtimeClassName: nvidia
# Only schedule onto nodes carrying the nvidia.com/gpu.present=true label
nodeSelector:
nvidia.com/gpu.present: "true"
# operator Exists: tolerate the nvidia.com/gpu NoSchedule taint whatever its value
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
minio:
server: "mmli-backend-staging-minio.staging.svc.cluster.local:9000"
apiBaseUrl: "minioapi.mmli.fastapi.staging.mmli2.ncsa.illinois.edu"
Expand Down
7 changes: 7 additions & 0 deletions chart/values.prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,13 @@ config:
- effect: NoSchedule
key: mmli.role
operator: Exists
# Scheduling overrides for SimpleFold jobs in this environment
ml-simplefold:
# Only schedule onto nodes carrying the nvidia.com/gpu.present=true label
nodeSelector:
nvidia.com/gpu.present: "true"
# operator Exists: tolerate the nvidia.com/gpu NoSchedule taint whatever its value
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
external:
chemscraper:
apiBaseUrl: "http://chemscraper-services.alphasynthesis.svc.cluster.local:8000"
Expand Down
7 changes: 7 additions & 0 deletions chart/values.staging.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ config:
- effect: NoSchedule
key: mmli.role
operator: Exists
# Scheduling overrides for SimpleFold jobs in this environment
ml-simplefold:
# Only schedule onto nodes carrying the nvidia.com/gpu.present=true label
nodeSelector:
nvidia.com/gpu.present: "true"
# operator Exists: tolerate the nvidia.com/gpu NoSchedule taint whatever its value
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
minio:
server: "mmli-backend-staging-minio.staging.svc.cluster.local:9000"
apiBaseUrl: "minioapi.mmli.fastapi.staging.mmli1.ncsa.illinois.edu"
Expand Down
32 changes: 32 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,38 @@ config:
subPath: "uws/jobs/oed-cheminfo/JOB_ID/out"
claimName: "mmli-shared-job-data"

# Config for running SimpleFold protein structure prediction job
ml-simplefold:
image: "davidbianchi/ml-simplefold:version1"
imagePullPolicy: "IfNotPresent"
# Environment variables for the job container.
# NOTE(review): these look like the NVIDIA container runtime's device/capability selectors - confirm,
# and confirm the job template actually renders an 'env' list (not visible in this change)
env:
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
# All three mounts use the same claim ("mmli-shared-job-data"); only the subPath differs.
# NOTE(review): JOB_ID in a subPath is presumably replaced with the real job id at submission time - confirm
volumes:
# Job input directory
- name: "shared-storage"
mountPath: "/workspace/input"
subPath: "uws/jobs/ml-simplefold/JOB_ID/in"
claimName: "mmli-shared-job-data"
# Job output directory
- name: "shared-storage"
mountPath: "/workspace/output"
subPath: "uws/jobs/ml-simplefold/JOB_ID/out"
claimName: "mmli-shared-job-data"
# Torch hub cache; this subPath has no JOB_ID, so the same directory is mounted for every ml-simplefold job
- name: "shared-storage"
mountPath: "/root/.cache/torch/hub"
subPath: "uws/jobs/ml-simplefold/.cache"
claimName: "mmli-shared-job-data"
# One GPU is both requested and set as the limit; CPU and memory requests are lower than their limits
resources:
limits:
cpu: "16"
memory: "32Gi"
nvidia.com/gpu: "1"
requests:
cpu: "4"
memory: "24Gi"
nvidia.com/gpu: "1"

defaults:
# Kubeconfig (credentials + cluster) used to run this job in Kubernetes
kubeconfig: "/opt/kubeconfig"
Expand Down