Skip to content

Commit b76ad46

Browse files
authored
Fix custom deployment (#8)
1 parent 1f4f7fc commit b76ad46

File tree

7 files changed

+31
-27
lines changed

7 files changed

+31
-27
lines changed

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies = [
1010
"aiodocker==0.24.0",
1111
"fhaviary[server]==0.18.1",
1212
"fhlmi==0.26.0",
13+
"fh-llm-client==0.0.11", # TODO deprecate this
1314
"ldp==0.23.0",
1415
"pandas==2.2.3",
1516
"numpy==2.2.3",
@@ -22,7 +23,8 @@ dependencies = [
2223
"google-auth==2.38.0",
2324
"google-cloud-storage==3.0.0",
2425
"google-cloud-secret-manager==2.23.0",
25-
"crow-client>=0.3.14",
26+
"crow-client>=0.3.14", # TODO deprecate this
27+
"futurehouse-client", # TODO pin this
2628
"jupyter==1.1.1",
2729
"nbconvert==7.16.6",
2830
"notebook==7.3.2",

src/fhda/Dockerfile.custom_deployment

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ ENV PYTHONPATH="/app/miniconda/lib/python3.12/site-packages:${PYTHONPATH:-}"
3131
RUN pip3 install --no-cache-dir uv==0.5.21
3232
RUN conda install -c conda-forge mamba -y
3333

34-
3534
# Install R and kernels in the crow_env environment
3635
RUN mamba install -c conda-forge -y \
3736
r-base=4.3.3 \
@@ -91,6 +90,7 @@ RUN mamba install -c conda-forge -c bioconda -y \
9190
gseapy=1.1.4 \
9291
blast=2.16.0 \
9392
clipkit=2.3.0 \
93+
clustalo=1.2.4 \
9494
fastqc=0.12.1 \
9595
iqtree=2.3.6 \
9696
mafft=7.526 \
@@ -116,7 +116,7 @@ FROM base AS builder
116116

117117
ARG MODULE_NAME
118118
ARG USE_INTERNAL_DEPS
119-
ARG USE_GIT_CROW_CLIENT
119+
ARG USE_GIT_FUTUREHOUSE_CLIENT
120120

121121

122122
RUN mkdir -p ~/.ssh && \
@@ -150,12 +150,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \
150150
else \
151151
echo 'Skipping aviary_internal install'; \
152152
fi && \
153-
if [ "$USE_GIT_CROW_CLIENT" = "true" ]; then \
153+
if [ "$USE_GIT_FUTUREHOUSE_CLIENT" = "true" ]; then \
154154
git clone git@github.com:Future-House/crow-ecosystem.git /app/crow-ecosystem && \
155-
cd /app/crow-ecosystem/packages/crow-client && \
155+
cd /app/crow-ecosystem/packages/futurehouse-client && \
156156
uv pip install --system -e .; \
157157
else \
158-
uv pip install --system crow-client; \
158+
uv pip install --system futurehouse-client; \
159159
fi
160160

161161
WORKDIR /app/${MODULE_NAME}
@@ -174,6 +174,11 @@ RUN --mount=type=ssh \
174174
RUN find /app -type l -delete && \
175175
rm -rf /app/.git
176176

177+
# Fix futurehouse_client imports by creating a symlink if we're using the git version
178+
RUN if [ "$USE_GIT_FUTUREHOUSE_CLIENT" = "true" ]; then \
179+
ln -sf /app/crow-ecosystem/packages/futurehouse-client/futurehouse_client /app/miniconda/lib/python3.12/site-packages/futurehouse_client; \
180+
fi
181+
177182
FROM base AS runtime
178183

179184
COPY --from=builder /app/ /app/

src/fhda/data_analysis_env.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def from_task(
117117
gcs_artifact_path: The path to the GCS artifact – required for evaluation on crow jobs
118118
environment_config: A JSON string of environment configuration
119119
"""
120-
logger.info("User task: %s", task)
120+
logger.info("User task: %s", task[:100])
121121
logger.info("GCS artifact path: %s", gcs_artifact_path)
122122
logger.info("environment_config: %s", environment_config)
123123
# Track cost of running the environment
@@ -137,9 +137,10 @@ def from_task(
137137
}
138138
else:
139139
kwargs = {}
140+
environment_config = {}
140141
logger.info("Filtered kwargs: %s", kwargs)
141142
task_hash = hashlib.sha256(task.encode()).hexdigest()
142-
if kwargs.get("eval", False):
143+
if environment_config.get("eval", False):
143144
logger.info("Eval mode is True")
144145
# Create a temporary directory in GCP mounted storage volume
145146
trajectory_path = cfg.DATA_STORAGE_PATH / f"{task_hash}-{time.time()}"

src/fhda/prompts.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@
7070
- Ensure each cell executes successfully before moving to the next.
7171
- Assume you already have the packages you need installed and only install new ones if you receive errors.
7272
- If you need to install packages, use pip.
73-
- All cells are by default Python cells. Use python for all analysis.
73+
- All cells are by default Python cells. Use python or bash tools for all analysis.
74+
- You can use bash cells by adding %%bash to the first line of the cell or running a subprocess.
7475
"""
7576

7677
GENERAL_NOTEBOOK_GUIDELINES_R = """
@@ -223,7 +224,7 @@
223224
224225
4. Execute Analysis Plan:
225226
<analysis_planning>
226-
- For each step in your analysis plan, list the Python functions and libraries you'll use.
227+
- For each step in your analysis plan, list the Python or bash functions and libraries you'll use.
227228
- Think about how to structure your code for readability and efficiency.
228229
- Plan how to document your code with clear comments.
229230
- Consider how to present results clearly, using tables or visualizations where appropriate.

src/scripts/deploy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def rename_dockerfile(path: Path, new_name: str):
6060
if __name__ == "__main__":
6161
client = CrowClient(
6262
# stage=Stage.from_string(os.environ.get("CROW_ENV", ENV_VARS["STAGE"])),
63-
stage=Stage.from_string(os.environ.get("CROW_ENV", "LOCAL")),
63+
stage=Stage.from_string(os.environ.get("CROW_ENV", "PROD")),
6464
organization="FutureHouse",
6565
auth_type=AuthType.API_KEY,
6666
api_key=os.environ[f"CROW_API_KEY_{ENV_VARS['STAGE']}"],

src/scripts/platform_eval.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
import pandas as pd
88
import logging
99
from pathlib import Path
10-
from crow_client import CrowClient
11-
from crow_client.models import AuthType, Stage, JobResponse
10+
from crow_client import CrowClient, JobResponseVerbose
11+
from crow_client.models import AuthType, Stage
1212
from aviary.utils import MultipleChoiceQuestion, eval_answer, EvalAnswerMode
1313

1414

@@ -77,7 +77,7 @@ async def fetch_jobs_batch(
7777
List of fetched jobs
7878
"""
7979

80-
async def get_job_async(job_id: str) -> JobResponse:
80+
async def get_job_async(job_id: str) -> JobResponseVerbose:
8181
return await asyncio.to_thread(
8282
client.get_job, job_id, False, True
8383
) # False for history, True for verbose
@@ -327,7 +327,7 @@ async def main(
327327
parser.add_argument(
328328
"--job-file-path",
329329
type=str,
330-
default="local/bixbench_runs/baseline-3.7-single-cell-run2-20250325-065452.json",
330+
default="local/bixbench_runs/bb50k_v2-20250412-094827.json",
331331
help="Path to Job data file with all the job IDs",
332332
)
333333
parser.add_argument(
@@ -337,7 +337,7 @@ async def main(
337337
help="Path to save evaluation results",
338338
)
339339
parser.add_argument(
340-
"--batch-size", type=int, default=50, help="Batch size for job requests"
340+
"--batch-size", type=int, default=200, help="Batch size for job requests"
341341
)
342342
parser.add_argument(
343343
"--api-key",

src/scripts/platform_run_jobs.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
logger = logging.getLogger(__name__)
1818

1919
ENV = "PROD"
20-
JOB_NAME = "job-futurehouse-data-analysis-crow"
21-
CROW_STAGE = getattr(Stage, "LOCAL") # TODO: Change to ENV
20+
JOB_NAME = "job-futurehouse-data-analysis-crow-high"
21+
CROW_STAGE = getattr(Stage, ENV)
2222
API_KEY = os.environ.get(f"CROW_API_KEY_{ENV}")
2323
DATASET_NAME = "bb50k"
2424
if DATASET_NAME == "bixbench":
@@ -142,19 +142,14 @@ async def load_bb50k_data(
142142
open_question: bool = True,
143143
) -> list[dict[str, Any]]:
144144
"""Load the BixBench dataset."""
145-
data = json.load(
146-
open(
147-
"local/bb50k/ngs_analysis_rna_seq_dge_dataset_0_qa_metadata_questions_20250404_210834.json"
148-
)
149-
)
150-
data = data["questions"]
145+
data = json.load(open("local/bb50k/single_dataset_per_wf.json"))
151146
processed_data = []
152147
for i in data:
153148
processed_data.append(
154149
{
155-
"data_folder": GCS_ARTIFACT_PATH + "dataset0",
150+
"data_folder": f"{GCS_ARTIFACT_PATH}/{i['workflow']}/{i['dataset'].replace('dataset_', '')}",
156151
"short_id": i["qa_id"],
157-
"categories": i["generator_class"],
152+
"generator_class": i["generator_class"],
158153
"uuid": i["qa_id"],
159154
"domain": i["domain"],
160155
"workflow": i["workflow"],
@@ -248,7 +243,7 @@ async def main():
248243
raise ValueError(f"Dataset {DATASET_NAME} not supported")
249244

250245
if MINI_MODE:
251-
data = data[:5]
246+
data = data[:2]
252247

253248
jobs = await submit_jobs(data)
254249
await save_results(jobs, RESULTS_FILE)

0 commit comments

Comments (0)