-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathdata_analysis_env.py
More file actions
215 lines (196 loc) · 7.73 KB
/
data_analysis_env.py
File metadata and controls
215 lines (196 loc) · 7.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import hashlib
import logging
import shutil
from typing import Any, cast
import time
from aviary.core import (
EvalAnswerMode,
Frame,
Message,
Messages,
Tool,
)
from lmi.cost_tracker import GLOBAL_COST_TRACKER, enable_cost_tracking
from .notebook_env import NBEnvironment
from .utils import NBLanguage, MultipleChoiceQuestion, nb_to_html
from . import prompts
from . import config as cfg
logger = logging.getLogger(__name__)
CORRECT_MSG = "Correct answer!"
INCORRECT_MSG = "Incorrect answer."
class DataAnalysisEnv(NBEnvironment):
    """Notebook environment for open-ended data-analysis problems.

    Extends :class:`NBEnvironment` with a problem statement shown as the
    initial observation, optional multiple-choice questions, answer
    submission/evaluation metadata, and the ability to exclude tools by name.
    """

    def __init__(
        self,
        *,
        problem_id: str,
        problem: str,
        answer: str | int | float | None = None,  # noqa: PYI041
        system_prompt: str | None = None,
        correct_reward: float = 1.0,
        eval_mode: EvalAnswerMode | None = None,
        metadata: dict[str, Any] | None = None,  # used for NBEvalExpt
        mcqs: list[MultipleChoiceQuestion] | None = None,
        exclude_tools: list[str] | None = None,
        **kwargs,
    ):
        """Initialize the environment.

        Args:
            problem_id: Stable identifier for the problem instance.
            problem: Problem statement presented to the agent on reset.
            answer: Ground-truth answer, if known.
            system_prompt: Optional system message added to the initial
                observation.
            correct_reward: Reward granted for a correct answer.
            eval_mode: How submitted answers are evaluated.
            metadata: Extra experiment metadata (used for NBEvalExpt).
            mcqs: Optional multiple-choice questions for this problem.
            exclude_tools: Tool function names to remove from the tool list.
            **kwargs: Forwarded to :class:`NBEnvironment`.
        """
        super().__init__(**kwargs)
        self.problem_id = problem_id
        self.problem = problem
        self.mcqs = mcqs
        self.answer = answer
        self.eval_mode = eval_mode
        self.correct_reward = correct_reward
        self.system_prompt = system_prompt
        self.metadata = metadata
        # Per-question rewards keyed by question id, filled in during evaluation.
        self.question_rewards: dict[str, int] = {}
        self.exclude_tools = exclude_tools

    async def reset(self) -> tuple[Messages, list[Tool]]:
        """Reset the environment and build the initial observation.

        Returns:
            A tuple of (initial observation messages, available tools).
        """
        # Discard base class's init_obs and make our own with the problem statement
        _, tools = await super().reset()
        if self.exclude_tools:
            tools = [
                tool
                for tool in tools
                if tool._tool_fn.__name__ not in self.exclude_tools
            ]
        messages = [
            Message(content=self.problem),
            self.get_env_state_msg(),
        ]
        # If the list_workdir tool is excluded, add the content of the working directory to the initial message
        if self.exclude_tools is not None and "list_workdir" in self.exclude_tools:
            messages.append(
                Message(
                    content=f"Here is the content of your working directory:\n{self.list_workdir()}"
                )
            )
        # NOTE(review): the system message is appended AFTER the user-facing
        # messages — confirm downstream consumers expect this ordering.
        if self.system_prompt:
            messages.append(Message(role="system", content=self.system_prompt))
        init_obs = cast(Messages, messages)
        # Use the module logger (lazy %-args) instead of bare print() for debug output.
        logger.debug("Initial observation: %s", init_obs)
        logger.debug("Available tools: %s", tools)
        return init_obs, tools

    async def submit_answer(self, answer: str) -> str:  # type: ignore[override]
        """Submit an answer to the problem.
        Note that this tool may only be called once and ends the episode.
        Args:
            answer: The answer to the problem
        """
        # TODO: support various eval modes
        self.state.answer = answer
        self.state.done = True
        logger.info("Submitting answer and closing environment")
        await self.close()
        logger.info("Answer: %s", answer)
        return answer

    def export_frame(self) -> Frame:
        """Snapshot the current episode state and metadata as a Frame."""
        return Frame(
            state={
                "last_action": self.state.actions[-1] if self.state.actions else None,
                "answer": self.state.answer,
                "done": self.state.done,
                "total_reward": self.state.total_reward,
                "nb_state": self.state.nb,
                "nb_state_html": nb_to_html(self.state.nb),
                "nb_runtime_errors": self.state.notebook_runtime_errors,
            },
            info={
                "eval_mode": self.eval_mode,
                "language": self.state.language,
                "problem": self.problem,
                "problem_id": self.problem_id,
                # Lifetime LLM spend accumulated by the global cost tracker.
                "cost": GLOBAL_COST_TRACKER.lifetime_cost_usd,
            },
        )

    @classmethod
    def from_task(
        cls,
        task: str,
        gcs_artifact_path: str | None = None,
        environment_config: dict[str, Any] | None = None,
    ) -> "DataAnalysisEnv":
        """
        Perform data analysis on a user query.
        Args:
            task: The user query
            gcs_artifact_path: The path to the GCS artifact – required for evaluation on crow jobs
            environment_config: A JSON string of environment configuration
        """
        logger.info("User task: %s", task[:100])
        logger.info("GCS artifact path: %s", gcs_artifact_path)
        logger.info("environment_config: %s", environment_config)
        # Track cost of running the environment
        enable_cost_tracking()
        if (
            not gcs_artifact_path
        ):  # Platform jobs should always be associated with data from a GCS bucket
            raise NotImplementedError(
                "Running crow jobs without gcs_artifact_path is not supported"
            )
        if environment_config:
            # Only forward configuration keys that the constructor chain accepts.
            kwargs = {
                k: v
                for k, v in environment_config.items()
                if k in cfg.VALID_FROM_TASK_KWARGS
            }
        else:
            kwargs = {}
            environment_config = {}
        logger.info("Filtered kwargs: %s", kwargs)
        # Hash the task text so repeated runs of the same query share an id prefix.
        task_hash = hashlib.sha256(task.encode()).hexdigest()
        if environment_config.get("eval", False):
            logger.info("Eval mode is True")
            # Create a temporary directory in GCP mounted storage volume
            trajectory_path = cfg.DATA_STORAGE_PATH / f"{task_hash}-{time.time()}"
            trajectory_path.mkdir(parents=True, exist_ok=True)
            # Copy the GCS artifact contents into the fresh per-trajectory directory
            # so the eval run cannot mutate the shared source data.
            for item in (cfg.DATA_STORAGE_PATH / gcs_artifact_path).iterdir():
                if item.is_file():
                    shutil.copy2(item, trajectory_path)
                elif item.is_dir():
                    shutil.copytree(
                        item, trajectory_path / item.name, dirs_exist_ok=True
                    )
        else:
            logger.info("Eval mode is False")
            # Use the GCP folder created when uploading the data via the platform
            trajectory_path = cfg.DATA_STORAGE_PATH / gcs_artifact_path
        # Augment incoming user query with CoT instructions
        task = (
            f"Here is the user query to address:\n"
            f"<query>\n"
            f"{task}\n"
            f"</query>\n"
            f"{prompts.CHAIN_OF_THOUGHT_AGNOSTIC.format(language=kwargs.get('language', 'PYTHON'))}\n"
            f"{prompts.GENERAL_NOTEBOOK_GUIDELINES.format(language=kwargs.get('language', 'PYTHON'))}"
        )
        logger.info("Trajectory path: %s", trajectory_path)
        nb_path = trajectory_path / NBEnvironment.NOTEBOOK_NAME
        logger.info("NB path: %s", nb_path)
        language = getattr(NBLanguage, environment_config.get("language", "PYTHON"))
        # Overwrite the language in the kwargs with NBLanguage enum
        kwargs["language"] = language
        logger.info("Language: %s", language.name)
        if language == NBLanguage.R:
            task += f"\n{prompts.R_OUTPUT_RECOMMENDATION_PROMPT}"
        if trajectory_path.exists():
            files = list(trajectory_path.iterdir())
            logger.info("Files in directory: %s", [f.name for f in files])
            if not files:
                raise ValueError(
                    f"No files found in trajectory path: {trajectory_path}"
                )
        else:
            raise ValueError(f"Trajectory path does not exist: {trajectory_path}")
        return cls(
            problem_id=f"data-analysis-task-{task_hash}",
            problem=task,
            eval_mode=EvalAnswerMode.LLM,
            nb_path=nb_path,
            work_dir=trajectory_path,
            system_prompt=prompts.CAPSULE_SYSTEM_PROMPT_QUERY,
            use_tmp_work_dir=False,
            **kwargs,
        )