Merge pull request #146 from ServiceNow:adding_darglint

recursix · web-flow · commit a46a2ed23108 · 2024-11-25T17:08:33.000-05:00
Adding darglint as workflow test
diff --git a/.github/workflows/darglint.yml b/.github/workflows/darglint.yml
@@ -0,0 +1,34 @@
+name: Darglint checks
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+
+  build:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip' # caching pip dependencies
+
+      - name: Pip install
+        run: pip install darglint
+
+      - name: Pip list
+        run: pip list
+
+      - name: Darglint checks
+        run: darglint -v 2 -z short .
diff --git a/src/agentlab/agents/agent_args.py b/src/agentlab/agents/agent_args.py
@@ -1,5 +1,5 @@
-from bgym import AbstractAgentArgs
 import bgym
+from bgym import AbstractAgentArgs
 
 
 class AgentArgs(AbstractAgentArgs):
@@ -28,6 +28,9 @@ def set_reproducibility_mode(self):
         as possible e.g. setting the temperature of the model to 0.
 
         This is only called when reproducibility is requested.
+
+        Raises:
+            NotImplementedError: If the agent does not support reproducibility.
         """
         raise NotImplementedError(
             f"set_reproducibility_mode is not implemented for agent_args {self.__class__.__name__}"
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -193,12 +193,19 @@
     add_missparsed_messages=True,
 )
 
+
 AGENT_8B = GenericAgentArgs(
     chat_model_args=CHAT_MODEL_ARGS_DICT["meta-llama/Meta-Llama-3-8B-Instruct"],
     flags=FLAGS_8B,
 )
 
 
+AGENT_LLAMA31_8B = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/meta-llama/llama-3.1-8b-instruct"],
+    flags=FLAGS_8B,
+)
+
+
 # GPT-4o default config
 FLAGS_GPT_4o = GenericPromptFlags(
     obs=dp.ObsFlags(
diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py
@@ -199,6 +199,8 @@ def make_repro_agent(agent_args: AgentArgs, exp_dir: Path | str):
         agent_args (AgentArgs): The original agent args.
         exp_dir (Path | str): The directory where the experiment was saved.
 
+    Returns:
+        ReproAgentArgs: The new agent args.
     """
     exp_dir = Path(exp_dir)
     assert isinstance(agent_args, GenericAgentArgs)
diff --git a/src/agentlab/agents/visualwebarena/agent.py b/src/agentlab/agents/visualwebarena/agent.py
@@ -188,6 +188,12 @@ def get_action(self, obs: Any) -> tuple[str, dict]:
         Replica of VisualWebArena agent
         https://github.com/web-arena-x/visualwebarena/blob/89f5af29305c3d1e9f97ce4421462060a70c9a03/agent/prompts/prompt_constructor.py#L211
         https://github.com/web-arena-x/visualwebarena/blob/89f5af29305c3d1e9f97ce4421462060a70c9a03/agent/prompts/prompt_constructor.py#L272
+
+        Args:
+            obs (Any): Observation from the environment
+
+        Returns:
+            tuple[str, dict]: Action and AgentInfo
         """
         user_messages = []
 
diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py
@@ -69,9 +69,7 @@ def set_index_from_variables(
         index_black_list: List of wildard patterns to match variables that
             should be excluded from the index.
         task_key: The key to use as the first level of the index.
-        force_at_leaste_one_variable: If True, force at least one variable in the
-            index. If no variable is found, the index will be set to
-            task_key + "agent.agent_name".
+        add_agent_and_benchmark: If True, add agent.agent_name and env.benchmark
     """
     df.reset_index(inplace=True)
     constants, variables, _ = get_constants_and_variables(df)
@@ -127,6 +125,7 @@ def load_result_df(
             should be included in the index.
         index_black_list: List of wildard patterns to match variables that
             should be excluded from the index.
+        remove_args_suffix: If True, remove the _args suffix from the columns
 
     Returns:
         pd.DataFrame: The result dataframe
@@ -733,17 +732,13 @@ def _categorize_error(row):
 
 
 def _benchmark_from_task_name(task_name: str):
-    """Extract the benchmark from the task name.
-    TODO should be more robost, e.g. handle workarna.L1, workarena.L2, etc.
-    """
+    """Extract the benchmark from the task name."""
+    # TODO should be more robust, e.g. handle workarena.L1, workarena.L2, etc.
     return task_name.split(".")[0]
 
 
 def summarize_study(result_df: pd.DataFrame) -> pd.DataFrame:
-    """Create a summary of the study.
-
-    Similar to global report, but handles single agent differently.
-    """
+    """Create a summary of the study. Similar to global report, but handles single agent differently."""
 
     levels = list(range(result_df.index.nlevels))
     return result_df.groupby(level=levels[1:]).apply(summarize)
diff --git a/src/agentlab/experiments/exp_utils.py b/src/agentlab/experiments/exp_utils.py
@@ -95,6 +95,9 @@ def add_dependencies(exp_args_list: list[ExpArgs], task_dependencies: dict[str,
     Returns:
         list[ExpArgs]
             The modified exp_args_list with dependencies added.
+
+    Raises:
+        ValueError: If the task_dependencies are not valid.
     """
 
     if task_dependencies is None or all([len(dep) == 0 for dep in task_dependencies.values()]):
diff --git a/src/agentlab/experiments/graph_execution_ray.py b/src/agentlab/experiments/graph_execution_ray.py
@@ -47,6 +47,17 @@ def poll_for_timeout(tasks: dict[str, ray.ObjectRef], timeout: float, poll_inter
 
     I tried various different methods for killing a job that hangs. so far it's
     the only one that seems to work reliably (hopefully)
+
+    Args:
+        tasks: dict[str, ray.ObjectRef]
+            Dictionary of task_id: task_ref
+        timeout: float
+            Timeout in seconds
+        poll_interval: float
+            Polling interval in seconds
+
+    Returns:
+        dict[str, Any]: Dictionary of task_id: result
     """
     task_list = list(tasks.values())
     task_ids = list(tasks.keys())
diff --git a/src/agentlab/experiments/launch_exp.py b/src/agentlab/experiments/launch_exp.py
@@ -4,6 +4,7 @@
 
 import bgym
 from browsergym.experiments.loop import ExpArgs, yield_all_exp_results
+
 from agentlab.experiments.exp_utils import run_exp
 
 
@@ -24,13 +25,16 @@ def run_experiments(
             Number of parallel jobs.
         exp_args_list: list[ExpArgs]
             List of ExpArgs objects.
-        exp_dir: Path
+        study_dir: Path
             Directory where the experiments will be saved.
         parallel_backend: str
             Parallel backend to use. Either "joblib", "ray" or "sequential".
             The only backend that supports webarena graph dependencies correctly is ray or sequential.
         avg_step_timeout: int
             Will raise a TimeoutError if the episode is not finished after env_args.max_steps * avg_step_timeout seconds.
+
+    Raises:
+        ValueError: If the parallel_backend is not recognized.
     """
 
     if len(exp_args_list) == 0:
@@ -110,6 +114,13 @@ def find_incomplete(study_dir: str | Path, include_errors=True):
             Find all incomplete experiments and relaunch them.
             - "incomplete_only": relaunch only the incomplete experiments.
             - "incomplete_or_error": relaunch incomplete or errors.
+
+    Returns:
+        list[ExpArgs]
+            List of ExpArgs objects to relaunch.
+
+    Raises:
+        ValueError: If the study_dir does not exist.
     """
     study_dir = Path(study_dir)
 
@@ -152,6 +163,16 @@ def _hide_completed(exp_result: bgym.ExpResult, include_errors: bool = True):
 
     This little hack, allows an elegant way to keep the task dependencies for e.g. webarena
     while skipping the tasks that are completed when relaunching.
+
+    Args:
+        exp_result: bgym.ExpResult
+            The experiment result to hide.
+        include_errors: bool
+            If True, include experiments that errored.
+
+    Returns:
+        ExpArgs
+            The ExpArgs object hidden if the experiment is completed.
     """
 
     hide = False
diff --git a/src/agentlab/experiments/reproducibility_util.py b/src/agentlab/experiments/reproducibility_util.py
@@ -59,7 +59,7 @@ def _get_git_username(repo: Repo) -> str:
     5. Environment variables (GIT_AUTHOR_NAME and GIT_COMMITTER_NAME)
 
     Args:
-        repo (git.Repo): A GitPython Repo object representing the Git repository.
+        repo (Repo): A GitPython Repo object representing the Git repository.
 
     Returns:
         str: The first non-None username found, or None if no username is found.
diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py
@@ -15,11 +15,7 @@
 from agentlab.analyze import inspect_results
 from agentlab.experiments import reproducibility_util as repro
 from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies
-from agentlab.experiments.launch_exp import (
-    find_incomplete,
-    non_dummy_count,
-    run_experiments,
-)
+from agentlab.experiments.launch_exp import find_incomplete, non_dummy_count, run_experiments
 
 logger = logging.getLogger(__name__)
 
@@ -39,24 +35,19 @@ def make_study(
             The agent configuration(s) to run. *IMPORTANT*: these objects will be pickled and
             unpickled.  Make sure they are imported from a package that is accessible from
             PYTHONPATH. Otherwise, it won't load in agentlab-xray.
-
         benchmark: bgym.Benchmark | str
             The benchmark to run the agents on. See bgym.DEFAULT_BENCHMARKS for the main ones. You
             can also make your own by modifying an existing one.
-
         logging_level_stdout: int
             The logging level for the stdout of the main script. Each job will have its own logging
             level that will save into file and can be seen in agentlab-xray.
-
         suffix: str
             A suffix to add to the study name. This can be useful to keep track of your experiments.
             By default the study name contains agent name, benchmark name and date.
-
         comment: str
             Extra comments from the authors of this study to be stored in the reproducibility
             information. Leave any extra information that can explain why results could be different
             than expected.
-
         ignore_dependencies: bool
             If True, ignore the dependencies of the tasks in the benchmark. *Use with caution.* So
             far, only WebArena and VisualWebArena have dependencies between tasks to minimize the
@@ -261,7 +252,15 @@ def load_exp_args_list(self):
     def set_reproducibility_info(self, strict_reproducibility=False, comment=None):
         """Gather relevant information that may affect the reproducibility of the experiment
 
-        e.g.: versions of BrowserGym, benchmark, AgentLab..."""
+        e.g.: versions of BrowserGym, benchmark, AgentLab...
+
+        Args:
+            strict_reproducibility: bool
+                If True, all modifications have to be committed before running the experiments.
+                Also, if relaunching a study, it will not be possible if the code has changed.
+            comment: str
+                Extra comment to add to the reproducibility information.
+        """
         agent_names = [a.agent_name for a in self.agent_args]
         info = repro.get_reproducibility_info(
             agent_names,
@@ -327,13 +326,14 @@ def _run(self, n_jobs=1, parallel_backend="joblib", strict_reproducibility=False
         Args:
             n_jobs: int
                 Number of parallel jobs.
-
             parallel_backend: str
                 Parallel backend to use. Either "joblib", "dask" or "sequential".
-
             strict_reproducibility: bool
                 If True, all modifications have to be committed before running the experiments.
                 Also, if relaunching a study, it will not be possible if the code has changed.
+
+        Raises:
+            ValueError: If the exp_args_list is None.
         """
 
         if self.exp_args_list is None:
@@ -357,10 +357,6 @@ def append_to_journal(self, strict_reproducibility=True):
         Args:
             strict_reproducibility: bool
                 If True, incomplete experiments will raise an error.
-
-        Raises:
-            ValueError: If the reproducibility information is not compatible
-                with the report.
         """
         _, summary_df, _ = self.get_results()
         repro.append_to_journal(
@@ -538,9 +534,16 @@ def _agents_on_benchmark(
             If True, the experiments will be run in demo mode.
         logging_level: int
             The logging level for individual jobs.
+        logging_level_stdout: int
+            The logging level for the stdout.
+        ignore_dependencies: bool
+            If True, the dependencies will be ignored and all experiments can be run in parallel.
 
     Returns:
         list[ExpArgs]: The list of experiments to run.
+
+    Raises:
+        ValueError: If multiple agents are run on a benchmark that requires manual reset.
     """
 
     if not isinstance(agents, (list, tuple)):
diff --git a/src/agentlab/llm/llm_utils.py b/src/agentlab/llm/llm_utils.py
@@ -63,20 +63,15 @@ def retry(
         messages (list): the list of messages so far. This list will be modified with
             the new messages and the retry messages.
         n_retry (int): the maximum number of sequential retries.
-        parser (function): a function taking a message and retruning a parsed value,
+        parser (callable): a function taking a message and returning a parsed value,
             or raising a ParseError
         log (bool): whether to log the retry messages.
-        min_retry_wait_time (float): the minimum wait time in seconds
-            after RateLimtError. will try to parse the wait time from the error
-            message.
-        rate_limit_max_wait_time (int): the maximum wait time in seconds
 
     Returns:
         dict: the parsed value, with a string at key "action".
 
     Raises:
-        RetryError: if the parser could not parse a valid value after n_retry retries.
-        RateLimitError: if the requests exceed the rate limit.
+        ParseError: if the parser could not parse the response after n_retry retries.
     """
     tries = 0
     while tries < n_retry:
diff --git a/src/agentlab/llm/prompt_templates.py b/src/agentlab/llm/prompt_templates.py
@@ -26,7 +26,7 @@ def format_message(self, message: dict) -> str:
         Formats a given message based on its type.
 
         Args:
-            message (BaseMessage): The message to be formatted.
+            message (dict): The message to be formatted.
 
         Returns:
             str: The formatted message.