Merge pull request #4874 from yuwenma/promote-wip

google-oss-prow[bot] · web-flow · commit e674baf4e031 · 2025-07-30T13:15:48.000Z
llm-eval: run root Gemini CLI (and config) with different KCC repos
diff --git a/.gitignore b/.gitignore
@@ -176,4 +176,4 @@ experiments/mcp/**/__pycache__
 experiments/mcp/**/*.egg-info
 
 # MCP eval
-experiments/mcp-eval/**/__pycache__
+experiments/**/__pycache__/
diff --git a/experiments/llm-eval/evaluator.py b/experiments/llm-eval/evaluator.py
@@ -24,10 +24,19 @@
 
 STOP_TOKEN="soapoirejwpgoijrepoiqjt"
 class MCPEvaluator:
-    def __init__(self, gemini_cli_path="gemini", mcp_config_path="~/.gemini/settings.json", use_mcp=True, log_path=None):
+    def __init__(self, gemini_cli_path="gemini", src_mcp_config_path="~/.gemini/settings.json", use_mcp=True, log_path=None):
         self.gemini_cli_path = gemini_cli_path
-        self.mcp_config_path = os.path.expanduser(mcp_config_path)
         self.use_mcp = use_mcp
+        
+        if self.use_mcp:
+            config_to_write = {}
+            if src_mcp_config_path:
+                config_path = os.path.expanduser(src_mcp_config_path)
+                with open(config_path, 'r') as f:
+                    config_to_write = json.load(f)
+
+            self.mcp_config = config_to_write
+
         self.log_path = log_path
         self.test_results = [] # To store detailed results
         self.metrics = defaultdict(float) # For aggregated metrics
@@ -37,22 +46,6 @@ def __init__(self, gemini_cli_path="gemini", mcp_config_path="~/.gemini/settings
             with open(self.log_path, 'w') as f:
                 f.write("--- Evaluation Log ---\n\n")
 
-    def setup_mcp_config(self, config_data=None):
-        """
-        Writes MCP server configuration to the Gemini CLI settings.json.
-        If use_mcp is False, an empty config is written.
-        """
-        expanded_path = os.path.expanduser(self.mcp_config_path)
-        os.makedirs(os.path.dirname(expanded_path), exist_ok=True)
-        
-        config_to_write = {}
-        if self.use_mcp and config_data:
-            config_to_write = config_data
-
-        with open(expanded_path, 'w') as f:
-            json.dump(config_to_write, f, indent=4)
-        print(f"MCP configuration written to: {expanded_path}")
-
     def _get_git_root(self):
         try:
             return subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True).strip()
@@ -91,7 +84,7 @@ def run_test_case(self, name, prompt, verifier_script=None, cleanup_script=None,
         effective_cwd = task_dir
         # Run setup script
         if setup_script:
-            success, stdout, stderr = self._run_script(setup_script, task_dir)
+            success, stdout, stderr = self._run_script(setup_script, effective_cwd)
             if not success:
                 print(f"Setup script {setup_script} failed. Skipping test.")
                 print(f"  Stdout:\n{stdout}")
@@ -110,12 +103,22 @@ def run_test_case(self, name, prompt, verifier_script=None, cleanup_script=None,
                 }
                 self.test_results.append(result)
                 return
-            effective_cwd = os.path.join(task_dir, stdout.strip())
             
         print(f"--- Running LLM: {prompt} ---")
-        command = "/mcp" if self.use_mcp else None
+        print(f"--- effective_cwd: {effective_cwd} ---")
+        gemini_working_dir = os.path.join(effective_cwd, stdout.strip())
+        print(f"--- LLM response: {stdout} ---")
+        
+        if self.use_mcp:
+            # Write MCP configuration to the .gemini/settings.json file
+            mcp_config_path = os.path.join(gemini_working_dir, ".gemini", "settings.json")
+            os.makedirs(os.path.dirname(mcp_config_path), exist_ok=True)
+            with open(mcp_config_path, 'w') as f:
+                json.dump(self.mcp_config, f, indent=4)
+            print(f"MCP configuration written to: {mcp_config_path}")
+
         start_time = time.time()
-        stdout, stderr, returncode, llm_requests = self._run_gemini_command_internal(command, prompt, cwd=effective_cwd)
+        stdout, stderr, returncode, llm_requests = self._run_gemini_command_internal(prompt, cwd=gemini_working_dir)
         end_time = time.time()
         latency = (end_time - start_time) * 1000 # in ms
         print(f"--- LLM response: {stdout} ---")
@@ -239,31 +242,30 @@ def get_summary(self):
         )
         return summary
         
-    def _run_gemini_command_internal(self, command, prompt, cwd=None):
+    def _run_gemini_command_internal(self, prompt, cwd=None):
         """
         Runs a Gemini CLI command and captures its output and error, printing it in real-time.
         The command is terminated when 'User notified.' is detected in the output.     
         
         Args:
-            command (str): The Gemini CLI command (e.g., "/mcp", "ask").
             prompt (str): The prompt to send to Gemini CLI.
             cwd (str): The working directory to run the command in.
 
         Returns:
             tuple: A tuple containing (stdout, stderr, returncode, llm_requests_count).
         """
-        gemini_cli_path = self.gemini_cli_path
-        prompt=f"cd {cwd}\n" + prompt +f"\nOnce you are done. return {STOP_TOKEN}"
-        args = [gemini_cli_path, "-d", "-p", prompt, "-y"]  # Use -p for non-interactive mode
-
-        if command:
-            args.insert(1, command)  # Insert command after 'gemini'
-
         env = os.environ.copy()
         # mcp is under a different python virtual env.
         env.pop('VIRTUAL_ENV', None)
+
+        effective_cwd = cwd if cwd else self.git_root
+        env['MCPWorkDir'] = effective_cwd
         
-        effective_cwd = self.git_root
+        gemini_cli_path = self.gemini_cli_path
+        # The working directory is conveyed via env['MCPWorkDir'] above, not in the prompt; only the stop-token instruction is appended here.
+        prompt_with_context = prompt + f"\nOnce you are done. return {STOP_TOKEN}"
+        args = [gemini_cli_path, "-d", "-p", prompt_with_context, "-y"]  # Use -p for non-interactive mode
+
         try:
             process = subprocess.Popen(
                 args,
@@ -279,8 +281,8 @@ def _run_gemini_command_internal(self, command, prompt, cwd=None):
             stdout_output = []
             stderr_output = []
 
-            streams = [process.stdout, process.stderr]                                                
-            terminated = False                                                                        
+            streams = [process.stdout, process.stderr]
+            terminated = False
             while streams:                                                                            
                 readable, _, _ = select.select(streams, [], [], 1) # 1s timeout                       
                 if not readable:                                                                      
@@ -324,10 +326,10 @@ def _run_gemini_command_internal(self, command, prompt, cwd=None):
             llm_requests = len(re.findall(r"LLM API request sent", stderr))
             return stdout, stderr, process.returncode, llm_requests
 
-        except FileNotFoundError:
+        except FileNotFoundError as e:
             print(f"Error: Gemini CLI not found at '{gemini_cli_path}'. "
                   "Please ensure it's installed and in your system's PATH.")
-            return "", f"Gemini CLI not found at '{gemini_cli_path}'", 127, 0
+            return "", f"Gemini CLI not found at '{gemini_cli_path}': {e}", 127, 0
         except Exception as e:
             print(f"An error occurred: {e}")
             return "", str(e), 1, 0
diff --git a/experiments/llm-eval/main.py b/experiments/llm-eval/main.py
@@ -156,7 +156,6 @@ def compare_reports(mcp_df, no_mcp_df):
         # --- Run with MCP Disabled ---
         print("\n--- Starting Evaluation with MCP Disabled ---")
         no_mcp_evaluator = MCPEvaluator(gemini_cli_path=args.gemini_cli_path, use_mcp=False, log_path=args.log)
-        no_mcp_evaluator.setup_mcp_config()
         for test in test_cases:
             no_mcp_evaluator.run_test_case(**test)
         no_mcp_results_df = no_mcp_evaluator.generate_report()
@@ -166,23 +165,7 @@ def compare_reports(mcp_df, no_mcp_df):
     else:
         # --- Run with MCP Enabled ---
         print("--- Starting Evaluation with MCP Enabled ---")
-        mcp_config = {}
-        if args.config_path:
-            config_path = os.path.expanduser(args.config_path)
-            if not os.path.isabs(config_path):
-                config_path = os.path.join(git_root, config_path)
-            
-            with open(config_path, 'r') as f:
-                mcp_config = json.load(f)
-
-            # Make server directories absolute
-            if "mcp_servers" in mcp_config:
-                for server in mcp_config["mcp_servers"]:
-                    if "directory" in server and not os.path.isabs(server["directory"]):
-                        server["directory"] = os.path.join(git_root, server["directory"])
-        
-        mcp_evaluator = MCPEvaluator(gemini_cli_path=args.gemini_cli_path, use_mcp=True, log_path=args.log)
-        mcp_evaluator.setup_mcp_config(mcp_config)
+        mcp_evaluator = MCPEvaluator(gemini_cli_path=args.gemini_cli_path, use_mcp=True, src_mcp_config_path=args.config_path, log_path=args.log)
         for test in test_cases:
             mcp_evaluator.run_test_case(**test)
         mcp_results_df = mcp_evaluator.generate_report()
@@ -194,7 +177,6 @@ def compare_reports(mcp_df, no_mcp_df):
         if not args.task:
             print("\n--- Starting Evaluation with MCP Disabled ---")
             no_mcp_evaluator = MCPEvaluator(gemini_cli_path=args.gemini_cli_path, use_mcp=False, log_path=args.log)
-            no_mcp_evaluator.setup_mcp_config()
             for test in test_cases:
                 no_mcp_evaluator.run_test_case(**test)
             no_mcp_results_df = no_mcp_evaluator.generate_report()
diff --git a/experiments/llm-eval/tasks/beta-promote/BigQueryReservationReservation-promote/setup.sh b/experiments/llm-eval/tasks/beta-promote/BigQueryReservationReservation-promote/setup.sh
@@ -16,4 +16,7 @@
 set -e
 SUBDIR="BigQueryReservationReservation-promote-bigqueryreservation"
 git clone git@github.com:GoogleCloudPlatform/k8s-config-connector.git $SUBDIR
-echo $SUBDIR
+rm -rf $SUBDIR/.gemini # Avoid using git cloned .gemini.
+
+export MCPWorkDir=${SUBDIR} # Placeholder: the evaluator runs setup.sh in a temporary shell, so this export does not persist after the script exits.
+echo ${MCPWorkDir}
diff --git a/experiments/llm-eval/tasks/beta-promote/KMSImportJob-promote/setup.sh b/experiments/llm-eval/tasks/beta-promote/KMSImportJob-promote/setup.sh
@@ -16,4 +16,7 @@
 set -e
 SUBDIR="KMSImportJob-promote-kms"
 git clone git@github.com:GoogleCloudPlatform/k8s-config-connector.git $SUBDIR
-echo $SUBDIR
+rm -rf $SUBDIR/.gemini # Avoid using git cloned .gemini.
+
+export MCPWorkDir=${SUBDIR} # Placeholder: the evaluator runs setup.sh in a temporary shell, so this export does not persist after the script exits.
+echo ${MCPWorkDir}
diff --git a/experiments/llm-eval/tasks/beta-promote/LoggingLink-promote/setup.sh b/experiments/llm-eval/tasks/beta-promote/LoggingLink-promote/setup.sh
@@ -16,4 +16,7 @@
 set -e
 SUBDIR="LoggingLink-promote-logging"
 git clone git@github.com:GoogleCloudPlatform/k8s-config-connector.git $SUBDIR
-echo $SUBDIR
+rm -rf $SUBDIR/.gemini # Avoid using git cloned .gemini.
+
+export MCPWorkDir=${SUBDIR} # Placeholder: the evaluator runs setup.sh in a temporary shell, so this export does not persist after the script exits.
+echo ${MCPWorkDir}
diff --git a/experiments/llm-eval/tasks/beta-promote/MetastoreBackup-promote/setup.sh b/experiments/llm-eval/tasks/beta-promote/MetastoreBackup-promote/setup.sh
@@ -16,4 +16,7 @@
 set -e
 SUBDIR="MetastoreBackup-promote-metastore"
 git clone git@github.com:GoogleCloudPlatform/k8s-config-connector.git $SUBDIR
-echo $SUBDIR
+rm -rf $SUBDIR/.gemini # Avoid using git cloned .gemini.
+
+export MCPWorkDir=${SUBDIR} # Placeholder: the evaluator runs setup.sh in a temporary shell, so this export does not persist after the script exits.
+echo ${MCPWorkDir}
diff --git a/experiments/llm-eval/tasks/beta-promote/MetastoreFederation-promote/setup.sh b/experiments/llm-eval/tasks/beta-promote/MetastoreFederation-promote/setup.sh
@@ -16,4 +16,7 @@
 set -e
 SUBDIR="MetastoreFederation-promote-metastore"
 git clone git@github.com:GoogleCloudPlatform/k8s-config-connector.git $SUBDIR
-echo $SUBDIR
+rm -rf $SUBDIR/.gemini # Avoid using git cloned .gemini.
+
+export MCPWorkDir=${SUBDIR} # Placeholder: the evaluator runs setup.sh in a temporary shell, so this export does not persist after the script exits.
+echo ${MCPWorkDir}
diff --git a/experiments/llm-eval/tasks/beta-promote/VMwareEngineExternalAddress-promote/setup.sh b/experiments/llm-eval/tasks/beta-promote/VMwareEngineExternalAddress-promote/setup.sh
@@ -16,4 +16,7 @@
 set -e
 SUBDIR="VMwareEngineExternalAddress-promote-vmwareengine"
 git clone git@github.com:GoogleCloudPlatform/k8s-config-connector.git $SUBDIR
-echo $SUBDIR
+rm -rf $SUBDIR/.gemini # Avoid using git cloned .gemini.
+
+export MCPWorkDir=${SUBDIR} # Placeholder: the evaluator runs setup.sh in a temporary shell, so this export does not persist after the script exits.
+echo ${MCPWorkDir}
diff --git a/experiments/mcp/criteria/apis.json b/experiments/mcp/criteria/apis.json
@@ -28,6 +28,6 @@
       "**Completeness**: The YAML must include all required fields as defined in the CRD.",
       "**Correctness**: User-provided configurations must be placed at the correct paths within the YAML structure.",
       "**Validity**: All values, especially names and labels, must adhere to Kubernetes syntax and naming conventions.",
-      "**Minimalism**: The generated YAML should not include optional fields unless they are explicitly requested or required for a specific configuration.",
+      "**Minimalism**: The generated YAML should not include optional fields unless they are explicitly requested or required for a specific configuration."
     ]
   }
diff --git a/experiments/mcp/src/pkg/promotion.py b/experiments/mcp/src/pkg/promotion.py
diff --git a/experiments/mcp/src/run.py b/experiments/mcp/src/run.py
diff --git a/experiments/mcp/src/server.py b/experiments/mcp/src/server.py

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,6 @@`
`28`	`28`	`"Completeness: The YAML must include all required fields as defined in the CRD.",`
`29`	`29`	`"Correctness: User-provided configurations must be placed at the correct paths within the YAML structure.",`
`30`	`30`	`"Validity: All values, especially names and labels, must adhere to Kubernetes syntax and naming conventions.",`
`31`		`- "Minimalism: The generated YAML should not include optional fields unless they are explicitly requested or required for a specific configuration.",`
	`31`	`+ "Minimalism: The generated YAML should not include optional fields unless they are explicitly requested or required for a specific configuration."`
`32`	`32`	`]`
`33`	`33`	`}`