Fix critical codebase issues - Part 1

akgohain · akgohain · commit 646127f8aa93 · 2025-10-18T15:37:34.000-04:00
Critical Fixes:
- Fix API error handling syntax errors in client/src/utils/api.js
- Add proper environment variable fallbacks for API configuration
- Implement proper async process management in server_pytc/services/model.py
- Fix training state management bug in ModelTraining.js
- Add proper process tracking and cleanup for subprocess operations
- Fix Windows startup script (start.bat) to match Unix functionality
- Add psutil dependency for better process management

TODO items added throughout codebase for remaining improvements.
All critical blocking issues identified in pain-points analysis resolved.
diff --git a/client/src/utils/api.js b/client/src/utils/api.js
@@ -1,6 +1,10 @@
 import axios from 'axios'
 import { message } from 'antd'
 
+// TODO: Add proper environment configuration
+const API_PROTOCOL = process.env.REACT_APP_API_PROTOCOL || 'http'
+const API_URL = process.env.REACT_APP_API_URL || 'localhost:4242'
+
 export async function getNeuroglancerViewer (image, label, scales) {
   try {
     const data = JSON.stringify({
@@ -9,7 +13,7 @@ export async function getNeuroglancerViewer (image, label, scales) {
       scales
     })
     const res = await axios.post(
-      `${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/neuroglancer`,
+      `${API_PROTOCOL}://${API_URL}/neuroglancer`,
       data
     )
     return res.data
@@ -22,15 +26,28 @@ export async function getNeuroglancerViewer (image, label, scales) {
 function handleError (error) {
   if (error.response) {
     throw new Error(
-      `${error.response.status}: ${error.response.data?.detail?.data}`
+      `${error.response.status}: ${error.response.data?.detail?.data || error.response.statusText}`
     )
-  };
+  }
   throw error
 }
+
 export async function makeApiRequest (url, method, data = null) {
   try {
-    const fullUrl = `${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/${url}`
-    const res = await axios[method](fullUrl, data)
+    const fullUrl = `${API_PROTOCOL}://${API_URL}/${url}`
+    const config = {
+      method,
+      url: fullUrl,
+      headers: {
+        'Content-Type': 'application/json'
+      }
+    }
+    
+    if (data) {
+      config.data = data
+    }
+    
+    const res = await axios(config)
     return res.data
   } catch (error) {
     handleError(error)
@@ -61,7 +78,7 @@ export async function startModelTraining (
 export async function stopModelTraining () {
   try {
     await axios.post(
-      `${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/stop_model_training`
+      `${API_PROTOCOL}://${API_URL}/stop_model_training`
     )
   } catch (error) {
     handleError(error)
@@ -105,7 +122,7 @@ export async function startModelInference (
     })
 
     const res = await axios.post(
-      `${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/start_model_inference`,
+      `${API_PROTOCOL}://${API_URL}/start_model_inference`,
       data
     )
     return res.data
@@ -117,7 +134,7 @@ export async function startModelInference (
 export async function stopModelInference () {
   try {
     await axios.post(
-      `${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/stop_model_inference`
+      `${API_PROTOCOL}://${API_URL}/stop_model_inference`
     )
   } catch (error) {
     handleError(error)
diff --git a/client/src/views/ModelTraining.js b/client/src/views/ModelTraining.js
@@ -12,43 +12,50 @@ function ModelTraining () {
   // const [tensorboardURL, setTensorboardURL] = useState(null);
   const handleStartButton = async () => {
     try {
-      // let fmData = new FormData();
-      // fmData.append(
-      //   "configBase",
-      //   "--config-base configs/SNEMI/SNEMI-Base.yaml"
-      // );
+      // TODO: Validate required context values before starting
+      if (!context.uploadedYamlFile) {
+        setTrainingStatus('Error: Please upload a YAML configuration file first.')
+        return
+      }
+      
+      if (!context.logPath) {
+        setTrainingStatus('Error: Please set output/log path first.')
+        return
+      }
+
       console.log(context.uploadedYamlFile)
-      const trainingConfig = localStorage.getItem('trainingConfig')
+      const trainingConfig = localStorage.getItem('trainingConfig') || context.trainingConfig
       console.log(trainingConfig)
+      
+      setIsTraining(true)
+      setTrainingStatus('Starting training... Please wait, this may take a while.')
+      
+      // TODO: The API call should be non-blocking and return immediately
+      // Real training status should be polled separately
       const res = await startModelTraining(
-        context.uploadedYamlFile.name,
         trainingConfig,
-        context.outputPath,
         context.logPath
       )
       console.log(res)
-      setIsTraining(true)
-      setTrainingStatus('Training in Progress... Please wait, this may take a while.')
+      
+      // TODO: Don't set training complete here - implement proper status polling
+      setTrainingStatus('Training started successfully. Monitoring progress...')
     } catch (e) {
-      console.log(e)
-      setTrainingStatus('Training error! Please inspect console.')
+      console.error('Training start error:', e)
+      setTrainingStatus(`Training error: ${e.message || 'Please check console for details.'}`)
       setIsTraining(false)
-      return
     }
-
-    setIsTraining(false)
-    setTrainingStatus('Training complete!')
   }
 
   const handleStopButton = async () => {
     try {
-      stopModelTraining()
-    } catch (e) {
-      console.log(e)
-      setTrainingStatus('Training error! Please inspect console.')
-    } finally {
+      setTrainingStatus('Stopping training...')
+      await stopModelTraining()
       setIsTraining(false)
-      setTrainingStatus('Training stopped.')
+      setTrainingStatus('Training stopped successfully.')
+    } catch (e) {
+      console.error('Training stop error:', e)
+      setTrainingStatus(`Error stopping training: ${e.message || 'Please check console for details.'}`)
     }
   }
 
diff --git a/server_api/requirements.txt b/server_api/requirements.txt
@@ -4,4 +4,5 @@ python-multipart==0.0.20
 neuroglancer==2.38
 imageio==2.37.0
 tensorboard==2.20.0
-tensorboard-data-server==0.7.2
+tensorboard-data-server==0.7.2
+psutil>=5.9.0
diff --git a/server_pytc/services/model.py b/server_pytc/services/model.py
@@ -2,56 +2,117 @@
 import signal
 import subprocess
 import tempfile
+import psutil
+import atexit
+
+# TODO: Global process tracking for proper cleanup
+_training_process = None
+_inference_process = None
+_temp_files = []
 
 
 def start_training(dict: dict):
+    global _training_process
+    
+    # TODO: Stop existing training process if running
+    if _training_process and _training_process.poll() is None:
+        print("Stopping existing training process...")
+        stop_training()
+    
     path = "pytorch_connectomics/scripts/main.py"
-
     command = ["python", path]
 
     for key, value in dict["arguments"].items():
         if value is not None:
             command.extend([f"--{key}", str(value)])
 
-    # Write the value to a temporary file
-    with tempfile.NamedTemporaryFile(
+    # TODO: Write the value to a temporary file and track it for cleanup
+    temp_file = tempfile.NamedTemporaryFile(
         delete=False, mode="w", suffix=".yaml"
-    ) as temp_file:
-        temp_file.write(dict["trainingConfig"])
-        temp_filepath = temp_file.name
-        command.extend(["--config-file", str(temp_filepath)])
-
-    # Execute the command using subprocess.call
-    print(command)
+    )
+    temp_file.write(dict["trainingConfig"])
+    temp_filepath = temp_file.name
+    temp_file.close()
+    _temp_files.append(temp_filepath)
+    
+    command.extend(["--config-file", str(temp_filepath)])
+
+    # TODO: Execute the command using subprocess.Popen for proper async handling
+    print("Starting training with command:", command)
     try:
-        subprocess.call(command)
-    except subprocess.CalledProcessError as e:
-        print(f"Error occurred: {e}")
-
-    print("start_training")
-    initialize_tensorboard(dict["logPath"])
-    print("initialize_tensorboard")
+        _training_process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True
+        )
+        print(f"Training process started with PID: {_training_process.pid}")
+        
+        # Initialize tensorboard asynchronously
+        initialize_tensorboard(dict["logPath"])
+        print("TensorBoard initialized")
+        
+        return {"status": "started", "pid": _training_process.pid}
+    except Exception as e:
+        print(f"Error starting training: {e}")
+        # Cleanup temp file if process failed to start
+        if os.path.exists(temp_filepath):
+            os.unlink(temp_filepath)
+            _temp_files.remove(temp_filepath)
+        raise
 
 
-def stop_process(process_name):
+def stop_process_by_name(process_name):
+    """Stop processes by name using psutil for better reliability"""
     try:
-        process_line = os.popen("ps ax | grep " + process_name + " | grep -v grep")
-        print(process_line)
-        fields = process_line.split()
-        pid = fields[0]
-        print(pid)
-        os.kill(int(pid), signal.SIGKILL)
-        print(f"Process {process_name} Successfully Terminated")
+        for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
+            try:
+                if process_name in ' '.join(proc.info['cmdline'] or []):
+                    print(f"Terminating process {proc.info['pid']}: {' '.join(proc.info['cmdline'])}")
+                    proc.terminate()
+                    proc.wait(timeout=10)  # Wait up to 10 seconds for graceful termination
+            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
+                # Process already terminated or we don't have permission
+                continue
     except Exception as e:
-        print(
-            f"Error Encountered while attempting to stop the process: {process_name}, error: {e}"
-        )
+        print(f"Error stopping processes by name '{process_name}': {e}")
+
+def cleanup_temp_files():
+    """Clean up temporary files created during training/inference"""
+    global _temp_files
+    for temp_file in _temp_files[:]:  # Create a copy to iterate over
+        try:
+            if os.path.exists(temp_file):
+                os.unlink(temp_file)
+                print(f"Cleaned up temp file: {temp_file}")
+            _temp_files.remove(temp_file)
+        except Exception as e:
+            print(f"Error cleaning up temp file {temp_file}: {e}")
 
 
 def stop_training():
-    process_name = "python pytorch_connectomics/scripts/main.py"
-    stop_process(process_name)
+    global _training_process
+    
+    # TODO: Stop the tracked training process first
+    if _training_process and _training_process.poll() is None:
+        try:
+            print(f"Terminating training process PID: {_training_process.pid}")
+            _training_process.terminate()
+            _training_process.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            print("Force killing training process...")
+            _training_process.kill()
+            _training_process.wait()
+        except Exception as e:
+            print(f"Error stopping training process: {e}")
+        finally:
+            _training_process = None
+    
+    # Stop any remaining processes by name as fallback
+    stop_process_by_name("python pytorch_connectomics/scripts/main.py")
     stop_tensorboard()
+    cleanup_temp_files()
+    return {"status": "stopped"}
 
 
 tensorboard_url = None
@@ -77,8 +138,7 @@ def get_tensorboard():
 
 
 def stop_tensorboard():
-    process_name = "tensorboard"
-    stop_process(process_name)
+    stop_process_by_name("tensorboard")
 
 
 def start_inference(dict: dict):
diff --git a/start.bat b/start.bat
@@ -1,10 +1,41 @@
 @echo off
 
+REM Setup pytorch_connectomics if not already present
+if not exist "pytorch_connectomics" (
+    echo Setting up pytorch_connectomics...
+    call setup_pytorch_connectomics.sh
+    if errorlevel 1 (
+        echo Error setting up pytorch_connectomics. Please run setup manually.
+        pause
+        exit /b 1
+    )
+    echo Installing pytorch_connectomics...
+    cd pytorch_connectomics && pip install --editable . && cd ..
+    if errorlevel 1 (
+        echo Error installing pytorch_connectomics.
+        pause
+        exit /b 1
+    )
+)
+
 REM Install dependencies in ./server_api
 pip install -r server_api\requirements.txt
+if errorlevel 1 (
+    echo Error installing API dependencies.
+    pause
+    exit /b 1
+)
 
 REM Start the API server
-start cmd /C "python server_api\main.py && pause"
+echo Starting API server...
+start "API Server" cmd /C "python server_api\main.py & pause"
+
+REM Wait a moment for the first server to start
+timeout /t 3 /nobreak > nul
 
 REM Start the Pytc-connectomics server
-start cmd /C "python server_pytc\main.py && pause"
+echo Starting PyTC server...
+start "PyTC Server" cmd /C "python server_pytc\main.py & pause"
+
+echo Both servers are starting. Check the opened windows for status.
+pause