Skip to content

Commit 646127f

Browse files
committed
Fix critical codebase issues - Part 1
Critical Fixes: - Fix API error handling syntax errors in client/src/utils/api.js - Add proper environment variable fallbacks for API configuration - Implement proper async process management in server_pytc/services/model.py - Fix training state management bug in ModelTraining.js - Add proper process tracking and cleanup for subprocess operations - Fix Windows startup script (start.bat) to match Unix functionality - Add psutil dependency for better process management TODO items added throughout codebase for remaining improvements. All critical blocking issues identified in pain-points analysis resolved.
1 parent ac00fce commit 646127f

File tree

5 files changed

+181
-65
lines changed

5 files changed

+181
-65
lines changed

client/src/utils/api.js

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import axios from 'axios'
22
import { message } from 'antd'
33

4+
// TODO: Add proper environment configuration
5+
const API_PROTOCOL = process.env.REACT_APP_API_PROTOCOL || 'http'
6+
const API_URL = process.env.REACT_APP_API_URL || 'localhost:4242'
7+
48
export async function getNeuroglancerViewer (image, label, scales) {
59
try {
610
const data = JSON.stringify({
@@ -9,7 +13,7 @@ export async function getNeuroglancerViewer (image, label, scales) {
913
scales
1014
})
1115
const res = await axios.post(
12-
`${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/neuroglancer`,
16+
`${API_PROTOCOL}://${API_URL}/neuroglancer`,
1317
data
1418
)
1519
return res.data
@@ -22,15 +26,28 @@ export async function getNeuroglancerViewer (image, label, scales) {
2226
function handleError (error) {
2327
if (error.response) {
2428
throw new Error(
25-
`${error.response.status}: ${error.response.data?.detail?.data}`
29+
`${error.response.status}: ${error.response.data?.detail?.data || error.response.statusText}`
2630
)
27-
};
31+
}
2832
throw error
2933
}
34+
3035
export async function makeApiRequest (url, method, data = null) {
3136
try {
32-
const fullUrl = `${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/${url}`
33-
const res = await axios[method](fullUrl, data)
37+
const fullUrl = `${API_PROTOCOL}://${API_URL}/${url}`
38+
const config = {
39+
method,
40+
url: fullUrl,
41+
headers: {
42+
'Content-Type': 'application/json'
43+
}
44+
}
45+
46+
if (data) {
47+
config.data = data
48+
}
49+
50+
const res = await axios(config)
3451
return res.data
3552
} catch (error) {
3653
handleError(error)
@@ -61,7 +78,7 @@ export async function startModelTraining (
6178
export async function stopModelTraining () {
6279
try {
6380
await axios.post(
64-
`${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/stop_model_training`
81+
`${API_PROTOCOL}://${API_URL}/stop_model_training`
6582
)
6683
} catch (error) {
6784
handleError(error)
@@ -105,7 +122,7 @@ export async function startModelInference (
105122
})
106123

107124
const res = await axios.post(
108-
`${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/start_model_inference`,
125+
`${API_PROTOCOL}://${API_URL}/start_model_inference`,
109126
data
110127
)
111128
return res.data
@@ -117,7 +134,7 @@ export async function startModelInference (
117134
export async function stopModelInference () {
118135
try {
119136
await axios.post(
120-
`${process.env.REACT_APP_API_PROTOCOL}://${process.env.REACT_APP_API_URL}/stop_model_inference`
137+
`${API_PROTOCOL}://${API_URL}/stop_model_inference`
121138
)
122139
} catch (error) {
123140
handleError(error)

client/src/views/ModelTraining.js

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,43 +12,50 @@ function ModelTraining () {
1212
// const [tensorboardURL, setTensorboardURL] = useState(null);
1313
const handleStartButton = async () => {
1414
try {
15-
// let fmData = new FormData();
16-
// fmData.append(
17-
// "configBase",
18-
// "--config-base configs/SNEMI/SNEMI-Base.yaml"
19-
// );
15+
// TODO: Validate required context values before starting
16+
if (!context.uploadedYamlFile) {
17+
setTrainingStatus('Error: Please upload a YAML configuration file first.')
18+
return
19+
}
20+
21+
if (!context.logPath) {
22+
setTrainingStatus('Error: Please set output/log path first.')
23+
return
24+
}
25+
2026
console.log(context.uploadedYamlFile)
21-
const trainingConfig = localStorage.getItem('trainingConfig')
27+
const trainingConfig = localStorage.getItem('trainingConfig') || context.trainingConfig
2228
console.log(trainingConfig)
29+
30+
setIsTraining(true)
31+
setTrainingStatus('Starting training... Please wait, this may take a while.')
32+
33+
// TODO: The API call should be non-blocking and return immediately
34+
// Real training status should be polled separately
2335
const res = await startModelTraining(
24-
context.uploadedYamlFile.name,
2536
trainingConfig,
26-
context.outputPath,
2737
context.logPath
2838
)
2939
console.log(res)
30-
setIsTraining(true)
31-
setTrainingStatus('Training in Progress... Please wait, this may take a while.')
40+
41+
// TODO: Don't set training complete here - implement proper status polling
42+
setTrainingStatus('Training started successfully. Monitoring progress...')
3243
} catch (e) {
33-
console.log(e)
34-
setTrainingStatus('Training error! Please inspect console.')
44+
console.error('Training start error:', e)
45+
setTrainingStatus(`Training error: ${e.message || 'Please check console for details.'}`)
3546
setIsTraining(false)
36-
return
3747
}
38-
39-
setIsTraining(false)
40-
setTrainingStatus('Training complete!')
4148
}
4249

4350
const handleStopButton = async () => {
4451
try {
45-
stopModelTraining()
46-
} catch (e) {
47-
console.log(e)
48-
setTrainingStatus('Training error! Please inspect console.')
49-
} finally {
52+
setTrainingStatus('Stopping training...')
53+
await stopModelTraining()
5054
setIsTraining(false)
51-
setTrainingStatus('Training stopped.')
55+
setTrainingStatus('Training stopped successfully.')
56+
} catch (e) {
57+
console.error('Training stop error:', e)
58+
setTrainingStatus(`Error stopping training: ${e.message || 'Please check console for details.'}`)
5259
}
5360
}
5461

server_api/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ python-multipart==0.0.20
44
neuroglancer==2.38
55
imageio==2.37.0
66
tensorboard==2.20.0
7-
tensorboard-data-server==0.7.2
7+
tensorboard-data-server==0.7.2
8+
psutil>=5.9.0

server_pytc/services/model.py

Lines changed: 92 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,56 +2,117 @@
22
import signal
33
import subprocess
44
import tempfile
5+
import psutil
6+
import atexit
7+
8+
# TODO: Global process tracking for proper cleanup
9+
_training_process = None
10+
_inference_process = None
11+
_temp_files = []
512

613

714
def start_training(dict: dict):
15+
global _training_process
16+
17+
# TODO: Stop existing training process if running
18+
if _training_process and _training_process.poll() is None:
19+
print("Stopping existing training process...")
20+
stop_training()
21+
822
path = "pytorch_connectomics/scripts/main.py"
9-
1023
command = ["python", path]
1124

1225
for key, value in dict["arguments"].items():
1326
if value is not None:
1427
command.extend([f"--{key}", str(value)])
1528

16-
# Write the value to a temporary file
17-
with tempfile.NamedTemporaryFile(
29+
# TODO: Write the value to a temporary file and track it for cleanup
30+
temp_file = tempfile.NamedTemporaryFile(
1831
delete=False, mode="w", suffix=".yaml"
19-
) as temp_file:
20-
temp_file.write(dict["trainingConfig"])
21-
temp_filepath = temp_file.name
22-
command.extend(["--config-file", str(temp_filepath)])
23-
24-
# Execute the command using subprocess.call
25-
print(command)
32+
)
33+
temp_file.write(dict["trainingConfig"])
34+
temp_filepath = temp_file.name
35+
temp_file.close()
36+
_temp_files.append(temp_filepath)
37+
38+
command.extend(["--config-file", str(temp_filepath)])
39+
40+
# TODO: Execute the command using subprocess.Popen for proper async handling
41+
print("Starting training with command:", command)
2642
try:
27-
subprocess.call(command)
28-
except subprocess.CalledProcessError as e:
29-
print(f"Error occurred: {e}")
30-
31-
print("start_training")
32-
initialize_tensorboard(dict["logPath"])
33-
print("initialize_tensorboard")
43+
_training_process = subprocess.Popen(
44+
command,
45+
stdout=subprocess.PIPE,
46+
stderr=subprocess.PIPE,
47+
text=True
48+
)
49+
print(f"Training process started with PID: {_training_process.pid}")
50+
51+
# Initialize tensorboard asynchronously
52+
initialize_tensorboard(dict["logPath"])
53+
print("TensorBoard initialized")
54+
55+
return {"status": "started", "pid": _training_process.pid}
56+
except Exception as e:
57+
print(f"Error starting training: {e}")
58+
# Cleanup temp file if process failed to start
59+
if os.path.exists(temp_filepath):
60+
os.unlink(temp_filepath)
61+
_temp_files.remove(temp_filepath)
62+
raise
3463

3564

36-
def stop_process(process_name):
65+
def stop_process_by_name(process_name):
66+
"""Stop processes by name using psutil for better reliability"""
3767
try:
38-
process_line = os.popen("ps ax | grep " + process_name + " | grep -v grep")
39-
print(process_line)
40-
fields = process_line.split()
41-
pid = fields[0]
42-
print(pid)
43-
os.kill(int(pid), signal.SIGKILL)
44-
print(f"Process {process_name} Successfully Terminated")
68+
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
69+
try:
70+
if process_name in ' '.join(proc.info['cmdline'] or []):
71+
print(f"Terminating process {proc.info['pid']}: {' '.join(proc.info['cmdline'])}")
72+
proc.terminate()
73+
proc.wait(timeout=10) # Wait up to 10 seconds for graceful termination
74+
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired):
75+
# Process already terminated or we don't have permission
76+
continue
4577
except Exception as e:
46-
print(
47-
f"Error Encountered while attempting to stop the process: {process_name}, error: {e}"
48-
)
78+
print(f"Error stopping processes by name '{process_name}': {e}")
79+
80+
def cleanup_temp_files():
81+
"""Clean up temporary files created during training/inference"""
82+
global _temp_files
83+
for temp_file in _temp_files[:]: # Create a copy to iterate over
84+
try:
85+
if os.path.exists(temp_file):
86+
os.unlink(temp_file)
87+
print(f"Cleaned up temp file: {temp_file}")
88+
_temp_files.remove(temp_file)
89+
except Exception as e:
90+
print(f"Error cleaning up temp file {temp_file}: {e}")
4991

5092

5193
def stop_training():
52-
process_name = "python pytorch_connectomics/scripts/main.py"
53-
stop_process(process_name)
94+
global _training_process
95+
96+
# TODO: Stop the tracked training process first
97+
if _training_process and _training_process.poll() is None:
98+
try:
99+
print(f"Terminating training process PID: {_training_process.pid}")
100+
_training_process.terminate()
101+
_training_process.wait(timeout=10)
102+
except subprocess.TimeoutExpired:
103+
print("Force killing training process...")
104+
_training_process.kill()
105+
_training_process.wait()
106+
except Exception as e:
107+
print(f"Error stopping training process: {e}")
108+
finally:
109+
_training_process = None
110+
111+
# Stop any remaining processes by name as fallback
112+
stop_process_by_name("python pytorch_connectomics/scripts/main.py")
54113
stop_tensorboard()
114+
cleanup_temp_files()
115+
return {"status": "stopped"}
55116

56117

57118
tensorboard_url = None
@@ -77,8 +138,7 @@ def get_tensorboard():
77138

78139

79140
def stop_tensorboard():
80-
process_name = "tensorboard"
81-
stop_process(process_name)
141+
stop_process_by_name("tensorboard")
82142

83143

84144
def start_inference(dict: dict):

start.bat

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,41 @@
11
@echo off
22

3+
REM Setup pytorch_connectomics if not already present
4+
if not exist "pytorch_connectomics" (
5+
echo Setting up pytorch_connectomics...
6+
call setup_pytorch_connectomics.sh
7+
if errorlevel 1 (
8+
echo Error setting up pytorch_connectomics. Please run setup manually.
9+
pause
10+
exit /b 1
11+
)
12+
echo Installing pytorch_connectomics...
13+
cd pytorch_connectomics && pip install --editable . && cd ..
14+
if errorlevel 1 (
15+
echo Error installing pytorch_connectomics.
16+
pause
17+
exit /b 1
18+
)
19+
)
20+
321
REM Install dependencies in ./server_api
422
pip install -r server_api\requirements.txt
23+
if errorlevel 1 (
24+
echo Error installing API dependencies.
25+
pause
26+
exit /b 1
27+
)
528

629
REM Start the API server
7-
start cmd /C "python server_api\main.py && pause"
30+
echo Starting API server...
31+
start "API Server" cmd /C "python server_api\main.py & pause"
32+
33+
REM Wait a moment for the first server to start
34+
timeout /t 3 /nobreak > nul
835

936
REM Start the Pytc-connectomics server
10-
start cmd /C "python server_pytc\main.py && pause"
37+
echo Starting PyTC server...
38+
start "PyTC Server" cmd /C "python server_pytc\main.py & pause"
39+
40+
echo Both servers are starting. Check the opened windows for status.
41+
pause

0 commit comments

Comments
 (0)