18 changes: 0 additions & 18 deletions .env

This file was deleted.

5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
{
"python-envs.defaultEnvManager": "ms-python.python:conda",
"python-envs.defaultPackageManager": "ms-python.python:conda",
"python-envs.pythonProjects": []
}
78 changes: 39 additions & 39 deletions Dockerfile
@@ -1,39 +1,39 @@
# Stage 1: Build the frontend
FROM node:latest as frontend-builder
# Set the working directory
WORKDIR /app
# Install frontend dependencies
COPY geochemistrypi/frontend/package.json /app/
RUN yarn install
# Stage 2: Build the backend
FROM python:3.9-slim AS backend-builder
# Set the working directory
WORKDIR /app
# Install backend dependencies
COPY requirements/production.txt /app/
RUN pip install -r production.txt
# Special case for Debian OS, update package lists and install Git and Node.js
RUN apt-get update && apt-get install -y libgomp1 git
RUN apt-get update && apt-get install -y nodejs
RUN apt-get update && apt-get install -y npm
# Install Yarn
RUN npm install -g yarn
# Copy the rest of the code
COPY . .
# Expose the port
EXPOSE 8000 3001
# Mount the volume
VOLUME /app
# Dummy CMD to prevent container from exiting immediately
CMD ["tail", "-f", "/dev/null"]
# Stage 1: Build the frontend
FROM node:latest as frontend-builder

# Set the working directory
WORKDIR /app

# Install frontend dependencies
COPY geochemistrypi/frontend/package.json /app/
RUN yarn install

# Stage 2: Build the backend
FROM python:3.9-slim AS backend-builder

# Set the working directory
WORKDIR /app

# Install backend dependencies
COPY requirements/production.txt /app/
RUN pip install -r production.txt

# Special case for Debian OS, update package lists and install Git and Node.js
RUN apt-get update && apt-get install -y libgomp1 git
RUN apt-get update && apt-get install -y nodejs
RUN apt-get update && apt-get install -y npm

# Install Yarn
RUN npm install -g yarn

# Copy the rest of the code
COPY . .

# Expose the port
EXPOSE 8000 3001

# Mount the volume
VOLUME /app

# Dummy CMD to prevent container from exiting immediately
CMD ["tail", "-f", "/dev/null"]
12 changes: 6 additions & 6 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -214,7 +214,7 @@ def _data_requirement_print():
if my_os == "Windows" or my_os == "Linux":
if not check_package("basemap"):
print("[bold red]Downloading Basemap...[/bold red]")
install_package("basemap")
install_package("basemap==1.3.8")
print("[bold green]Successfully downloading![/bold green]")
print("[bold green]Download happens only once![/bold green]")
clear_output()
@@ -560,7 +560,7 @@ def _data_requirement_print():
print("Selected sub data set to create Y data set:")
show_data_columns(data_selected_imputed_fe.columns)
print("The selected Y data set:")
print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
print("Notice: You can now choose multiple columns for Y (multi-output regression).")
print("Notice: For classification model training, please choose the label column which has distinctive integers.")
y = create_sub_data_set(data_selected_imputed_fe, allow_empty_columns=False)
print("Successfully create Y data set.")
@@ -779,7 +779,7 @@ def _data_requirement_print():
inference_data_fe_selected_dropped = inference_data_fe_selected.dropna()
inference_data_fe_selected_dropped_name = inference_data_name.dropna()
inference_name_column_drop = inference_data_fe_selected_dropped_name[NAME]
model_inference(inference_data_fe_selected_dropped, inference_name_column_drop, is_inference, run, transformer_config, transform_pipeline)
model_inference(inference_data_fe_selected_dropped, inference_name_column_drop, is_inference, run, transformer_config, transform_pipeline, y.columns.tolist())
save_data(
inference_data_fe_selected_dropped,
inference_name_column_drop,
@@ -789,7 +789,7 @@ def _data_requirement_print():
)
else:
inference_name_column = inference_data[NAME]
model_inference(inference_data_fe_selected, inference_name_column, is_inference, run, transformer_config, transform_pipeline)
model_inference(inference_data_fe_selected, inference_name_column, is_inference, run, transformer_config, transform_pipeline, y.columns.tolist())
clear_output()

# <--- Data Dumping --->
@@ -836,7 +836,7 @@ def _data_requirement_print():
inference_data_fe_selected_dropped = inference_data_fe_selected.dropna()
inference_data_fe_selected_dropped_name = inference_data_name.dropna()
inference_name_column_drop = inference_data_fe_selected_dropped_name[NAME]
model_inference(inference_data_fe_selected_dropped, inference_name_column_drop, is_inference, run, transformer_config, transform_pipeline)
model_inference(inference_data_fe_selected_dropped, inference_name_column_drop, is_inference, run, transformer_config, transform_pipeline, y.columns.tolist())
save_data(
inference_data_fe_selected_dropped,
inference_name_column_drop,
@@ -846,7 +846,7 @@ def _data_requirement_print():
)
else:
inference_name_column = inference_data[NAME]
model_inference(inference_data_fe_selected, inference_name_column, is_inference, run, transformer_config, transform_pipeline)
model_inference(inference_data_fe_selected, inference_name_column, is_inference, run, transformer_config, transform_pipeline, y.columns.tolist())
clear_output()

# <--- Data Dumping --->
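Note on the multi-target change above: the Y prompt now allows several target columns, and y.columns.tolist() is threaded into model_inference so that predictions can be labelled after the training targets. As a rough sketch of why this works end to end, scikit-learn regressors such as LinearRegression accept a two-dimensional target and predict one column per target; the DataFrame, column names, and values below are purely illustrative and are not taken from the pipeline.

import pandas as pd
from sklearn.linear_model import LinearRegression

# Toy data: two features and two target columns (multi-output regression).
X = pd.DataFrame({"SiO2": [50.1, 47.3, 52.8, 49.0], "MgO": [7.2, 8.9, 6.1, 7.8]})
y = pd.DataFrame({"Ni": [120.0, 145.0, 98.0, 130.0], "Cr": [310.0, 355.0, 270.0, 325.0]})

model = LinearRegression().fit(X, y)  # y may hold one or many columns
pred = model.predict(X)               # shape: (n_samples, n_targets)

# Label the predictions the same way model_inference now does when y_columns is passed.
pred_df = pd.DataFrame(pred, columns=[f"Predicted_{col}" for col in y.columns])
print(pred_df.head())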
130 changes: 129 additions & 1 deletion geochemistrypi/data_mining/dash_pipeline.py
@@ -3,7 +3,7 @@
import dash
import flask
import pandas as pd
from dash import dash_table, dcc, html
from dash import Input, Output, State, dash_table, dcc, html
from dash.dependencies import Input, Output

from .data.data_readiness import read_data
@@ -52,6 +52,60 @@ def dash_pipeline(requests_pathname_prefix: str = None) -> dash.Dash:
data=[],
page_size=10,
),
# New: regression analysis UI
html.H2(children="Part 2: Regression Analysis"),
html.Div(
[
html.Label("Select X variables (features):"),
dcc.Dropdown(
id="x-variables-dropdown",
multi=True,
placeholder="Select X variables...",
),
],
style={"margin": "10px"},
),
html.Div(
[
html.Label("Select Y variables (targets) - 支持多列Y:"),
dcc.Dropdown(
id="y-variables-dropdown",
multi=True,
placeholder="Select Y variables (can select multiple)...",
),
],
style={"margin": "10px"},
),
html.Div(
[
html.Label("Select Regression Model:"),
dcc.Dropdown(
id="model-dropdown",
options=[
{"label": "Linear Regression", "value": "Linear Regression"},
{"label": "Random Forest", "value": "Random Forest"},
{"label": "XGBoost", "value": "XGBoost"},
{"label": "Support Vector Machine", "value": "Support Vector Machine"},
{"label": "Decision Tree", "value": "Decision Tree"},
{"label": "Gradient Boosting", "value": "Gradient Boosting"},
{"label": "Lasso Regression", "value": "Lasso Regression"},
{"label": "Ridge Regression", "value": "Ridge Regression"},
{"label": "Elastic Net", "value": "Elastic Net"},
{"label": "K-Nearest Neighbors", "value": "K-Nearest Neighbors"},
{"label": "SGD Regression", "value": "SGD Regression"},
{"label": "BayesianRidge Regression", "value": "BayesianRidge Regression"},
{"label": "Multi-layer Perceptron", "value": "Multi-layer Perceptron"},
{"label": "Polynomial Regression", "value": "Polynomial Regression"},
{"label": "Extra-Trees", "value": "Extra-Trees"},
],
value="Linear Regression",
placeholder="Select a regression model...",
),
],
style={"margin": "10px"},
),
html.Button("Run Regression", id="run-regression-button", n_clicks=0),
html.Div(id="regression-results"),
html.Button("Toggle", id="toggle-button"),
html.Div(id="content-div", children="Content to be hidden or shown"),
]
@@ -78,6 +132,80 @@ def update_table(selected_dataset):
data = df.to_dict("records")
return columns, data

# New: update the variable-selection dropdowns
@app.callback(
[Output("x-variables-dropdown", "options"), Output("y-variables-dropdown", "options")],
[Input("dataset-dropdown", "value")],
)
def update_variable_options(selected_dataset):
"""Update variable options based on the selected dataset."""
df = pd.DataFrame()
if selected_dataset == "user_data":
df = pd.read_excel(user_data_path)
elif selected_dataset == "data_regression":
df = data_regression
elif selected_dataset == "data_classification":
df = data_classification
elif selected_dataset == "data_clustering":
df = data_clustering
elif selected_dataset == "data_decomposition":
df = data_decomposition

options = [{"label": col, "value": col} for col in df.columns]
return options, options

# New: run the regression analysis
@app.callback(
Output("regression-results", "children"),
[Input("run-regression-button", "n_clicks")],
[State("dataset-dropdown", "value"), State("x-variables-dropdown", "value"), State("y-variables-dropdown", "value"), State("model-dropdown", "value")],
)
def run_regression(n_clicks, selected_dataset, x_vars, y_vars, model_name):
"""Run regression analysis with selected variables."""
if n_clicks == 0 or not all([selected_dataset, x_vars, y_vars, model_name]):
return "Please select dataset, X variables, Y variables, and model."

try:
# Fetch the selected dataset
df = pd.DataFrame()
if selected_dataset == "user_data":
df = pd.read_excel(user_data_path)
elif selected_dataset == "data_regression":
df = data_regression
elif selected_dataset == "data_classification":
df = data_classification
elif selected_dataset == "data_clustering":
df = data_clustering
elif selected_dataset == "data_decomposition":
df = data_decomposition

# Prepare the X and Y data
X = df[x_vars]
y = df[y_vars]

# Validate the selected data
if X.empty or y.empty:
return "Error: Selected variables contain no data."

# Display a summary of the selected data
result_text = f"""
<h3>Regression Analysis Results</h3>
<p><strong>Model:</strong> {model_name}</p>
<p><strong>Number of X variables:</strong> {len(x_vars)} ({', '.join(x_vars)})</p>
<p><strong>Number of Y variables:</strong> {len(y_vars)} ({', '.join(y_vars)})</p>
<p><strong>Number of samples:</strong> {len(X)}</p>
<p><strong>Multiple Y columns:</strong> {'Yes' if len(y_vars) > 1 else 'No'}</p>
"""

# The actual regression analysis code could be added here.
# Since environment variables and output paths must be configured first, only basic information is shown.
result_text += "<p><em>Note: A full regression analysis requires the environment variables and output paths to be set. Please use the CLI version for a complete analysis.</em></p>"

# html.Div does not accept a dangerouslySetInnerHTML prop; render the HTML summary with dcc.Markdown instead.
return dcc.Markdown(result_text, dangerously_allow_html=True)

except Exception as e:
return f"Error: {str(e)}"

@app.callback(Output("content-div", "style"), [Input("toggle-button", "n_clicks")])
def toggle_div_visibility(n_clicks):
if n_clicks and n_clicks % 2 == 1:
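The run_regression callback above only echoes the selection and notes that the real analysis still needs the environment-variable and output-path setup. A minimal sketch of what that body could grow into is shown below; the MODEL_MAP dictionary and the fit_and_score helper are hypothetical names introduced here for illustration and are not part of the current codebase.

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hypothetical mapping from the dropdown label to a scikit-learn estimator.
MODEL_MAP = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

def fit_and_score(X, y, model_name):
    """Fit the chosen estimator and return an R^2 score on a held-out split."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    model = MODEL_MAP[model_name]
    model.fit(X_train, y_train)  # these estimators accept a single- or multi-column y
    return r2_score(y_test, model.predict(X_test))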
29 changes: 26 additions & 3 deletions geochemistrypi/data_mining/data/inference.py
@@ -1,7 +1,7 @@
import copy
import json
import os
from typing import Dict, Optional, Tuple
from typing import Dict, List, Optional, Tuple

import mlflow
import pandas as pd
@@ -109,7 +109,15 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
return transformer_config, transform_pipeline


def model_inference(inference_data: pd.DataFrame, inference_name_column: str, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
def model_inference(
inference_data: pd.DataFrame,
inference_name_column: str,
is_inference: bool,
run: object,
transformer_config: Dict,
transform_pipeline: Optional[object] = None,
y_columns: Optional[List[str]] = None,
):
"""Run the model inference.

Parameters
@@ -131,6 +139,9 @@ def model_inference(inference_data: pd.DataFrame, inference_name_column: str, is

transform_pipeline : Optional[object], optional
The transform pipeline object. The default is None.

y_columns : Optional[List[str]], optional
The column names of the target variables. The default is None.
"""
# If is_inference is True, then run the model inference.
if is_inference is True:
Expand All @@ -142,6 +153,18 @@ def model_inference(inference_data: pd.DataFrame, inference_name_column: str, is
inference_data_transformed = inference_data
loaded_model = mlflow.sklearn.load_model(f"runs:/{mlflow.active_run().info.run_id}/{run.model_name}")
inference_data_predicted_np = loaded_model.predict(inference_data_transformed)
inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"])

# Support multiple Y columns: derive column names from the original Y columns or the prediction shape
if y_columns is not None and len(y_columns) > 0:
# Use the original Y column names
predicted_columns = [f"Predicted_{col}" for col in y_columns]
else:
# Generate column names from the shape of the predictions
if inference_data_predicted_np.ndim == 1:
predicted_columns = ["Predicted Value"]
else:
predicted_columns = [f"Predicted_Value_{i+1}" for i in range(inference_data_predicted_np.shape[1])]

inference_data_predicted = np2pd(inference_data_predicted_np, predicted_columns)
GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
save_data(inference_data_predicted, inference_name_column, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
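To make the new column-naming branch concrete, the sketch below mimics how a 1-D and a 2-D prediction array end up in a labelled DataFrame; the np2pd defined here is an illustrative stand-in with the same call shape, not the project's actual helper, and the element values are made up.

import numpy as np
import pandas as pd

def np2pd(array: np.ndarray, columns: list) -> pd.DataFrame:
    # Stand-in for the project's np2pd: wrap a NumPy array with the given column names.
    return pd.DataFrame(array, columns=columns)

preds_1d = np.array([1.2, 3.4, 5.6])                       # single-target model
preds_2d = np.array([[1.2, 0.3], [3.4, 0.5], [5.6, 0.7]])  # two-target model

# With y_columns available, the predicted columns mirror the training targets.
print(np2pd(preds_2d, [f"Predicted_{c}" for c in ["Ni", "Cr"]]).columns.tolist())

# Without y_columns, the fallback naming depends on the prediction shape.
fallback = ["Predicted Value"] if preds_1d.ndim == 1 else [f"Predicted_Value_{i + 1}" for i in range(preds_1d.shape[1])]
print(np2pd(preds_1d, fallback))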