Commit ed74cdd

Merge pull request #59 from transformerlab/add/job-subtypes-remote

Add methods to save_artifact for saving different kinds of artifacts

2 parents: fe1f012 + e06524f

File tree

5 files changed: +720, -128 lines
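
For orientation, here is a minimal sketch of how the extended lab.save_artifact call is exercised by the new example scripts below. Only calls that appear in the diffs (lab.init, lab.save_artifact, lab.finish) are used; the artifact names and DataFrame rows are illustrative placeholders.

import pandas as pd

from lab import lab

lab.init()

# Save a DataFrame as a dataset artifact (mirrors the dataset example below)
df = pd.DataFrame([{"text": "This is a great product!", "label": "positive"}])
dataset_path = lab.save_artifact(
    df,
    name="demo_dataset",  # placeholder name
    type="dataset",
    config={"dataset": {"task": "text_classification"}},
)

# Save a DataFrame as eval results, mapping custom column names onto the
# expected input/output/expected_output/score fields (mirrors fake_evals.py)
eval_df = pd.DataFrame(
    [{"question": "1 + 1?", "model_response": "2", "ground_truth": "2", "accuracy": 1.0}]
)
eval_path = lab.save_artifact(
    eval_df,
    name="demo_eval_results.csv",  # placeholder name
    type="eval",
    config={
        "evals": {
            "input": "question",
            "output": "model_response",
            "expected_output": "ground_truth",
            "score": "accuracy",
        }
    },
)

lab.finish("Demo complete")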

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "transformerlab"
-version = "0.0.42"
+version = "0.0.43"
 description = "Python SDK for Transformer Lab"
 readme = "README.md"
 requires-python = ">=3.10"
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
from datetime import datetime
from time import sleep
import pandas as pd

from lab import lab


def generate_dataset():
    """Fake dataset generation function that runs locally but reports to TransformerLab"""

    # Dataset generation configuration
    config = {
        "experiment_name": "alpha",
        "model_name": "HuggingFaceTB/SmolLM-135M-Instruct",
        "template_name": "dataset-generation-demo",
        "_config": {
            "dataset_type": "synthetic",
            "num_samples": 100,
            "generation_method": "fake_data",
        },
    }

    try:
        # Initialize lab with default/simple API
        lab.init()
        lab.set_config(config)

        # Log start time
        start_time = datetime.now()
        lab.log(f"Dataset generation started at {start_time}")

        # Generate a simple text classification dataset
        lab.log("Generating text classification dataset...")
        sleep(0.5)
        lab.update_progress(30)

        dataset_data = []
        categories = ["positive", "negative", "neutral"]
        sample_texts = [
            "This is a great product!",
            "I don't like this at all.",
            "It's okay, nothing special.",
            "Amazing quality and fast delivery!",
            "Poor customer service experience.",
            "It meets my expectations.",
        ]

        for i in range(50):
            dataset_data.append({
                "text": sample_texts[i % len(sample_texts)] + f" (sample {i+1})",
                "label": categories[i % len(categories)],
                "label_id": i % len(categories),
                "confidence": 0.7 + (i % 3) * 0.1,
            })

        df = pd.DataFrame(dataset_data)
        lab.log(f"Generated {len(df)} samples")

        # Save dataset using save_artifact with type="dataset"
        lab.log("Saving dataset...")
        saved_path = lab.save_artifact(
            df,
            name="generated_text_classification_dataset",
            type="dataset",
            config={
                "dataset": {
                    "description": "Synthetic text classification dataset generated from job",
                    "task": "text_classification",
                    "num_classes": 3,
                    "source": "synthetic_generation",
                }
            }
        )
        lab.log(f"✅ Saved dataset: {saved_path}")
        lab.update_progress(80)

        # Calculate generation time
        end_time = datetime.now()
        generation_duration = end_time - start_time
        lab.log(f"Dataset generation completed in {generation_duration}")

        # Get generated dataset from job data
        job_data = lab.job.get_job_data()
        generated_datasets = job_data.get("generated_datasets", [])

        if generated_datasets:
            lab.log(f"Generated dataset: {generated_datasets[0]}")

        lab.update_progress(100)

        print("Dataset Generation Complete")

        # Complete the job in TransformerLab via facade
        lab.finish(
            "Dataset generation completed successfully",
            score={
                "total_samples": len(df),
                "dataset_id": generated_datasets[0] if generated_datasets else None,
            }
        )

        return {
            "status": "success",
            "job_id": lab.job.id,
            "duration": str(generation_duration),
            "generated_dataset": generated_datasets[0] if generated_datasets else None,
            "total_samples": len(df),
        }

    except KeyboardInterrupt:
        lab.error("Stopped by user or remotely")
        return {"status": "stopped", "job_id": lab.job.id}

    except Exception as e:
        error_msg = str(e)
        print(f"Dataset generation failed: {error_msg}")

        import traceback

        traceback.print_exc()
        lab.error(error_msg)
        return {"status": "error", "job_id": lab.job.id, "error": error_msg}


if __name__ == "__main__":
    result = generate_dataset()
    print(result)

scripts/examples/fake_evals.py

Lines changed: 222 additions & 0 deletions
@@ -0,0 +1,222 @@
from datetime import datetime
from time import sleep
import pandas as pd

from lab import lab


def run_evaluation():
    """Fake evaluation function that runs locally but reports to TransformerLab"""

    # Evaluation configuration
    eval_config = {
        "experiment_name": "alpha",
        "model_name": "HuggingFaceTB/SmolLM-135M-Instruct",
        "eval_name": "fake-evaluation",
        "template_name": "eval-demo",
        "_config": {
            "model": "HuggingFaceTB/SmolLM-135M-Instruct",
            "dataset": "evaluation_dataset",
            "num_test_cases": 10,
            "metrics": ["accuracy", "f1_score", "bleu"],
        },
    }

    try:
        # Initialize lab with default/simple API
        lab.init()
        lab.set_config(eval_config)

        # Log start time
        start_time = datetime.now()
        lab.log(f"Evaluation started at {start_time}")

        # Simulate loading test cases
        lab.log("Loading test cases...")
        sleep(0.5)
        lab.update_progress(10)

        # Create fake evaluation results with default column names
        lab.log("Running evaluation on test cases...")
        sleep(0.5)

        # Example 1: DataFrame with default column names (input, output, expected_output, score)
        test_cases = []
        for i in range(10):
            test_cases.append({
                "input": f"What is {i + 1} + {i + 2}?",
                "output": str((i + 1) + (i + 2)),
                "expected_output": str((i + 1) + (i + 2)),
                "score": 1.0 if (i + 1) + (i + 2) == (i + 1) + (i + 2) else 0.0,
            })

        df_default = pd.DataFrame(test_cases)
        lab.log(f"Generated {len(df_default)} test cases with default column names")

        # Save evaluation results with default column names
        lab.log("Saving evaluation results (default columns)...")
        saved_path_default = lab.save_artifact(
            df_default,
            name="eval_results_default.csv",
            type="eval"
        )
        lab.log(f"✅ Saved evaluation results: {saved_path_default}")
        lab.update_progress(50)

        # Example 2: DataFrame with custom column names
        lab.log("Creating evaluation results with custom column names...")
        sleep(0.5)

        test_cases_custom = []
        for i in range(10):
            # Simulate some incorrect answers for variety
            is_correct = i % 3 != 0  # Every 3rd answer is wrong
            correct_answer = str((i + 1) * 2)
            model_answer = correct_answer if is_correct else str((i + 1) * 2 + 1)

            test_cases_custom.append({
                "question": f"Calculate {i + 1} * 2",
                "model_response": model_answer,
                "ground_truth": correct_answer,
                "accuracy": 1.0 if is_correct else 0.0,
                "response_time_ms": 50 + i * 5,
            })

        df_custom = pd.DataFrame(test_cases_custom)
        lab.log(f"Generated {len(df_custom)} test cases with custom column names")

        # Save evaluation results with custom column mappings
        lab.log("Saving evaluation results (custom columns)...")
        saved_path_custom = lab.save_artifact(
            df_custom,
            name="eval_results_custom.csv",
            type="eval",
            config={
                "evals": {
                    "input": "question",
                    "output": "model_response",
                    "expected_output": "ground_truth",
                    "score": "accuracy"
                }
            }
        )
        lab.log(f"✅ Saved evaluation results: {saved_path_custom}")
        lab.update_progress(70)

        # Example 3: Multiple metrics evaluation
        lab.log("Creating multi-metric evaluation results...")
        sleep(0.5)

        multi_metric_cases = []
        for i in range(10):
            multi_metric_cases.append({
                "input": "Translate 'Hello' to Spanish",
                "output": "Hola" if i % 2 == 0 else "Hallo",
                "expected_output": "Hola",
                "bleu_score": 1.0 if i % 2 == 0 else 0.3,
                "rouge_score": 0.95 if i % 2 == 0 else 0.4,
                "exact_match": 1.0 if i % 2 == 0 else 0.0,
            })

        df_multi = pd.DataFrame(multi_metric_cases)

        # For multi-metric, we'll use the first score column (bleu_score) as the primary score
        saved_path_multi = lab.save_artifact(
            df_multi,
            name="eval_results_multi_metric.csv",
            type="eval",
            config={
                "evals": {
                    "input": "input",
                    "output": "output",
                    "expected_output": "expected_output",
                    "score": "bleu_score"  # Use bleu_score as the primary score
                }
            }
        )
        lab.log(f"✅ Saved multi-metric evaluation results: {saved_path_multi}")
        lab.update_progress(85)

        # Calculate summary statistics
        lab.log("Calculating evaluation summary...")
        sleep(0.5)

        # Calculate average scores from the default results
        avg_score = df_default["score"].mean()
        total_cases = len(df_default)
        correct_cases = len(df_default[df_default["score"] == 1.0])

        summary = {
            "total_test_cases": total_cases,
            "correct_cases": correct_cases,
            "incorrect_cases": total_cases - correct_cases,
            "average_score": avg_score,
            "accuracy": correct_cases / total_cases,
        }

        lab.log("Evaluation Summary:")
        lab.log(f"  Total test cases: {summary['total_test_cases']}")
        lab.log(f"  Correct: {summary['correct_cases']}")
        lab.log(f"  Incorrect: {summary['incorrect_cases']}")
        lab.log(f"  Average score: {summary['average_score']:.4f}")
        lab.log(f"  Accuracy: {summary['accuracy']:.4f}")

        # Save summary as a regular artifact (not eval results)
        import json

        summary_file = "/tmp/eval_summary.json"
        with open(summary_file, "w") as f:
            json.dump(summary, f, indent=2)

        summary_artifact_path = lab.save_artifact(summary_file, "eval_summary.json")
        lab.log(f"✅ Saved evaluation summary: {summary_artifact_path}")

        # Calculate evaluation time
        end_time = datetime.now()
        eval_duration = end_time - start_time
        lab.log(f"Evaluation completed in {eval_duration}")

        lab.update_progress(100)

        print("Evaluation Complete")

        # Complete the job in TransformerLab via facade
        lab.finish(
            "Evaluation completed successfully",
            score={
                "average_score": avg_score,
                "accuracy": summary["accuracy"],
                "total_cases": total_cases,
            }
        )

        return {
            "status": "success",
            "job_id": lab.job.id,
            "duration": str(eval_duration),
            "summary": summary,
            "eval_results_files": [
                saved_path_default,
                saved_path_custom,
                saved_path_multi,
            ],
        }

    except KeyboardInterrupt:
        lab.error("Stopped by user or remotely")
        return {"status": "stopped", "job_id": lab.job.id}

    except Exception as e:
        error_msg = str(e)
        print(f"Evaluation failed: {error_msg}")

        import traceback

        traceback.print_exc()
        lab.error(error_msg)
        return {"status": "error", "job_id": lab.job.id, "error": error_msg}


if __name__ == "__main__":
    result = run_evaluation()
    print(result)

src/lab/dirs.py

Lines changed: 10 additions & 0 deletions
@@ -198,6 +198,16 @@ def get_job_checkpoints_dir(job_id: str | int) -> str:
     return path


+def get_job_eval_results_dir(job_id: str | int) -> str:
+    """
+    Return the eval_results directory for a specific job, creating it if needed.
+    Example: ~/.transformerlab/workspace/jobs/<job_id>/eval_results
+    """
+    path = os.path.join(get_job_dir(job_id), "eval_results")
+    os.makedirs(name=path, exist_ok=True)
+    return path
+
+
 # Evals output file:
 # TODO: These should probably be in the plugin subclasses

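A quick sketch of how the new helper might be called; the `from lab import dirs` import path is an assumption based on the file living at src/lab/dirs.py, and the job id is a placeholder.

from lab import dirs  # assumption: src/lab/dirs.py is importable as lab.dirs

# Resolve (and create, if missing) the per-job eval_results directory
eval_dir = dirs.get_job_eval_results_dir(1234)
print(eval_dir)  # e.g. ~/.transformerlab/workspace/jobs/1234/eval_results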