Skip to content

Commit 2fa16da

Browse files
committed
fix: fix all possible issues of the data
1 parent 8ce190e commit 2fa16da

File tree

1,132 files changed

+28312
-12211
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,132 files changed

+28312
-12211
lines changed

data/clean/f_1015_zhihan_refined.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,8 @@ def test_invalid_command(self):
107107
self.assertEqual(len(result), 1)
108108
with open(os.path.join(self.output_dir_path, result[0]), "r") as f:
109109
content = f.read()
110-
self.assertIn("invalid_command_xyz: not found", content)
110+
self.assertIn("invalid_command_xyz", content)
111+
self.assertIn("not found", content)
111112

112113
def test_empty_csv_file(self):
113114
# Test with an empty CSV file
@@ -128,8 +129,9 @@ def test_mixed_commands(self):
128129
self.assertEqual(len(result), 2)
129130
with open(os.path.join(self.output_dir_path, result[1]), "r") as f:
130131
content = f.read()
131-
self.assertIn("invalid_command_abc: not found", content)
132-
132+
self.assertIn("invalid_command_abc", content)
133+
self.assertIn("not found", content)
134+
133135
def test_command_failure_with_specific_exit_code(self):
134136
# Prepare a CSV with a command guaranteed to fail and return a specific exit code
135137
commands_path = os.path.join(self.temp_dir, "failing_commands.csv")

data/clean/f_1706_junda_james.py

Lines changed: 167 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,167 @@
1+
import pandas as pd
2+
import numpy as np
3+
import itertools
4+
from datetime import datetime, timedelta
5+
import seaborn as sns
6+
7+
def f_1706(df, fruits=None, days=None, seed=None, sales_lower_bound=1, sales_upper_bound=50):
    """
    Appends randomly generated sales data for specified fruits over a given range of days to a DataFrame,
    and returns a seaborn boxplot of the sales.

    One row is generated for every (fruit, day) combination, in the order produced by
    itertools.product(fruits, days).

    Parameters:
    - df (pd.DataFrame): Initial Empty DataFrame to append sales data to. Must be empty.
    - fruits (List[str], optional): List of fruits for sales data. Defaults to ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry'].
    - days (List[datetime], optional): List of days for sales data. Defaults to the range from January 1, 2024, to January 7, 2024.
    - seed (int, optional): Seed for the random number generator. Defaults to None, in which case
      values are drawn from NumPy's global random state.
    - sales_lower_bound (int, optional): Lower bound (inclusive) for random sales values. Defaults to 1.
    - sales_upper_bound (int, optional): Upper bound (exclusive) for random sales values. Defaults to 50.

    Returns:
    Tuple[pd.DataFrame, matplotlib.axes.Axes]: Updated DataFrame with sales data and the Axes object
    holding the seaborn boxplot of the sales (as returned by sns.boxplot).

    Raises:
    TypeError: If 'df' is not a pandas DataFrame.
    ValueError: If 'df' is not empty or If 'sales_lower_bound' is not less than 'sales_upper_bound'.

    Requirements:
    - pandas
    - numpy
    - itertools
    - datetime
    - seaborn

    Example:
    >>> initial_df = pd.DataFrame()
    >>> report_df, plot = f_1706(initial_df, seed=42)
    >>> print(report_df.head())  # doctest: +NORMALIZE_WHITESPACE
       Fruit        Day  Sales
    0  Apple 2024-01-01     39
    1  Apple 2024-01-02     29
    2  Apple 2024-01-03     15
    3  Apple 2024-01-04     43
    4  Apple 2024-01-05      8
    >>> plot.figure.show()

    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")
    if not df.empty:
        raise ValueError("Input DataFrame must be empty")
    if sales_lower_bound >= sales_upper_bound:
        raise ValueError("sales_lower_bound must be less than sales_upper_bound")

    if fruits is None:
        fruits = ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry']
    if days is None:
        # Set days to range from January 1, 2024, to January 7, 2024
        days = [datetime(2024, 1, 1) + timedelta(days=x) for x in range(7)]

    # A private RandomState seeded with `seed` yields the same legacy stream as
    # np.random.seed(seed) followed by np.random.randint, but does not reseed the
    # process-wide generator behind the caller's back. Without a seed we keep
    # drawing from the global state, exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = list(itertools.product(fruits, days))
    sales_data = pd.DataFrame(data, columns=['Fruit', 'Day'])
    # randint's upper bound is exclusive: values fall in [lower, upper).
    sales_data['Sales'] = rng.randint(sales_lower_bound, sales_upper_bound, size=len(data))

    result_df = pd.concat([df, sales_data])
    plot = sns.boxplot(x='Fruit', y='Sales', data=result_df)

    return result_df, plot
71+
72+
import unittest
73+
import pandas as pd
74+
import numpy as np
75+
from datetime import datetime
76+
77+
class TestCases(unittest.TestCase):
    """Unit tests for f_1706: input validation, defaults, bounds and seeded output."""

    def setUp(self):
        # Define the default date range for comparison in tests
        self.default_days = [datetime(2024, 1, 1) + timedelta(days=x) for x in range(7)]

    def test_default_days_range(self):
        """Test the default days range is correctly applied."""
        initial_df = pd.DataFrame()
        report_df, _ = f_1706(initial_df, seed=42)
        unique_days = sorted(report_df['Day'].dt.date.unique())
        expected_days = [day.date() for day in self.default_days]
        self.assertEqual(len(unique_days), len(expected_days), "The number of unique days should match the default range.")
        for day in unique_days:
            self.assertIn(day, expected_days, "Each unique day should be within the default range.")

    def test_custom_days_range(self):
        """Test functionality with a custom days range."""
        initial_df = pd.DataFrame()
        custom_days = [datetime(2024, 1, 10), datetime(2024, 1, 11)]
        report_df, _ = f_1706(initial_df, days=custom_days, seed=42)
        unique_days = sorted(report_df['Day'].dt.date.unique())
        expected_custom_days = [day.date() for day in custom_days]
        self.assertEqual(len(unique_days), len(expected_custom_days), "The number of unique days should match the custom range.")
        for day in unique_days:
            self.assertIn(day, expected_custom_days, "Each unique day should be within the custom range.")

    def test_sales_bounds(self):
        """Test custom sales bounds are respected."""
        initial_df = pd.DataFrame()
        report_df, _ = f_1706(initial_df, seed=42, sales_lower_bound=20, sales_upper_bound=30)
        sales_values = report_df['Sales'].unique()
        self.assertTrue(all(20 <= val < 30 for val in sales_values), "All sales values should be within the specified bounds.")

    def test_invalid_sales_bounds(self):
        """Test error handling for invalid sales bounds."""
        with self.assertRaises(ValueError):
            f_1706(pd.DataFrame(), sales_lower_bound=50, sales_upper_bound=10)

    def test_with_non_dataframe_input(self):
        """Test that providing a non-DataFrame input raises a TypeError."""
        with self.assertRaises(TypeError):
            f_1706("not_a_dataframe")

    def test_reproducibility_with_seed(self):
        """Test reproducibility of sales data generation with a fixed seed."""
        initial_df = pd.DataFrame()
        df1, _ = f_1706(initial_df, seed=42)
        df2, _ = f_1706(initial_df, seed=42)
        # DataFrames generated with the same seed should be identical.
        # assert_frame_equal takes no message argument: its third positional
        # parameter is check_dtype, so a message string must not be passed there.
        pd.testing.assert_frame_equal(df1, df2)

    def test_with_custom_fruits_and_days(self):
        """Test exact seeded output for caller-supplied fruits and days."""
        fruits = ['Mango', 'Pineapple']
        days = [pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-02')]
        initial_df = pd.DataFrame()
        report_df, plot = f_1706(initial_df, fruits=fruits, days=days, sales_lower_bound=1, sales_upper_bound=50, seed=42)

        self.assertEqual(len(report_df['Fruit'].unique()), len(fruits), "Number of unique fruits should match the input")
        self.assertEqual(len(report_df['Day'].unique()), len(days), "Number of unique days should match the input")
        self.assertTrue(hasattr(plot, 'figure'), "Plot object should have a 'figure' attribute")

        # Convert DataFrame to a list of strings for each row
        df_list = report_df.apply(lambda row: ','.join(row.values.astype(str)), axis=1).tolist()

        # Check if the converted list matches the expected output
        expect_output = ['Mango,2023-01-01 00:00:00,39', 'Mango,2023-01-02 00:00:00,29', 'Pineapple,2023-01-01 00:00:00,15', 'Pineapple,2023-01-02 00:00:00,43']
        # assertEqual, not assertAlmostEqual: the operands are lists of strings, and
        # assertAlmostEqual's third positional parameter is `places`, not the message.
        self.assertEqual(df_list, expect_output, "DataFrame contents should match the expected output")

    def test_error_on_non_empty_dataframe(self):
        """Test that a ValueError is raised if the input DataFrame is not empty."""
        # Create a non-empty DataFrame
        non_empty_df = pd.DataFrame({'A': [1, 2, 3]})

        # Attempt to call f_1706 with a non-empty DataFrame and check for ValueError
        with self.assertRaises(ValueError) as context:
            f_1706(non_empty_df, seed=42)

        # Optionally, check the error message to ensure it's for the non-empty DataFrame condition
        self.assertTrue("Input DataFrame must be empty" in str(context.exception), "Function should raise ValueError for non-empty DataFrame input.")
156+
157+
def run_tests():
    """Collect every test defined on TestCases and execute it with the text runner."""
    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
    unittest.TextTestRunner().run(suite)


if __name__ == "__main__":
    # Script entry point: check the docstring examples first, then run the unit tests.
    import doctest

    doctest.testmod()
    run_tests()

data/clean/f_1718_junda_james.py

Lines changed: 145 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,145 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
def f_1718(products, n_samples=100, sales_lower=50, sales_upper=200, profit_margin_min=0.1, profit_margin_max=0.5, random_seed=42):
    """
    Generate a sales report with randomly simulated sales and profit data for a given list of products.
    The data is aggregated by product and sorted by total profit in descending order.

    Parameters:
    - products (list of str): List of product names. An empty (falsy) value short-circuits to an
      empty report before any other argument is validated.
    - n_samples (int): The number of data points to generate for the report. Default is 100.
    - sales_lower (int): The minimum sales value (inclusive) for the random generation. Default is 50.
    - sales_upper (int): The maximum sales value (inclusive) for the random generation. Default is 200.
    - profit_margin_min (float): The minimum profit margin as a fraction of sales. Default is 0.1.
    - profit_margin_max (float): The maximum profit margin as a fraction of sales. Default is 0.5.
    - random_seed (int): Seed for the random number generator to ensure reproducibility. Default is 42.

    Returns:
    pd.DataFrame: A DataFrame with columns "Product", "Sales" and "Profit" containing aggregated sales
    and profit data for each product, sorted by profit in descending order. The index keeps the
    positions from the alphabetical group-by (it is not reset after sorting).

    Raises:
    TypeError: If products is non-empty and is not a list of strings.
    ValueError: If n_samples is not a positive integer; if sales_lower or sales_upper is not an
        integer or sales_lower is not strictly less than sales_upper; or if profit_margin_min or
        profit_margin_max is not numeric or profit_margin_min is not strictly less than
        profit_margin_max.

    Requirements:
    - numpy
    - pandas

    Example:
    >>> products = ["iPhone", "iPad", "Macbook", "Airpods", "Apple Watch"]
    >>> report = f_1718(products, n_samples=50, sales_lower=100, sales_upper=150, profit_margin_min=0.2, profit_margin_max=0.4, random_seed=42)
    >>> print(report)  # doctest: +NORMALIZE_WHITESPACE
           Product  Sales      Profit
    2      Macbook   1561  444.826709
    3         iPad   1383  401.925334
    0      Airpods   1297  381.482713
    1  Apple Watch   1123  308.078536
    4       iPhone    921  294.013887
    """
    if not products:
        return pd.DataFrame(columns=["Product", "Sales", "Profit"])

    if not isinstance(products, list) or not all(isinstance(product, str) for product in products):
        raise TypeError("products must be a list of strings.")
    if not isinstance(n_samples, int) or n_samples <= 0:
        raise ValueError("n_samples must be a positive integer.")
    if not (isinstance(sales_lower, int) and isinstance(sales_upper, int)) or sales_lower >= sales_upper:
        raise ValueError("sales_lower must be less than sales_upper and both must be integers.")
    if not all(isinstance(x, (int, float)) for x in [profit_margin_min, profit_margin_max]) or profit_margin_min >= profit_margin_max:
        raise ValueError("profit_margin_min must be less than profit_margin_max and both must be numeric.")

    # A private RandomState seeded with `random_seed` produces the same legacy
    # stream as np.random.seed(random_seed) + np.random.*, without reseeding the
    # process-wide generator as a hidden side effect of building a report.
    rng = np.random.RandomState(random_seed)

    # The three draws are deliberately interleaved per sample; vectorising them
    # would consume the stream in a different order and change seeded results.
    data = []
    for _ in range(n_samples):
        product = rng.choice(products)
        sales = rng.randint(sales_lower, sales_upper + 1)  # +1: randint's high is exclusive
        profit = sales * rng.uniform(profit_margin_min, profit_margin_max)
        data.append([product, sales, profit])

    df = pd.DataFrame(data, columns=["Product", "Sales", "Profit"])
    df = df.groupby("Product", as_index=False).sum()

    return df.sort_values("Profit", ascending=False)
66+
67+
import pandas as pd
68+
import unittest
69+
70+
class TestCases(unittest.TestCase):
    """Unit tests for f_1718: reproducibility, aggregation, sorting and argument validation."""

    def test_random_reproducibility(self):
        """Two calls with identical arguments and seed must yield identical reports."""
        report1 = f_1718(["iPhone", "iPad"], n_samples=50, sales_lower=50, sales_upper=200, profit_margin_min=0.1, profit_margin_max=0.5, random_seed=42)
        report2 = f_1718(["iPhone", "iPad"], n_samples=50, sales_lower=50, sales_upper=200, profit_margin_min=0.1, profit_margin_max=0.5, random_seed=42)
        pd.testing.assert_frame_equal(report1, report2)

    def test_number_of_rows(self):
        """The aggregated report has one row per distinct product."""
        report = f_1718(["iPhone", "iPad"], n_samples=50, sales_lower=50, sales_upper=200)
        self.assertEqual(len(report), len(set(["iPhone", "iPad"])))

    def test_sorting_by_profit(self):
        """Rows are ordered by profit, largest first."""
        report = f_1718(["iPhone", "iPad"], sales_lower=50, sales_upper=200)
        self.assertTrue(report["Profit"].is_monotonic_decreasing)

    def test_custom_parameters(self):
        """Smoke test: custom parameters produce a non-empty report."""
        report = f_1718(["iPhone", "iPad", "Macbook", "Airpods", "Apple Watch"], n_samples=50, sales_lower=100, sales_upper=150, profit_margin_min=0.2, profit_margin_max=0.4, random_seed=42)
        # This test needs to be adjusted based on the expected outcome of the custom parameters.
        # Specific checks on DataFrame contents should account for the randomness and reproducibility aspects.
        self.assertTrue(len(report) > 0, "The report should contain aggregated sales and profit data.")

    def test_new_custom_parameters(self):
        """Pin the exact seeded output for a fixed parameter set."""
        report1 = f_1718(["iPhone", "iPad", "Macbook", "Airpods", "Apple Watch"], n_samples=50, sales_lower=100, sales_upper=150, profit_margin_min=0.2, profit_margin_max=0.4, random_seed=42)
        df_list = report1.apply(lambda row: ','.join(row.values.astype(str)), axis=1).tolist()
        expect = ['Macbook,1561,444.82670855378143', 'iPad,1383,401.9253335536443', 'Airpods,1297,381.4827132170069', 'Apple Watch,1123,308.07853599252707', 'iPhone,921,294.0138866107959']

        self.assertEqual(df_list, expect, "DataFrame contents should match the expected output")

    def test_sales_bounds_validation(self):
        """Test that an error is raised if sales_lower is greater than sales_upper."""
        with self.assertRaises(ValueError):
            f_1718(["Product1"], sales_lower=250, sales_upper=100)

    def test_profit_margin_validation(self):
        """Test that an error is raised if profit_margin_min is greater than or equal to profit_margin_max."""
        with self.assertRaises(ValueError):
            f_1718(["Product1"], profit_margin_min=0.6, profit_margin_max=0.5)

    def test_product_list_validation(self):
        """Test that an error is raised if the products list is not a list of strings."""
        with self.assertRaises(TypeError):
            f_1718([123, 456], n_samples=10)

    def test_n_samples_validation(self):
        """Test that an error is raised if n_samples is not a positive integer."""
        with self.assertRaises(ValueError):
            f_1718(["Product1"], n_samples=-10)

    def test_empty_product_list(self):
        """Test that the function can handle an empty product list."""
        report = f_1718([], n_samples=10)
        self.assertTrue(report.empty, "The report should be empty if no products are provided.")

    def test_zero_samples(self):
        """Test handling of zero samples."""
        # Use exactly zero here; the negative case is already covered by
        # test_n_samples_validation.
        with self.assertRaises(ValueError):
            f_1718(["Product1"], n_samples=0)

    def test_single_product_reproducibility(self):
        """Test that the function generates consistent results for a single product across multiple runs."""
        report1 = f_1718(["Product1"], n_samples=10, random_seed=42)
        report2 = f_1718(["Product1"], n_samples=10, random_seed=42)
        pd.testing.assert_frame_equal(report1, report2)
133+
134+
135+
def run_tests():
    """Collect every test defined on TestCases and execute it with the text runner."""
    suite = unittest.TestLoader().loadTestsFromTestCase(TestCases)
    unittest.TextTestRunner().run(suite)


if __name__ == "__main__":
    # Script entry point: check the docstring examples first, then run the unit tests.
    import doctest

    doctest.testmod()
    run_tests()

0 commit comments

Comments
 (0)