Skip to content

Commit d00a9eb

Browse files
committed
fix feature engineering
1 parent 6499832 commit d00a9eb

File tree

1 file changed

+101
-176
lines changed

1 file changed

+101
-176
lines changed

kepco/kepco_eda.ipynb

Lines changed: 101 additions & 176 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
},
4848
{
4949
"cell_type": "code",
50-
"execution_count": 36,
50+
"execution_count": 1,
5151
"metadata": {},
5252
"outputs": [],
5353
"source": [
@@ -103,15 +103,6 @@
103103
"df.shape"
104104
]
105105
},
106-
{
107-
"cell_type": "code",
108-
"execution_count": null,
109-
"metadata": {},
110-
"outputs": [],
111-
"source": [
112-
"df"
113-
]
114-
},
115106
{
116107
"cell_type": "code",
117108
"execution_count": null,
@@ -221,7 +212,7 @@
221212
},
222213
{
223214
"cell_type": "code",
224-
"execution_count": 50,
215+
"execution_count": 14,
225216
"metadata": {},
226217
"outputs": [],
227218
"source": [
@@ -261,113 +252,101 @@
261252
"metadata": {},
262253
"outputs": [],
263254
"source": [
264-
"def advanced_feature_engineering(df):\n",
265-
" \"\"\"\n",
266-
" 고급 특성 공학 수행 - 47개 새로운 특성 생성\n",
267-
" \"\"\"\n",
268-
" print(\"고급 특성 공학 수행 중...\")\n",
269-
" df_features = df.copy()\n",
270-
" \n",
271-
" # 1. 결측치 처리\n",
272-
" df_features['상대습도'] = df_features['상대습도'].fillna(df_features['상대습도'].median())\n",
273-
" df_features['풍속'] = df_features['풍속'].fillna(df_features['풍속'].median())\n",
274-
" df_features['기온'] = df_features['기온'].fillna(df_features['기온'].median())\n",
275-
" \n",
276-
" # 2. 시간 기반 특성 (순환 인코딩)\n",
277-
" df_features['시간_sin'] = np.sin(2 * np.pi * df_features['시'] / 24)\n",
278-
" df_features['시간_cos'] = np.cos(2 * np.pi * df_features['시'] / 24)\n",
279-
" df_features['월_sin'] = np.sin(2 * np.pi * df_features['월'] / 12)\n",
280-
" df_features['월_cos'] = np.cos(2 * np.pi * df_features['월'] / 12)\n",
281-
" df_features['요일_sin'] = np.sin(2 * np.pi * df_features['요일'] / 7)\n",
282-
" df_features['요일_cos'] = np.cos(2 * np.pi * df_features['요일'] / 7)\n",
283-
" \n",
284-
" # 3. 시간대 구분\n",
285-
" df_features['주말'] = (df_features['요일'] >= 5).astype(int)\n",
286-
" df_features['새벽'] = ((df_features['시'] >= 0) & (df_features['시'] < 6)).astype(int)\n",
287-
" df_features['오전'] = ((df_features['시'] >= 6) & (df_features['시'] < 12)).astype(int)\n",
288-
" df_features['오후'] = ((df_features['시'] >= 12) & (df_features['시'] < 18)).astype(int)\n",
289-
" df_features['저녁'] = ((df_features['시'] >= 18) & (df_features['시'] < 24)).astype(int)\n",
290-
" df_features['오전피크'] = ((df_features['시'] >= 8) & (df_features['시'] <= 10)).astype(int)\n",
291-
" df_features['저녁피크'] = ((df_features['시'] >= 18) & (df_features['시'] <= 20)).astype(int)\n",
292-
" \n",
293-
" # 4. 계절 기반 특성\n",
294-
" df_features['봄'] = df_features['월'].isin([3, 4, 5]).astype(int)\n",
295-
" df_features['여름'] = df_features['월'].isin([6, 7, 8]).astype(int)\n",
296-
" df_features['가을'] = df_features['월'].isin([9, 10, 11]).astype(int)\n",
297-
" df_features['겨울'] = df_features['월'].isin([12, 1, 2]).astype(int)\n",
298-
" \n",
299-
" # 5. 기상 기반 특성\n",
300-
" df_features['냉방도일'] = np.maximum(0, df_features['기온'] - 24)\n",
301-
" df_features['난방도일'] = np.maximum(0, 18 - df_features['기온'])\n",
302-
" df_features['불쾌지수'] = 1.8 * df_features['기온'] - 0.55 * (1 - df_features['상대습도']/100) * (1.8 * df_features['기온'] - 26) + 32\n",
303-
" df_features['체감온도'] = df_features['기온'] - 0.4 * (df_features['기온'] - 10) * (1 - df_features['상대습도']/100)\n",
304-
" \n",
305-
" # 6. 극한 기상 조건\n",
306-
" df_features['고온'] = (df_features['기온'] > df_features['기온'].quantile(0.9)).astype(int)\n",
307-
" df_features['저온'] = (df_features['기온'] < df_features['기온'].quantile(0.1)).astype(int)\n",
308-
" df_features['고습도'] = (df_features['상대습도'] > df_features['상대습도'].quantile(0.9)).astype(int)\n",
309-
" df_features['강풍'] = (df_features['풍속'] > df_features['풍속'].quantile(0.9)).astype(int)\n",
310-
" \n",
311-
" # 7. 전력 관련 파생 특성\n",
312-
" df_features['이용률'] = df_features['전력부하합계'] / (df_features['계약전력합계'] + 1e-6)\n",
313-
" df_features['전력밀도'] = df_features['전력부하합계'] / (df_features['공동주택수'] + 1e-6)\n",
314-
" df_features['단지당계약전력'] = df_features['계약전력합계'] / (df_features['공동주택수'] + 1e-6)\n",
315-
" df_features['정규화부하'] = df_features['전력부하합계'] / (df_features['계약전력합계'] * df_features['공동주택수'] / 100 + 1e-6)\n",
316-
" \n",
317-
"\n",
318-
" # Combine columns into a datetime string\n",
319-
" dt_str = (\n",
320-
" df_features['연도'].astype(str) + '-' +\n",
321-
" df_features['월'].astype(str).str.zfill(2) + '-' +\n",
322-
" df_features['일'].astype(str).str.zfill(2) + ' ' +\n",
323-
" df_features['시'].astype(str).str.zfill(2) + ':00:00'\n",
324-
" )\n",
325255
"\n",
326-
" # Find rows where hour is 24\n",
327-
" mask_24 = df_features['시'] == 24\n",
328-
"\n",
329-
" # Set hour to 0 for those rows\n",
330-
" dt_str[mask_24] = (\n",
331-
" df_features.loc[mask_24, '연도'].astype(str) + '-' +\n",
332-
" df_features.loc[mask_24, '월'].astype(str).str.zfill(2) + '-' +\n",
333-
" df_features.loc[mask_24, '일'].astype(str).str.zfill(2) + ' 00:00:00'\n",
334-
" )\n",
256+
"# 1. 결측치 처리\n",
257+
"df['상대습도'] = df['상대습도'].fillna(df['상대습도'].median())\n",
258+
"df['풍속'] = df['풍속'].fillna(df['풍속'].median())\n",
259+
"df['기온'] = df['기온'].fillna(df['기온'].median())\n",
335260
"\n",
336-
" # Convert to datetime\n",
337-
" df_features['날짜'] = pd.to_datetime(dt_str, format='%Y-%m-%d %H:%M:%S')\n",
338-
"\n",
339-
"\n",
340-
" # 8. 시계열 지연 특성 (데이터 시간순 정렬 필요)\n",
341-
" # '연도', '월', '일', '시' 컬럼을 이용해 '날짜' 컬럼 생성 (datetime 대체)\n",
342-
"\n",
343-
" df_features = df_features.sort_values('날짜').reset_index(drop=True)\n",
344-
" for lag in [1, 24, 168]: # 1시간, 1일, 1주일 전\n",
345-
" if lag < len(df_features):\n",
346-
" df_features[f'전력부하_lag{lag}'] = df_features['전력부하합계'].shift(lag)\n",
347-
" df_features[f'기온_lag{lag}'] = df_features['기온'].shift(lag)\n",
348-
" # 9. 이동 평균 및 표준편차\n",
349-
" for window in [24, 168]: # 24시간, 1주일\n",
350-
" if window < len(df_features):\n",
351-
" df_features[f'전력부하_ma{window}'] = df_features['전력부하합계'].rolling(window=window, min_periods=1).mean()\n",
352-
" df_features[f'기온_ma{window}'] = df_features['기온'].rolling(window=window, min_periods=1).mean()\n",
353-
" df_features[f'전력부하_std{window}'] = df_features['전력부하합계'].rolling(window=window, min_periods=1).std()\n",
354-
" \n",
355-
" # 10. 상호작용 특성\n",
356-
" df_features['기온_시간'] = df_features['기온'] * df_features['시']\n",
357-
" df_features['기온_제곱'] = df_features['기온'] ** 2\n",
358-
" df_features['기온_세제곱'] = df_features['기온'] ** 3\n",
359-
" df_features['여름_오후'] = df_features['여름'] * df_features['오후']\n",
360-
" df_features['겨울_저녁'] = df_features['겨울'] * df_features['저녁']\n",
361-
" df_features['주말_오전'] = df_features['주말'] * df_features['오전']\n",
362-
" \n",
363-
" # 결측치 처리 (지연 특성으로 인한)\n",
364-
" df_features = df_features.fillna(method='bfill').fillna(method='ffill')\n",
365-
" \n",
366-
" return df_features\n",
367-
"\n",
368-
"# 특성 공학\n",
369-
"df_engineered = advanced_feature_engineering(df)\n",
370-
"df_engineered"
261+
"# 2. 시간 기반 특성 (순환 인코딩)\n",
262+
"df['시간_sin'] = np.sin(2 * np.pi * df['시'] / 24)\n",
263+
"df['시간_cos'] = np.cos(2 * np.pi * df['시'] / 24)\n",
264+
"df['월_sin'] = np.sin(2 * np.pi * df['월'] / 12)\n",
265+
"df['월_cos'] = np.cos(2 * np.pi * df['월'] / 12)\n",
266+
"df['요일_sin'] = np.sin(2 * np.pi * df['요일'] / 7)\n",
267+
"df['요일_cos'] = np.cos(2 * np.pi * df['요일'] / 7)\n",
268+
"\n",
269+
"# 3. 시간대 구분\n",
270+
"df['주말'] = (df['요일'] >= 5).astype(int)\n",
271+
"df['새벽'] = ((df['시'] >= 0) & (df['시'] < 6)).astype(int)\n",
272+
"df['오전'] = ((df['시'] >= 6) & (df['시'] < 12)).astype(int)\n",
273+
"df['오후'] = ((df['시'] >= 12) & (df['시'] < 18)).astype(int)\n",
274+
"df['저녁'] = ((df['시'] >= 18) & (df['시'] < 24)).astype(int)\n",
275+
"df['오전피크'] = ((df['시'] >= 8) & (df['시'] <= 10)).astype(int)\n",
276+
"df['저녁피크'] = ((df['시'] >= 18) & (df['시'] <= 20)).astype(int)\n",
277+
"\n",
278+
"# 4. 계절 기반 특성\n",
279+
"df['봄'] = df['월'].isin([3, 4, 5]).astype(int)\n",
280+
"df['여름'] = df['월'].isin([6, 7, 8]).astype(int)\n",
281+
"df['가을'] = df['월'].isin([9, 10, 11]).astype(int)\n",
282+
"df['겨울'] = df['월'].isin([12, 1, 2]).astype(int)\n",
283+
"\n",
284+
"# 5. 기상 기반 특성\n",
285+
"df['냉방도일'] = np.maximum(0, df['기온'] - 24)\n",
286+
"df['난방도일'] = np.maximum(0, 18 - df['기온'])\n",
287+
"df['불쾌지수'] = 1.8 * df['기온'] - 0.55 * (1 - df['상대습도']/100) * (1.8 * df['기온'] - 26) + 32\n",
288+
"df['체감온도'] = df['기온'] - 0.4 * (df['기온'] - 10) * (1 - df['상대습도']/100)\n",
289+
"\n",
290+
"# 6. 극한 기상 조건\n",
291+
"df['고온'] = (df['기온'] > df['기온'].quantile(0.9)).astype(int)\n",
292+
"df['저온'] = (df['기온'] < df['기온'].quantile(0.1)).astype(int)\n",
293+
"df['고습도'] = (df['상대습도'] > df['상대습도'].quantile(0.9)).astype(int)\n",
294+
"df['강풍'] = (df['풍속'] > df['풍속'].quantile(0.9)).astype(int)\n",
295+
"\n",
296+
"# 7. 전력 관련 파생 특성\n",
297+
"df['이용률'] = df['전력부하합계'] / (df['계약전력합계'] + 1e-6)\n",
298+
"df['전력밀도'] = df['전력부하합계'] / (df['공동주택수'] + 1e-6)\n",
299+
"df['단지당계약전력'] = df['계약전력합계'] / (df['공동주택수'] + 1e-6)\n",
300+
"df['정규화부하'] = df['전력부하합계'] / (df['계약전력합계'] * df['공동주택수'] / 100 + 1e-6)\n",
301+
"\n",
302+
"\n",
303+
"# Combine columns into a datetime string\n",
304+
"dt_str = (\n",
305+
" df['연도'].astype(str) + '-' +\n",
306+
" df['월'].astype(str).str.zfill(2) + '-' +\n",
307+
" df['일'].astype(str).str.zfill(2) + ' ' +\n",
308+
" df['시'].astype(str).str.zfill(2) + ':00:00'\n",
309+
")\n",
310+
"\n",
311+
"# Find rows where hour is 24\n",
312+
"mask_24 = df['시'] == 24\n",
313+
"\n",
314+
"# Set hour to 0 for those rows\n",
315+
"dt_str[mask_24] = (\n",
316+
" df.loc[mask_24, '연도'].astype(str) + '-' +\n",
317+
" df.loc[mask_24, '월'].astype(str).str.zfill(2) + '-' +\n",
318+
" df.loc[mask_24, '일'].astype(str).str.zfill(2) + ' 00:00:00'\n",
319+
")\n",
320+
"\n",
321+
"# Convert to datetime\n",
322+
"df['날짜'] = pd.to_datetime(dt_str, format='%Y-%m-%d %H:%M:%S')\n",
323+
"\n",
324+
"\n",
325+
"# 8. 시계열 지연 특성 (데이터 시간순 정렬 필요)\n",
326+
"# '연도', '월', '일', '시' 컬럼을 이용해 '날짜' 컬럼 생성 (datetime 대체)\n",
327+
"\n",
328+
"df = df.sort_values('날짜').reset_index(drop=True)\n",
329+
"for lag in [1, 24, 168]: # 1시간, 1일, 1주일 전\n",
330+
" if lag < len(df):\n",
331+
" df[f'전력부하_lag{lag}'] = df['전력부하합계'].shift(lag)\n",
332+
" df[f'기온_lag{lag}'] = df['기온'].shift(lag)\n",
333+
"# 9. 이동 평균 및 표준편차\n",
334+
"for window in [24, 168]: # 24시간, 1주일\n",
335+
" if window < len(df):\n",
336+
" df[f'전력부하_ma{window}'] = df['전력부하합계'].rolling(window=window, min_periods=1).mean()\n",
337+
" df[f'기온_ma{window}'] = df['기온'].rolling(window=window, min_periods=1).mean()\n",
338+
" df[f'전력부하_std{window}'] = df['전력부하합계'].rolling(window=window, min_periods=1).std()\n",
339+
"\n",
340+
"# 10. 상호작용 특성\n",
341+
"df['기온_시간'] = df['기온'] * df['시']\n",
342+
"df['기온_제곱'] = df['기온'] ** 2\n",
343+
"df['기온_세제곱'] = df['기온'] ** 3\n",
344+
"df['여름_오후'] = df['여름'] * df['오후']\n",
345+
"df['겨울_저녁'] = df['겨울'] * df['저녁']\n",
346+
"df['주말_오전'] = df['주말'] * df['오전']\n",
347+
"\n",
348+
"# 결측치 처리 (지연 특성으로 인한)\n",
349+
"df = df.fillna(method='bfill').fillna(method='ffill')"
371350
]
372351
},
373352
{
@@ -398,7 +377,7 @@
398377
},
399378
{
400379
"cell_type": "code",
401-
"execution_count": 54,
380+
"execution_count": 18,
402381
"metadata": {},
403382
"outputs": [],
404383
"source": [
@@ -432,7 +411,7 @@
432411
},
433412
{
434413
"cell_type": "code",
435-
"execution_count": 57,
414+
"execution_count": 21,
436415
"metadata": {},
437416
"outputs": [],
438417
"source": [
@@ -479,7 +458,7 @@
479458
},
480459
{
481460
"cell_type": "code",
482-
"execution_count": 59,
461+
"execution_count": 23,
483462
"metadata": {},
484463
"outputs": [],
485464
"source": [
@@ -557,7 +536,7 @@
557536
},
558537
{
559538
"cell_type": "code",
560-
"execution_count": 61,
539+
"execution_count": 25,
561540
"metadata": {},
562541
"outputs": [],
563542
"source": [
@@ -628,7 +607,7 @@
628607
},
629608
{
630609
"cell_type": "code",
631-
"execution_count": 73,
610+
"execution_count": 30,
632611
"metadata": {},
633612
"outputs": [],
634613
"source": [
@@ -657,61 +636,7 @@
657636
"execution_count": null,
658637
"metadata": {},
659638
"outputs": [],
660-
"source": [
661-
"# 특성 공학\n",
662-
"df_engineered = advanced_feature_engineering(df)"
663-
]
664-
},
665-
{
666-
"cell_type": "code",
667-
"execution_count": null,
668-
"metadata": {},
669-
"outputs": [],
670-
"source": [
671-
"# 데이터 준비\n",
672-
"X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled, scaler = prepare_modeling_data(df_engineered)\n"
673-
]
674-
},
675-
{
676-
"cell_type": "code",
677-
"execution_count": null,
678-
"metadata": {},
679-
"outputs": [],
680-
"source": [
681-
"\n",
682-
"# 모델 평가\n",
683-
"model_results = comprehensive_model_evaluation(X_train_scaled, X_test_scaled, y_train, y_test)\n"
684-
]
685-
},
686-
{
687-
"cell_type": "code",
688-
"execution_count": null,
689-
"metadata": {},
690-
"outputs": [],
691-
"source": [
692-
"\n",
693-
"# 하이퍼파라미터 최적화\n",
694-
"best_model = hyperparameter_optimization(X_train_scaled, y_train)\n",
695-
"\n",
696-
"# 앙상블 모델\n",
697-
"ensemble_model = create_ensemble_model(X_train_scaled, y_train)\n",
698-
"\n",
699-
"# 시계열 교차 검증\n",
700-
"tscv = TimeSeriesSplit(n_splits=3)\n",
701-
"cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=tscv, scoring='r2')\n",
702-
"\n",
703-
"print(f\"\\n최종 모델 성능:\")\n",
704-
"print(f\"시계열 교차검증 R²: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})\")\n",
705-
"\n",
706-
"# 9. 모델 저장 (pickle 사용)\n",
707-
"import pickle\n",
708-
"with open('best_power_prediction_model.pkl', 'wb') as f:\n",
709-
" pickle.dump(best_model, f)\n",
710-
"with open('scaler.pkl', 'wb') as f:\n",
711-
" pickle.dump(scaler, f)\n",
712-
"\n",
713-
"print(\"모델 및 스케일러 저장 완료\")\n"
714-
]
639+
"source": []
715640
},
716641
{
717642
"cell_type": "code",

0 commit comments

Comments
 (0)