-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
171 lines (144 loc) · 5.83 KB
/
utils.py
File metadata and controls
171 lines (144 loc) · 5.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import pandas as pd
import numpy as np
import streamlit as st
from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
import logging
logger = logging.getLogger(__name__)
# Define COLUMN_MAPPING as a constant
COLUMN_MAPPING = {
'tanggal': 'date',
'judul': 'title',
'sentimen': 'sentiment',
'sumber': 'source',
'isi': 'content',
'kategori': 'category'
}
def create_metric_card(title, value, icon, color):
"""Create modern metric card with hover effect"""
return f"""
<div style="
background: #ffffff;
border-radius: 10px;
padding: 15px;
border-left: 4px solid {color};
margin-bottom: 15px;
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
transition: transform 0.2s;
">
<div style="display: flex; align-items: center; gap: 15px;">
<div style="font-size: 28px; color: {color};">{icon}</div>
<div>
<div style="font-size: 14px; color: #666;">{title}</div>
<div style="font-size: 24px; font-weight: bold; color: {color};">{value}</div>
</div>
</div>
</div>
"""
def validate_and_clean_data(df):
"""Validate and clean the news data"""
# Use the defined COLUMN_MAPPING
column_mapping = COLUMN_MAPPING
# Rename columns
df.columns = [col.strip().lower() for col in df.columns]
df.rename(columns=column_mapping, inplace=True)
# Check required columns
required_columns = ['date', 'title', 'source', 'content']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
raise ValueError(f"Kolom wajib tidak ditemukan: {', '.join(missing_columns)}")
# Clean date
if 'date' in df.columns:
df['date'] = pd.to_datetime(df['date'], errors='coerce', format='%Y-%m-%d')
invalid_dates = df[df['date'].isna()]
if not invalid_dates.empty:
logger.warning(f"Ditemukan {len(invalid_dates)} baris dengan tanggal tidak valid")
# Clean sentiment
if 'sentiment' in df.columns:
valid_sentiments = ['positif', 'negatif', 'netral']
df['sentiment'] = df['sentiment'].str.lower()
invalid_sentiments = ~df['sentiment'].isin(valid_sentiments)
if invalid_sentiments.any():
logger.warning(f"Ditemukan {invalid_sentiments.sum()} sentimen tidak valid")
return df
def generate_data_profile(df):
"""Generate comprehensive data profile"""
profile = {
'total_news': len(df),
'sources': {},
'date_range': None,
'sentiment_dist': {}
}
if 'date' in df.columns and not df.empty:
min_date = df['date'].min()
max_date = df['date'].max()
if pd.notnull(min_date) and pd.notnull(max_date):
profile['date_range'] = {
'min': min_date.strftime('%Y-%m-%d'),
'max': max_date.strftime('%Y-%m-%d')
}
if 'source' in df.columns and not df.empty:
profile['sources'] = df['source'].value_counts().head(5).to_dict()
if 'sentiment' in df.columns and not df.empty:
sentiment_counts = df['sentiment'].value_counts(normalize=True)
profile['sentiment_dist'] = sentiment_counts.to_dict()
return profile
def plot_sentiment_timeseries(df):
"""Plot sentiment over time"""
if 'date' not in df.columns or 'sentiment' not in df.columns:
return None
# Resample by week
df_weekly = df.set_index('date').resample('W')['sentiment'].value_counts().unstack().fillna(0)
df_weekly = df_weekly.div(df_weekly.sum(axis=1), axis=0)
fig = px.area(
df_weekly,
title='Proporsi Sentimen Mingguan',
labels={'value': 'Proporsi', 'date': 'Tanggal'},
color_discrete_map={
'positif': '#2ca02c',
'netral': '#7f7f7f',
'negatif': '#d62728'
}
)
fig.update_layout(yaxis_tickformat=".0%")
return fig
def process_upload(uploaded_file):
"""Proses file CSV yang diupload"""
try:
# Deteksi format file
if uploaded_file.name.endswith('.csv'):
df = pd.read_csv(uploaded_file)
elif uploaded_file.name.endswith(('.xlsx', '.xls')):
df = pd.read_excel(uploaded_file)
else:
return None, "Format file tidak didukung. Gunakan CSV atau Excel."
# Auto rename columns
df.columns = [col.strip().lower() for col in df.columns]
# Use the defined COLUMN_MAPPING
df.rename(columns=lambda x: COLUMN_MAPPING.get(x, x), inplace=True)
# Validasi kolom wajib
required_columns = ['date', 'title', 'sentiment', 'source', 'content']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
return None, f"Kolom wajib tidak ditemukan: {', '.join(missing_columns)}"
# Konversi tanggal
df['date'] = pd.to_datetime(df['date'], errors='coerce')
# Validasi sentimen
valid_sentiments = ['positif', 'negatif', 'netral']
invalid_sentiments = df[~df['sentiment'].str.lower().isin(valid_sentiments)]
if not invalid_sentiments.empty:
st.warning(f"Terdapat {len(invalid_sentiments)} sentimen tidak valid. Hanya gunakan: Positif, Negatif, Netral")
return df, None
except Exception as e:
return None, f"Error memproses file: {str(e)}"
def generate_template():
"""Generate sample CSV template"""
data = {
'date': ['2024-01-01', '2024-01-02'],
'title': ['Contoh Judul Berita 1', 'Contoh Judul Berita 2'],
'sentiment': ['Positif', 'Netral'],
'source': ['Media Satu', 'Media Dua'],
'content': ['Isi berita pertama...', 'Isi berita kedua...']
}
return pd.DataFrame(data)