-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathupdates.py
More file actions
191 lines (153 loc) · 7.17 KB
/
updates.py
File metadata and controls
191 lines (153 loc) · 7.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
from datetime import datetime
from typing import List, Optional
import logging
from dataclasses import dataclass
import pandas as pd
@dataclass
class Update:
"""Represents a single update entry with date and content."""
date: datetime
content: str
def __str__(self) -> str:
"""String representation of the update."""
return f"{self.date.strftime('%Y-%m-%d')}: {self.content}"
def __repr__(self) -> str:
"""Official string representation of the update."""
return f"Update(date={self.date!r}, content={self.content!r})"
class UpdateReader:
"""
Reads and manages a collection of updates from a public Google Spreadsheet.
This class uses pandas to read updates directly from the Google Sheets CSV export.
"""
def __init__(self, spreadsheet_id: str, sheet_gid: Optional[str] = None):
"""
Initialize the UpdateReader with a Google Spreadsheet ID.
Args:
spreadsheet_id: The ID of the Google Spreadsheet containing updates
sheet_gid: Optional specific sheet GID number (found in sheet URL)
"""
self.spreadsheet_id = spreadsheet_id
self.sheet_gid = sheet_gid
self.updates: List[Update] = []
self.metadata = {
"title": "Recent Updates",
"description": "Recent updates and news about the project",
"link_text": "View all updates"
}
self.logger = logging.getLogger(__name__)
@property
def update_count(self) -> int:
"""Returns the number of updates currently stored."""
return len(self.updates)
@property
def spreadsheet_url(self) -> str:
"""Returns the URL to the Google Spreadsheet."""
return f"https://docs.google.com/spreadsheets/d/{self.spreadsheet_id}/edit"
@property
def csv_export_url(self) -> str:
"""Returns the CSV export URL for the spreadsheet."""
base_url = f"https://docs.google.com/spreadsheets/d/{self.spreadsheet_id}/export?format=csv"
if self.sheet_gid:
return f"{base_url}&gid={self.sheet_gid}"
return base_url
def get_recent_updates(self, limit: int = 5) -> List[Update]:
"""
Get the most recent updates.
Args:
limit: Maximum number of updates to return
Returns:
List of the most recent updates (already sorted by date, newest first)
"""
return self.updates[:limit]
def get_all_updates(self) -> List[Update]:
"""
Get all updates.
Returns:
List of all updates (already sorted by date, newest first)
"""
return self.updates
def get_updates_by_date_range(self, start_date: datetime, end_date: datetime) -> List[Update]:
"""
Get updates within a specific date range.
Args:
start_date: The start date (inclusive)
end_date: The end date (inclusive)
Returns:
List of updates within the specified date range
"""
return [
update for update in self.updates
if start_date <= update.date <= end_date
]
def load_from_spreadsheet(self, date_column: str = 'Date', content_column: str = 'Content',
date_format: Optional[str] = None, dayfirst: bool = True):
"""
Load updates from the Google Spreadsheet using pandas and the CSV export URL.
Args:
date_column: Name of the column containing dates
content_column: Name of the column containing update content
date_format: Optional explicit date format string (e.g., '%d/%m/%Y')
dayfirst: Whether the date format has day before month (e.g., 31/12/2023 vs 12/31/2023)
Returns:
The number of updates loaded
"""
try:
# Load the CSV directly using pandas
self.logger.info(f"Loading updates from spreadsheet URL: {self.csv_export_url}")
df = pd.read_csv(self.csv_export_url)
# Check if required columns exist
if date_column not in df.columns or content_column not in df.columns:
available_columns = ', '.join(df.columns)
self.logger.error(f"Required columns not found. Available columns: {available_columns}")
raise ValueError(f"Required columns {date_column} and/or {content_column} not found in spreadsheet")
# Process the data
self.updates = []
# Convert date column to datetime with explicit format if provided
if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
if date_format:
# Use explicit format if provided
df[date_column] = pd.to_datetime(df[date_column], format=date_format, errors='coerce')
else:
# Otherwise use pandas inference with dayfirst parameter
df[date_column] = pd.to_datetime(df[date_column], dayfirst=dayfirst, errors='coerce')
# Drop rows with invalid dates
df = df.dropna(subset=[date_column])
# Create Update objects
for _, row in df.iterrows():
try:
date = row[date_column]
# Convert pandas timestamp to Python datetime if needed
if isinstance(date, pd.Timestamp):
date = date.to_pydatetime()
content = row[content_column]
self.updates.append(Update(date=date, content=content))
except Exception as e:
self.logger.warning(f"Error processing row {row}: {e}")
# Sort updates by date (newest first)
self._sort_updates()
self.logger.info(f"Loaded {len(self.updates)} updates from spreadsheet")
return len(self.updates)
except Exception as e:
self.logger.error(f"Error loading spreadsheet: {e}")
raise
def _sort_updates(self) -> None:
"""Sort updates by date in descending order (newest first)."""
self.updates.sort(key=lambda update: update.date, reverse=True)
# Example usage
if __name__ == "__main__":
# Configure logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Use the actual spreadsheet ID
reader = UpdateReader("1P3W_ybjMBduJWohitdB5fdXZEqrknS25FzYmMkxqmG0")
# Load updates from the spreadsheet with explicit date handling
# Use dayfirst=True if dates are in DD/MM/YYYY format
reader.load_from_spreadsheet(date_format='%d/%m/%Y', dayfirst=True)
# Alternative: if you don't know the format exactly but know day comes first
# reader.load_from_spreadsheet(dayfirst=True)
# Print total number of updates
print(f"Total updates: {reader.update_count}")
# Print recent updates
print("\nRecent updates:")
for update in reader.get_recent_updates(5):
print(update)