Skip to content

Commit eef0d14

Browse files
committed
update streamlit app
1 parent d693cb2 commit eef0d14

File tree

10 files changed

+1039
-956
lines changed

10 files changed

+1039
-956
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,7 @@ if archived_tweets:
9797

9898
A prototype written in Python with the Streamlit framework and hosted on Streamlit Cloud.
9999

100-
> [!NOTE]
101-
> Starting from version 1.0, the web app will not receive all updates from the official package. To access all features, prefer the package via PyPI.
100+
Important: Starting from version 1.0, the web app will no longer receive all updates from the official package. To access all features, prefer using the package from PyPI.
102101

103102
## Documentation
104103

app/app.py

Lines changed: 81 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import base64
21
from datetime import datetime, timedelta
32

43
import streamlit as st
@@ -8,17 +7,32 @@
87
from waybacktweets.api.parse import TweetsParser
98
from waybacktweets.api.request import WaybackTweets
109
from waybacktweets.api.visualize import HTMLTweetsVisualizer
11-
from waybacktweets.config import FIELD_OPTIONS, config
10+
from waybacktweets.config import config
1211

1312
# ------ Initial Settings ------ #
1413

1514
PAGE_ICON = "assets/parthenon.png"
1615
TITLE = "assets/waybacktweets.png"
17-
DOWNLOAD = "assets/download.svg"
16+
FIELD_OPTIONS = [
17+
"archived_urlkey",
18+
"archived_timestamp",
19+
"parsed_archived_timestamp",
20+
"archived_tweet_url",
21+
"parsed_archived_tweet_url",
22+
"original_tweet_url",
23+
"parsed_tweet_url",
24+
"available_tweet_text",
25+
"available_tweet_is_RT",
26+
"available_tweet_info",
27+
"archived_mimetype",
28+
"archived_statuscode",
29+
"archived_digest",
30+
"archived_length",
31+
]
1832

1933
collapse = None
2034
matchtype = None
21-
start_date = datetime.now() - timedelta(days=365)
35+
start_date = datetime.now() - timedelta(days=30 * 6)
2236
end_date = datetime.now()
2337
min_date = datetime(2006, 1, 1)
2438

@@ -34,13 +48,9 @@
3448
layout="centered",
3549
menu_items={
3650
"About": f"""
37-
[![License](https://img.shields.io/github/license/claromes/waybacktweets)](https://github.com/claromes/waybacktweets/blob/main/LICENSE.md)
51+
© 2023-{end_date.year} [Claromes](https://claromes.com). Licensed under the [GPL-3.0](https://raw.githubusercontent.com/claromes/waybacktweets/refs/heads/main/LICENSE.md). Icon by The Doodle Library. Title font by Google, licensed under the Open Font License (OFL).
3852
39-
The application is a prototype hosted on Streamlit Cloud, serving as an alternative to the command line tool.
40-
41-
© 2023 - {end_date.year}, [Claromes](https://claromes.com)
42-
43-
---
53+
---
4454
""", # noqa: E501
4555
"Report a bug": "https://github.com/claromes/waybacktweets/issues",
4656
},
@@ -55,7 +65,7 @@
5565
st.session_state.count = False
5666

5767
if "archived_timestamp_filter" not in st.session_state:
58-
st.session_state.archived_timestamp_filter = None
68+
st.session_state.archived_timestamp_filter = (start_date, end_date)
5969

6070
if "username_value" not in st.session_state:
6171
st.session_state.username_value = ""
@@ -97,7 +107,7 @@
97107
"""
98108
)
99109

100-
# ------ Requestings ------ #
110+
# ------ Functions ------ #
101111

102112

103113
@st.cache_data(ttl=600, show_spinner=False)
@@ -107,7 +117,6 @@ def wayback_tweets(
107117
timestamp_from,
108118
timestamp_to,
109119
limit,
110-
offset,
111120
matchtype,
112121
):
113122
response = WaybackTweets(
@@ -116,21 +125,22 @@ def wayback_tweets(
116125
timestamp_from,
117126
timestamp_to,
118127
limit,
119-
offset,
120128
matchtype,
121129
)
122130
archived_tweets = response.get()
123131

124132
return archived_tweets
125133

126134

135+
@st.cache_data(ttl=600, show_spinner=False)
127136
def tweets_parser(archived_tweets, username, field_options):
128137
parser = TweetsParser(archived_tweets, username, field_options)
129138
parsed_tweets = parser.parse()
130139

131140
return parsed_tweets
132141

133142

143+
@st.cache_data(ttl=600, show_spinner=False)
134144
def tweets_exporter(parsed_tweets, username, field_options):
135145
exporter = TweetsExporter(parsed_tweets, username, field_options)
136146

@@ -164,75 +174,57 @@ def scroll_page():
164174
st.session_state.update_component += 1
165175
scroll_page()
166176

167-
# ------ User Interface Settings ------ #
168-
169-
st.image(TITLE, use_column_width="never")
170-
st.caption(
171-
"[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/claromes/waybacktweets?include_prereleases)](https://github.com/claromes/waybacktweets/releases)" # noqa: E501
172-
)
173-
st.write(
174-
"Retrieves archived tweets CDX data in HTML (for easy viewing of the tweets), CSV, and JSON formats." # noqa: E501
175-
)
177+
# ------ UI Settings ------ #
176178

179+
st.image(TITLE, use_container_width="never")
177180
st.write(
178-
"For better performance, use the CLI version, available on [PyPI](https://pypi.org/project/waybacktweets)." # noqa: E501
181+
"Retrieves archived tweets CDX data in HTML, CSV, and JSON formats." # noqa: E501
179182
)
180183

181184
st.write(
182-
"To access the legacy version of Wayback Tweets, [click here](https://waybacktweets-legacy.streamlit.app)." # noqa: E501
185+
"This application is a prototype based on the Python package and does not include all available features. To explore the package, including CLI and Module usage, visit the [GitHub repository](https://github.com/claromes/waybacktweets)." # noqa: E501
183186
)
184187

185188
st.divider()
186189

187190
# -- Filters -- #
188191

189192
username = st.text_input(
190-
"Username *",
193+
"Username",
191194
value=st.session_state.username_value,
192195
key="username",
193196
placeholder="Without @",
194197
)
195198

196-
with st.expander("Filtering", expanded=st.session_state.expanded_value):
197-
198-
col1, col2 = st.columns(2)
199-
200-
with col1:
201-
limit = st.number_input(
202-
"Limit",
203-
value=500,
204-
max_value=500,
205-
key="limit",
206-
help="Query result limits. A maximum of 500 tweets per search to enhance the tool's performance", # noqa: E501
207-
)
208-
209-
with col2:
210-
offset = st.text_input(
211-
"Offset",
212-
key="offset",
213-
help="Enables efficient pagination. For instance, after retrieving an initial batch of 500 tweets, setting an offset of 500 fetches the next batch from 501 to 1000", # noqa: E501
214-
)
199+
st.session_state.archived_timestamp_filter = st.date_input(
200+
"Tweets saved between",
201+
(start_date, end_date),
202+
min_date,
203+
end_date,
204+
format="YYYY/MM/DD",
205+
help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
206+
)
207+
st.caption(
208+
":gray[Note: Large date ranges may take longer to process and exceed the app's resource limits. Use smaller ranges for faster results.]" # noqa: E501
209+
)
215210

216-
st.session_state.archived_timestamp_filter = st.date_input(
217-
"Tweets saved between",
218-
None,
219-
min_date,
220-
end_date,
221-
format="YYYY/MM/DD",
222-
help="Using the `from` and `to` filters. Format: YYYY/MM/DD",
223-
)
211+
limit = st.text_input(
212+
"Limit",
213+
key="limit",
214+
help="Query result limits",
215+
)
224216

225-
unique = st.checkbox(
226-
"Only unique Wayback Machine URLs",
227-
key="unique",
228-
help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
229-
)
230-
st.caption(
231-
":orange[note: according to the official documentation of the Wayback CDX Server API, the query to retrieve unique URLs may be slow at the moment.]" # noqa: E501
232-
)
217+
unique = st.checkbox(
218+
"Only unique Wayback Machine URLs",
219+
key="unique",
220+
help="Filtering by the collapse option using the `urlkey` field and the URL Match Scope `prefix`", # noqa: E501
221+
)
222+
st.caption(
223+
":gray[Note: As noted in the official Wayback CDX Server API documentation, retrieving unique URLs may experience slow performance at this time.]" # noqa: E501
224+
)
233225

234226

235-
query = st.button("Query", type="primary", use_container_width=True)
227+
query = st.button("Go", type="primary", use_container_width=True)
236228

237229
if st.query_params.username == "":
238230
st.query_params.clear()
@@ -248,34 +240,22 @@ def scroll_page():
248240
collapse = "urlkey"
249241
matchtype = "prefix"
250242

251-
archived_timestamp_from = None
252-
archived_timestamp_to = None
253-
254-
if st.session_state.archived_timestamp_filter:
255-
archived_timestamp_from = st.session_state.archived_timestamp_filter[0]
256-
archived_timestamp_to = st.session_state.archived_timestamp_filter[1]
257-
258243
try:
259-
with st.spinner(
260-
f"Retrieving the archived tweets of @{st.session_state.current_username}..."
261-
):
244+
with st.spinner(f"Retrieving @{st.session_state.current_username}..."):
262245
wayback_tweets = wayback_tweets(
263246
st.session_state.current_username,
264247
collapse,
265-
archived_timestamp_from,
266-
archived_timestamp_to,
248+
st.session_state.archived_timestamp_filter[0],
249+
st.session_state.archived_timestamp_filter[1],
267250
limit,
268-
offset,
269251
matchtype,
270252
)
271253

272254
if not wayback_tweets:
273255
st.error("No data was saved due to an empty response.")
274256
st.stop()
275257

276-
with st.spinner(
277-
f"Parsing the archived tweets of @{st.session_state.current_username}"
278-
):
258+
with st.spinner(f"Parsing @{st.session_state.current_username}..."):
279259
parsed_tweets = tweets_parser(
280260
wayback_tweets, st.session_state.current_username, FIELD_OPTIONS
281261
)
@@ -292,73 +272,47 @@ def scroll_page():
292272
# -- Rendering -- #
293273

294274
st.session_state.count = len(df)
295-
st.write(f"**{st.session_state.count} URLs have been captured**")
275+
st.caption(f"{st.session_state.count} URLs have been captured.")
296276

297277
tab1, tab2, tab3 = st.tabs(["HTML", "CSV", "JSON"])
298278

299279
# -- HTML -- #
300280
with tab1:
301-
st.write(
302-
f"Visualize tweets more efficiently through iframe tags. Download the @{st.session_state.current_username}'s archived tweets in HTML." # noqa: E501
281+
st.download_button(
282+
label=f"Download @{st.session_state.current_username} in HTML",
283+
data=html_content,
284+
file_name=f"{file_name}.html",
285+
mime="text/html",
286+
icon=":material/download:",
303287
)
304288

305-
col5, col6 = st.columns([1, 18])
306-
307-
with col5:
308-
st.image(DOWNLOAD, width=22)
309-
310-
with col6:
311-
b64_html = base64.b64encode(html_content.encode()).decode()
312-
href_html = f"data:text/html;base64,{b64_html}"
313-
314-
st.markdown(
315-
f'<a href="{href_html}" download="{file_name}.html" title="Download {file_name}.html">{file_name}.html</a>', # noqa: E501
316-
unsafe_allow_html=True,
317-
)
289+
st.caption("Note: The iframes are best viewed in Firefox.")
318290

319291
# -- CSV -- #
320292
with tab2:
321-
st.write(
322-
"Check the data returned in the dataframe below and download the file."
293+
st.download_button(
294+
label=f"Download @{st.session_state.current_username} in CSV",
295+
data=csv_data,
296+
file_name=f"{file_name}.csv",
297+
mime="text/csv",
298+
icon=":material/download:",
323299
)
324300

325-
col7, col8 = st.columns([1, 18])
326-
327-
with col7:
328-
st.image(DOWNLOAD, width=22)
329-
330-
with col8:
331-
b64_csv = base64.b64encode(csv_data.encode()).decode()
332-
href_csv = f"data:file/csv;base64,{b64_csv}"
333-
334-
st.markdown(
335-
f'<a href="{href_csv}" download="{file_name}.csv" title="Download {file_name}.csv">{file_name}.csv</a>', # noqa: E501
336-
unsafe_allow_html=True,
337-
)
338-
301+
st.caption("Preview:")
339302
st.dataframe(df, use_container_width=True)
340303

341304
# -- JSON -- #
342305
with tab3:
343-
st.write(
344-
"Check the data returned in JSON format below and download the file."
306+
st.download_button(
307+
label=f"Download @{st.session_state.current_username} in JSON",
308+
data=json_data,
309+
file_name=f"{file_name}.json",
310+
mime="application/json",
311+
icon=":material/download:",
345312
)
346313

347-
col9, col10 = st.columns([1, 18])
348-
349-
with col9:
350-
st.image(DOWNLOAD, width=22)
351-
352-
with col10:
353-
b64_json = base64.b64encode(json_data.encode()).decode()
354-
href_json = f"data:file/json;base64,{b64_json}"
355-
356-
st.markdown(
357-
f'<a href="{href_json}" download="{file_name}.json" title="Download {file_name}.json">{file_name}.json</a>', # noqa: E501
358-
unsafe_allow_html=True,
359-
)
360-
361-
st.json(json_data, expanded=False)
314+
st.caption("Preview:")
315+
st.json(json_data, expanded=1)
362316
except TypeError as e:
363317
st.error(
364318
f"""

app/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
streamlit==1.36.0
2-
waybacktweets==1.0a5
1+
streamlit==1.45.0
2+
waybacktweets

assets/download.svg

Lines changed: 0 additions & 9 deletions
This file was deleted.

docs/conf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,12 @@
3434
html_css_files = ["css/custom.css"]
3535
html_context = {
3636
"project_links": [
37-
ProjectLink("PyPI Releases", "https://pypi.org/project/waybacktweets/"),
37+
ProjectLink("PyPI", "https://pypi.org/project/waybacktweets/"),
3838
ProjectLink("Source Code", "https://github.com/claromes/waybacktweets/"),
39+
ProjectLink(
40+
"License",
41+
"https://raw.githubusercontent.com/claromes/waybacktweets/refs/heads/main/LICENSE.md", # noqa: E501
42+
),
3943
ProjectLink(
4044
"Issue Tracker", "https://github.com/claromes/waybacktweets/issues/"
4145
),

0 commit comments

Comments (0)