Skip to content

Commit bd42692

Browse files
Add search feature to copypasta dashboard + top 100 search filtering and edge case fixes (#248)
* add search features as a test * Add search functionality to ngram web app * Fix ngram web app issues * chore: format code with black and isort * refactor: cleanup and merging search functionalities * fix: move click information to a separate routine * feat: update the data viewer panel to work with filtering * feat: add optional df arg to get_top_n_stats * feat: handle reset button in data viewer * ux: formatting in data_info --------- Co-authored-by: jaehoon <jaehoonr@gwu.edu>
1 parent b562f25 commit bd42692

File tree

1 file changed

+216
-63
lines changed
  • analyzers/ngrams/ngram_web

1 file changed

+216
-63
lines changed

analyzers/ngrams/ngram_web/app.py

Lines changed: 216 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -112,26 +112,34 @@ def _get_app_layout(ngram_choices_dict: dict):
112112
app_layout = [
113113
ui.card(
114114
ui.card_header("N-gram statistics"),
115-
ui.input_selectize(
116-
id="ngram_selector",
117-
label="Included n-grams:",
118-
choices=ngram_choices_dict,
119-
selected=list(ngram_choices_dict.keys()),
120-
multiple=True,
115+
ui.layout_columns(
116+
ui.input_text(
117+
id="ngram_content_search",
118+
label="Search N-gram Content:",
119+
placeholder="Enter keywords to search",
120+
),
121+
ui.input_selectize(
122+
id="ngram_length_selector",
123+
label="Included n-grams:",
124+
choices=ngram_choices_dict,
125+
selected=list(ngram_choices_dict.keys()),
126+
multiple=True,
127+
),
128+
ui.div(
129+
ui.input_action_button(
130+
id="reset_button",
131+
label="Clear selection",
132+
fill=False,
133+
),
134+
style="margin-top: 25px;", # Align with labeled inputs
135+
),
136+
col_widths=[4, 4, 4], # Equal width columns
121137
),
122138
output_widget("scatter_plot", height="400px"),
123139
),
124140
ui.card(
125141
ui.card_header("Data viewer"),
126142
ui.output_text(id="data_info"),
127-
ui.div(
128-
ui.input_action_button(
129-
id="reset_button",
130-
label="Reset selection (show summary)",
131-
fill=False,
132-
),
133-
style="display: inline-block; margin-bottom: 10px;",
134-
),
135143
ui.output_data_frame("data_viewer"),
136144
),
137145
]
@@ -140,11 +148,66 @@ def _get_app_layout(ngram_choices_dict: dict):
140148

141149

142150
def server(input, output, sessions):
151+
152+
def _format_data_for_display(df: pl.DataFrame) -> pl.DataFrame:
    """Narrow a dataframe to the data-viewer columns and give them display names.

    Keeps the author id, n-gram words, message text and message timestamp
    columns (in that order) and renames them to their human-readable headers.
    For non-empty input the timestamp column is first rendered as text using
    the "%B %d, %Y %I:%M %p" format; empty input is only narrowed and renamed.
    """
    # Source column -> header shown in the data viewer (order defines layout).
    display_names = {
        COL_AUTHOR_ID: "Unique user ID",
        COL_NGRAM_WORDS: "N-gram content",
        COL_MESSAGE_TEXT: "Post content",
        COL_MESSAGE_TIMESTAMP: "Timestamp",
    }
    wanted_columns = list(display_names)

    if not df.is_empty():
        # Render timestamps as text before narrowing to the display columns.
        with_text_timestamps = df.with_columns(
            pl.col(COL_MESSAGE_TIMESTAMP).dt.strftime("%B %d, %Y %I:%M %p")
        )
        return with_text_timestamps.select(wanted_columns).rename(display_names)

    # Nothing to format: return just the renamed, zero-row column structure.
    return df.select(wanted_columns).head(0).rename(display_names)
173+
143174
@reactive.calc
def get_search_filtered_stats():
    """Return n-gram statistics filtered by search term and n-gram length only.

    Click filtering is deliberately NOT applied here so that clicking a point
    does not re-render the scatter plot; it is applied downstream in
    get_filtered_full_data().

    The search term is matched as literal text on word boundaries: regex
    metacharacters typed by the user are escaped rather than interpreted.

    Returns:
        The rows of the global ``data_stats`` that match the search term (when
        one is given) and whose n-gram length is among the selected lengths.
        A zero-row frame when no n-gram length is selected.
    """
    # Function-scope import so this block does not rely on the module's
    # import section already providing ``re``.
    import re

    global data_stats

    # Start with full stats data
    data_out = data_stats

    # Get filter inputs (excluding clicks)
    ngram_search_term = (input.ngram_content_search() or "").strip()
    # tuple[str], e.g. ('3', '5'), or an empty tuple
    ngram_lengths = input.ngram_length_selector()

    # Apply filters in order: search -> length

    # 1. Filter by search term (if provided)
    if ngram_search_term:
        # Escape the user's text so input such as "(" or "c++" is matched
        # literally instead of raising on an invalid pattern or being read as
        # a regex. Spaces are left unescaped: they are not special, and an
        # escaped space may be rejected by stricter regex engines.
        literal_term = re.escape(ngram_search_term).replace("\\ ", " ")
        # Match the search term as whole word(s)
        regex_pattern = f"\\b{literal_term}\\b"
        data_out = data_out.filter(
            pl.col(COL_NGRAM_WORDS).str.contains(pattern=regex_pattern),
        )

    # 2. Filter by n-gram length (if any selected)
    # If ngram_lengths is empty (user deselected all), return empty dataframe
    if not ngram_lengths:
        return data_out.head(0)

    return data_out.filter(pl.col(COL_NGRAM_LENGTH).is_in(ngram_lengths))
148211

149212
# Store the figure widget reference
150213
current_figure_widget = reactive.value(None)
@@ -165,12 +228,22 @@ def handle_reset_button():
165228
except Exception:
166229
pass
167230

168-
@reactive.calc
169-
def get_top_n_data():
170-
global data_stats
231+
def get_top_n_stats(df: pl.DataFrame = None, n: int = 100) -> pl.DataFrame:
232+
"""Format and return top N n-gram statistics.
233+
234+
Args:
235+
df: Optional dataframe to format. If None, uses global data_stats.
236+
n: Number of top n-grams to return (default 100)
237+
238+
Returns:
239+
Formatted dataframe with sorted and renamed columns
240+
"""
241+
if df is None:
242+
global data_stats
243+
df = data_stats
171244

172245
data_top_n = (
173-
data_stats.select(
246+
df.select(
174247
[
175248
COL_NGRAM_WORDS,
176249
COL_NGRAM_TOTAL_REPS,
@@ -181,7 +254,8 @@ def get_top_n_data():
181254
.sort(
182255
pl.col(COL_NGRAM_TOTAL_REPS),
183256
pl.col(COL_NGRAM_DISTINCT_POSTER_COUNT),
184-
descending=[True, True],
257+
pl.col(COL_NGRAM_LENGTH),
258+
descending=[True, True, False],
185259
)
186260
.rename(
187261
{
@@ -191,39 +265,12 @@ def get_top_n_data():
191265
COL_NGRAM_LENGTH: "N-gram length",
192266
}
193267
)
194-
.head(100)
268+
.head(n)
195269
)
196270
return data_top_n
197271

198272
click_data = reactive.value(None)
199273

200-
@reactive.calc
201-
def get_filtered_data():
202-
clicked = click_data()
203-
204-
SEL_COLUMNS = [
205-
COL_AUTHOR_ID,
206-
COL_NGRAM_WORDS,
207-
COL_MESSAGE_TEXT,
208-
COL_MESSAGE_TIMESTAMP,
209-
]
210-
COL_RENAME = ["Unique user ID", "N-gram content", "Post content", "Timestamp"]
211-
212-
old2new = {k: v for k, v in zip(SEL_COLUMNS, COL_RENAME)}
213-
214-
if clicked and isinstance(clicked, dict) and COL_NGRAM_ID in clicked:
215-
ngram_id = clicked[COL_NGRAM_ID]
216-
filtered = data_full.filter(pl.col(COL_NGRAM_ID) == ngram_id)
217-
218-
filtered = filtered.with_columns(
219-
pl.col(COL_MESSAGE_TIMESTAMP).dt.strftime("%B %d, %Y %I:%M %p")
220-
)
221-
filtered = filtered.select(SEL_COLUMNS).rename(old2new)
222-
return filtered
223-
else:
224-
# Return empty dataframe with the right columns when nothing is clicked
225-
return data_full.select(SEL_COLUMNS).head(0)
226-
227274
def on_point_click(trace, points, state):
228275
if not hasattr(points, "point_inds") or not points.point_inds:
229276
return
@@ -261,7 +308,9 @@ def on_point_click(trace, points, state):
261308

262309
@render_widget
263310
def scatter_plot():
264-
df = get_data()
311+
312+
df = get_search_filtered_stats()
313+
265314
fig = plot_scatter(data=df)
266315

267316
# Create FigureWidget directly from the figure
@@ -276,23 +325,127 @@ def scatter_plot():
276325

277326
return fig_widget
278327

328+
@reactive.calc
def get_filtered_full_data():
    """Return the individual posts matching search, n-gram length and click.

    The search term and n-gram length filters come from
    get_search_filtered_stats(); the click selection (if any) is applied here
    on top of them. The matching rows of the global ``data_full`` are returned
    already formatted for display; when nothing matches, an empty formatted
    frame is returned instead.
    """
    global data_full

    # Search- and length-filtered statistics are the starting point.
    stats = get_search_filtered_stats()

    # Narrow further to the clicked n-gram, when there is one.
    selection = click_data()
    if isinstance(selection, dict) and COL_NGRAM_ID in selection:
        stats = stats.filter(pl.col(COL_NGRAM_ID) == selection[COL_NGRAM_ID])

    # No statistics left: hand back an empty frame with the display columns.
    if stats.is_empty():
        return _format_data_for_display(data_full.head(0))

    # Distinct n-gram ids and lengths that survived the filters.
    matching_ids = stats.get_column(COL_NGRAM_ID).unique()
    matching_lengths = stats.get_column(COL_NGRAM_LENGTH).unique().cast(pl.Int8)

    # Keep only the posts that belong to those n-grams.
    posts = data_full.filter(
        pl.col(COL_NGRAM_ID).is_in(matching_ids)
        & pl.col(COL_NGRAM_LENGTH).is_in(matching_lengths)
    )

    return _format_data_for_display(posts)
365+
279366
@render.text
def data_info() -> str:
    """Describe what the data viewer is currently showing.

    The message depends on the interaction state: a clicked n-gram takes
    precedence, then an active search term, otherwise the default summary
    hint is returned.
    """
    # Click state
    selection = click_data()
    has_click = isinstance(selection, dict) and COL_NGRAM_ID in selection

    # Search state
    content_search = (input.ngram_content_search() or "").strip()

    # Posts matching every active filter
    filtered_data = get_filtered_full_data()

    if has_click:
        if filtered_data.is_empty():
            return "Selected n-gram not found in current filters. Try adjusting your search or n-gram length selection."
        # One row per post, so the row count is the repetition count.
        total_reps = len(filtered_data)
        ngram_string = filtered_data["N-gram content"][0]
        return f"N-gram: '{ngram_string}' — {total_reps:,} total repetitions"

    if not content_search:
        # Neither click nor search: default summary message.
        return "Showing summary (top 100 n-grams by frequency). Click a data point on the scatter plot to view all occurrences."

    # Search without click: summarise the search results.
    search_stats = get_search_filtered_stats()
    if search_stats.is_empty():
        return f"No results found for '{content_search}'. Try adjusting your search or n-gram length selection."

    total_ngrams = len(search_stats)
    if filtered_data.is_empty():
        return f"Search results for '{content_search}': {total_ngrams:,} unique n-grams"

    total_records = len(filtered_data)
    return f"Search results for '{content_search}': {total_ngrams:,} unique n-grams (showing top 100) with {total_records:,} total occurrences"
406+
407+
reset_click_count = reactive.value(0)
289408

290409
@render.data_frame
def data_viewer():
    """Display appropriate data based on user interactions.

    Scenarios:
    1. Reset button pressed since the last render: top 100 n-grams (summary).
    2. Click on n-gram: all individual posts for that n-gram (full data).
    3. Otherwise: top 100 n-grams of the statistics after the search-term and
       n-gram-length filters. With no search term and every length selected
       this is the plain top-100 summary.

    Returns:
        A ``render.DataGrid`` spanning the full width of the card.
    """
    # Check if user clicked on a specific n-gram
    clicked = click_data()
    has_click = clicked and isinstance(clicked, dict) and COL_NGRAM_ID in clicked

    # Read the filtered statistics up front so the viewer depends on both the
    # search term and the n-gram length selection on every path.
    stats_filtered = get_search_filtered_stats()

    # A reset-button press newer than the last one handled: show the summary
    # and remember the press count so it is only handled once.
    if input.reset_button.get() > reset_click_count.get():
        reset_click_count.set(input.reset_button.get())
        return render.DataGrid(get_top_n_stats(n=100), width="100%")

    if has_click:
        # Show individual posts for clicked n-gram
        return render.DataGrid(get_filtered_full_data(), width="100%")

    # No click: show n-gram statistics (top 100). The length filter is applied
    # even without a search term, so the table agrees with the scatter plot.
    if stats_filtered.is_empty():
        # Return empty with proper column structure
        return render.DataGrid(get_top_n_stats().head(0), width="100%")

    # Format and return top 100 filtered stats
    data_for_display = get_top_n_stats(df=stats_filtered, n=100)
    return render.DataGrid(data_for_display, width="100%")

0 commit comments

Comments
 (0)