@@ -112,26 +112,34 @@ def _get_app_layout(ngram_choices_dict: dict):
112112 app_layout = [
113113 ui .card (
114114 ui .card_header ("N-gram statistics" ),
115- ui .input_selectize (
116- id = "ngram_selector" ,
117- label = "Included n-grams:" ,
118- choices = ngram_choices_dict ,
119- selected = list (ngram_choices_dict .keys ()),
120- multiple = True ,
115+ ui .layout_columns (
116+ ui .input_text (
117+ id = "ngram_content_search" ,
118+ label = "Search N-gram Content:" ,
119+ placeholder = "Enter keywords to search" ,
120+ ),
121+ ui .input_selectize (
122+ id = "ngram_length_selector" ,
123+ label = "Included n-grams:" ,
124+ choices = ngram_choices_dict ,
125+ selected = list (ngram_choices_dict .keys ()),
126+ multiple = True ,
127+ ),
128+ ui .div (
129+ ui .input_action_button (
130+ id = "reset_button" ,
131+ label = "Clear selection" ,
132+ fill = False ,
133+ ),
134+ style = "margin-top: 25px;" , # Align with labeled inputs
135+ ),
136+ col_widths = [4 , 4 , 4 ], # Equal width columns
121137 ),
122138 output_widget ("scatter_plot" , height = "400px" ),
123139 ),
124140 ui .card (
125141 ui .card_header ("Data viewer" ),
126142 ui .output_text (id = "data_info" ),
127- ui .div (
128- ui .input_action_button (
129- id = "reset_button" ,
130- label = "Reset selection (show summary)" ,
131- fill = False ,
132- ),
133- style = "display: inline-block; margin-bottom: 10px;" ,
134- ),
135143 ui .output_data_frame ("data_viewer" ),
136144 ),
137145 ]
@@ -140,11 +148,66 @@ def _get_app_layout(ngram_choices_dict: dict):
140148
141149
142150def server (input , output , sessions ):
151+
def _format_data_for_display(df: pl.DataFrame) -> pl.DataFrame:
    """Select and rename the post-level columns shown in the data viewer.

    The message timestamp is converted to text with the pattern
    "%B %d, %Y %I:%M %p". Empty and non-empty inputs go through the same
    pipeline, so the result has the same column names and dtypes in both
    cases.

    Args:
        df: Frame containing at least the author ID, n-gram words, message
            text and message timestamp columns. The timestamp column must be
            a temporal dtype, since it is formatted through the ``.dt``
            namespace.

    Returns:
        A frame holding only the four display columns, renamed to their
        user-facing labels.
    """
    # Insertion order of this mapping is also the column order of the result.
    display_names = {
        COL_AUTHOR_ID: "Unique user ID",
        COL_NGRAM_WORDS: "N-gram content",
        COL_MESSAGE_TEXT: "Post content",
        COL_MESSAGE_TIMESTAMP: "Timestamp",
    }

    # No special case for empty input: formatting a zero-row column is a
    # no-op that still yields a string column, which keeps the schema stable.
    return (
        df.with_columns(
            pl.col(COL_MESSAGE_TIMESTAMP).dt.strftime("%B %d, %Y %I:%M %p")
        )
        .select(list(display_names))
        .rename(display_names)
    )
173+
@reactive.calc
def get_search_filtered_stats():
    """Return n-gram statistics filtered by search term and n-gram length.

    Click filtering is deliberately NOT applied here, so that clicking a
    point does not re-render the scatter plot; it is applied downstream in
    get_filtered_full_data().

    The search text is matched literally and as whole word(s): regex
    metacharacters typed by the user are escaped before the pattern is built,
    so input such as "(" or "a.b" neither raises a regex error nor acts as a
    wildcard.

    Returns:
        The filtered statistics frame. It has zero rows when no n-gram length
        is selected or nothing matches.
    """
    import re  # local import: only needed to escape the user's search text

    # Read both inputs up front so this calc depends on each of them.
    search_term = (input.ngram_content_search() or "").strip()
    # Tuple of the selected choice keys, e.g. ('3', '5'); empty when the
    # user deselected everything.
    selected_lengths = input.ngram_length_selector()

    # Nothing selected means nothing to show; skip the regex scan entirely.
    if not selected_lengths:
        return data_stats.head(0)

    stats = data_stats

    if search_term:
        # Backslash-escape only the core regex metacharacters. Spaces and
        # other ordinary characters are left alone so multi-word searches
        # stay valid for the regex engine used by polars.
        escaped_term = re.sub(r"([\\.+*?()|\[\]{}^$])", r"\\\1", search_term)
        stats = stats.filter(
            pl.col(COL_NGRAM_WORDS).str.contains(pattern=rf"\b{escaped_term}\b"),
        )

    return stats.filter(pl.col(COL_NGRAM_LENGTH).is_in(selected_lengths))
148211
149212 # Store the figure widget reference
150213 current_figure_widget = reactive .value (None )
@@ -165,12 +228,22 @@ def handle_reset_button():
165228 except Exception :
166229 pass
167230
168- @reactive .calc
169- def get_top_n_data ():
170- global data_stats
231+ def get_top_n_stats (df : pl .DataFrame = None , n : int = 100 ) -> pl .DataFrame :
232+ """Format and return top N n-gram statistics.
233+
234+ Args:
235+ df: Optional dataframe to format. If None, uses global data_stats.
236+ n: Number of top n-grams to return (default 100)
237+
238+ Returns:
239+ Formatted dataframe with sorted and renamed columns
240+ """
241+ if df is None :
242+ global data_stats
243+ df = data_stats
171244
172245 data_top_n = (
173- data_stats .select (
246+ df .select (
174247 [
175248 COL_NGRAM_WORDS ,
176249 COL_NGRAM_TOTAL_REPS ,
@@ -181,7 +254,8 @@ def get_top_n_data():
181254 .sort (
182255 pl .col (COL_NGRAM_TOTAL_REPS ),
183256 pl .col (COL_NGRAM_DISTINCT_POSTER_COUNT ),
184- descending = [True , True ],
257+ pl .col (COL_NGRAM_LENGTH ),
258+ descending = [True , True , False ],
185259 )
186260 .rename (
187261 {
@@ -191,39 +265,12 @@ def get_top_n_data():
191265 COL_NGRAM_LENGTH : "N-gram length" ,
192266 }
193267 )
194- .head (100 )
268+ .head (n )
195269 )
196270 return data_top_n
197271
198272 click_data = reactive .value (None )
199273
200- @reactive .calc
201- def get_filtered_data ():
202- clicked = click_data ()
203-
204- SEL_COLUMNS = [
205- COL_AUTHOR_ID ,
206- COL_NGRAM_WORDS ,
207- COL_MESSAGE_TEXT ,
208- COL_MESSAGE_TIMESTAMP ,
209- ]
210- COL_RENAME = ["Unique user ID" , "N-gram content" , "Post content" , "Timestamp" ]
211-
212- old2new = {k : v for k , v in zip (SEL_COLUMNS , COL_RENAME )}
213-
214- if clicked and isinstance (clicked , dict ) and COL_NGRAM_ID in clicked :
215- ngram_id = clicked [COL_NGRAM_ID ]
216- filtered = data_full .filter (pl .col (COL_NGRAM_ID ) == ngram_id )
217-
218- filtered = filtered .with_columns (
219- pl .col (COL_MESSAGE_TIMESTAMP ).dt .strftime ("%B %d, %Y %I:%M %p" )
220- )
221- filtered = filtered .select (SEL_COLUMNS ).rename (old2new )
222- return filtered
223- else :
224- # Return empty dataframe with the right columns when nothing is clicked
225- return data_full .select (SEL_COLUMNS ).head (0 )
226-
227274 def on_point_click (trace , points , state ):
228275 if not hasattr (points , "point_inds" ) or not points .point_inds :
229276 return
@@ -261,7 +308,9 @@ def on_point_click(trace, points, state):
261308
262309 @render_widget
263310 def scatter_plot ():
264- df = get_data ()
311+
312+ df = get_search_filtered_stats ()
313+
265314 fig = plot_scatter (data = df )
266315
267316 # Create FigureWidget directly from the figure
@@ -276,23 +325,127 @@ def scatter_plot():
276325
277326 return fig_widget
278327
@reactive.calc
def get_filtered_full_data():
    """Return the individual posts that match every active filter.

    Search term and n-gram length are inherited from
    get_search_filtered_stats(); the click selection, if any, is applied
    here. The surviving n-gram IDs and lengths are then used to pick rows
    from the post-level frame, which is formatted for display.
    """
    matching_stats = get_search_filtered_stats()

    # Narrow to the clicked n-gram when the stored click carries an ID.
    selection = click_data()
    if isinstance(selection, dict) and COL_NGRAM_ID in selection:
        matching_stats = matching_stats.filter(
            pl.col(COL_NGRAM_ID) == selection[COL_NGRAM_ID]
        )

    # Nothing survived the filters: hand back an empty, formatted frame.
    if matching_stats.is_empty():
        return _format_data_for_display(data_full.head(0))

    wanted_ids = matching_stats.get_column(COL_NGRAM_ID).unique()
    # Lengths are cast to Int8 before being compared with the post-level data.
    wanted_lengths = (
        matching_stats.get_column(COL_NGRAM_LENGTH).unique().cast(pl.Int8)
    )

    posts = data_full.filter(
        pl.col(COL_NGRAM_ID).is_in(wanted_ids),
        pl.col(COL_NGRAM_LENGTH).is_in(wanted_lengths),
    )
    return _format_data_for_display(posts)
365+
@render.text
def data_info() -> str:
    """Describe what the data viewer is currently showing.

    Three cases, checked in order:
    1. A point was clicked: name the n-gram and its number of repetitions.
    2. A search term is set: summarise the search results.
    3. Otherwise: show the default summary hint.

    The post-level data is only computed in the branches that use it, so the
    default view and the "no results" view do not filter the full data.
    """
    clicked = click_data()
    has_click = isinstance(clicked, dict) and COL_NGRAM_ID in clicked

    content_search = (input.ngram_content_search() or "").strip()

    if has_click:
        posts = get_filtered_full_data()
        if posts.is_empty():
            return "Selected n-gram not found in current filters. Try adjusting your search or n-gram length selection."
        ngram_string = posts["N-gram content"][0]
        return f"N-gram: '{ngram_string}' — {len(posts):,} total repetitions"

    if content_search:
        search_stats = get_search_filtered_stats()
        if search_stats.is_empty():
            return f"No results found for '{content_search}'. Try adjusting your search or n-gram length selection."

        total_ngrams = len(search_stats)
        posts = get_filtered_full_data()
        if posts.is_empty():
            return f"Search results for '{content_search}': {total_ngrams:,} unique n-grams"
        return f"Search results for '{content_search}': {total_ngrams:,} unique n-grams (showing top 100) with {len(posts):,} total occurrences"

    # Default: no click and no search term.
    return "Showing summary (top 100 n-grams by frequency). Click a data point on the scatter plot to view all occurrences."
406+
407+ reset_click_count = reactive .value (0 )
289408
@render.data_frame
def data_viewer():
    """Render the table that matches the current user interaction.

    Scenarios:
    1. Reset button pressed since the last render: top 100 n-grams overall.
    2. A point was clicked: every individual post for that n-gram.
    3. Otherwise: top 100 n-grams after the search-term and n-gram-length
       filters. With no search term and every length selected this is the
       plain top 100.
    """
    # Read the filtered stats first so the table depends on the search and
    # length inputs whichever branch is taken below.
    stats_filtered = get_search_filtered_stats()

    clicked = click_data()
    has_click = isinstance(clicked, dict) and COL_NGRAM_ID in clicked

    reset_clicks = input.reset_button.get()
    if reset_clicks > reset_click_count.get():
        # NOTE(review): this writes a reactive value that the same render
        # also reads, which presumably schedules an immediate re-render --
        # confirm this is intended, or move the bookkeeping into the reset
        # button's handler.
        reset_click_count.set(reset_clicks)
        return render.DataGrid(get_top_n_stats(n=100), width="100%")

    if has_click:
        return render.DataGrid(get_filtered_full_data(), width="100%")

    # No click: summarise whatever survives the search and length filters.
    # An empty frame passes through get_top_n_stats unchanged apart from the
    # column selection and renaming, so no separate empty case is needed.
    summary = get_top_n_stats(df=stats_filtered, n=100)
    return render.DataGrid(summary, width="100%")
0 commit comments