Skip to content

Commit 93f4671

Browse files
committed
adding multi grepl
1 parent ffc364c commit 93f4671

File tree

7 files changed

+534
-2
lines changed

7 files changed

+534
-2
lines changed

R/graph_functions.R

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,4 +438,111 @@ add_component_column <- function(dt, from_col = "from", to_col = "to",
438438
edge_components <- function(dt, from_col, to_col, compress = TRUE) {
  # Select the two endpoint columns by name (the `with = FALSE` form treats
  # the character vector as column names) and pass them on as a matrix.
  endpoint_cols <- c(from_col, to_col)
  endpoints <- as.matrix(dt[, endpoint_cols, with = FALSE])
  group_edges(endpoints, compress = compress)
}
442+
443+
#' Multi-Pattern String Matching
#'
#' Tests every element of `strings` against a set of fixed (non-regex)
#' patterns using the package's compiled matchers. Intended as a replacement
#' for calling `grepl(pattern, x, fixed = TRUE)` once per pattern.
#'
#' @param strings Character vector of strings to search in. `NA` elements
#'   never match, mirroring `grepl()`.
#' @param patterns Character vector of fixed patterns to search for. Must not
#'   contain `NA`.
#' @param match_any Logical. If TRUE (default), returns TRUE if ANY pattern
#'   matches. If FALSE, returns detailed results for each pattern.
#' @param ignore_case Logical. Whether to ignore case when matching.
#'   Default FALSE.
#' @param return_matrix Logical. If TRUE and match_any=FALSE, returns a
#'   matrix. If FALSE and match_any=FALSE, returns a data.frame.
#'   Default FALSE.
#'
#' @return If match_any=TRUE: logical vector the same length as `strings`.
#'   If match_any=FALSE: a logical matrix (`return_matrix = TRUE`) or
#'   data.frame (`return_matrix = FALSE`) with one row per string and one
#'   column per pattern. With zero patterns the result has zero columns
#'   (or is all FALSE when match_any=TRUE).
#'
#' @examples
#' strings <- c("hello world", "goodbye", "hello there", "world peace")
#' patterns <- c("hello", "world")
#'
#' # Check if any pattern matches each string
#' multi_grepl(strings, patterns)
#'
#' # Get detailed results
#' multi_grepl(strings, patterns, match_any = FALSE)
#'
#' # Case insensitive matching
#' multi_grepl(c("Hello", "WORLD"), c("hello", "world"), ignore_case = TRUE)
#'
#' @export
multi_grepl <- function(strings, patterns, match_any = TRUE,
                        ignore_case = FALSE, return_matrix = FALSE) {

  # Input validation
  if (!is.character(strings)) {
    stop("strings must be a character vector")
  }

  if (!is.character(patterns)) {
    stop("patterns must be a character vector")
  }

  # A missing pattern would otherwise be converted to the literal text "NA"
  # by the compiled matcher and silently searched for.
  if (anyNA(patterns)) {
    stop("patterns must not contain NA")
  }

  n_strings <- length(strings)

  # No patterns: nothing can match. Honour return_matrix here as well so the
  # result type is the same as in the non-empty case.
  if (length(patterns) == 0) {
    if (match_any) {
      return(rep(FALSE, n_strings))
    }
    empty <- matrix(FALSE, nrow = n_strings, ncol = 0)
    if (return_matrix) {
      return(empty)
    }
    return(as.data.frame(empty))
  }

  # NA strings are forced to FALSE after matching, as grepl() does, so that
  # a missing value can never be reported as containing e.g. "NA" or "N".
  missing_string <- is.na(strings)

  if (match_any) {
    # Use optimized single-vector version
    hits <- multi_grepl_any_cpp(strings, patterns, ignore_case)
    hits[missing_string] <- FALSE
    return(hits)
  }

  # Use matrix version
  result_matrix <- multi_grepl_cpp(
    strings, patterns,
    match_any = FALSE, ignore_case = ignore_case
  )
  result_matrix[missing_string, ] <- FALSE

  if (return_matrix) {
    # Add row and column names
    rownames(result_matrix) <- paste0("string_", seq_len(nrow(result_matrix)))
    colnames(result_matrix) <- patterns
    return(result_matrix)
  }

  # Convert to data.frame with one column per pattern
  result_df <- as.data.frame(result_matrix)
  colnames(result_df) <- patterns
  rownames(result_df) <- NULL
  result_df
}
513+
514+
#' Fast String Filtering
#'
#' Keeps the elements of `strings` that contain at least one of `patterns`
#' (or, with `invert = TRUE`, the elements that contain none of them).
#' The match test is delegated to `multi_grepl(..., match_any = TRUE)`, so
#' the result corresponds to
#' `strings[grepl(p1, strings, fixed=TRUE) | grepl(p2, strings, fixed=TRUE) | ...]`.
#'
#' @param strings Character vector of strings to filter
#' @param patterns Character vector of patterns to search for
#' @param ignore_case Logical. Whether to ignore case. Default FALSE.
#' @param invert Logical. If TRUE, return strings that do NOT match any
#'   pattern. Default FALSE.
#'
#' @return Character vector holding the matching elements of `strings`
#'   (the non-matching ones if invert=TRUE)
#'
#' @examples
#' strings <- c("apple pie", "banana bread", "cherry tart", "date cake")
#' patterns <- c("apple", "cherry")
#'
#' # Get strings containing any pattern
#' filter_strings(strings, patterns)
#' # Returns: "apple pie" "cherry tart"
#'
#' # Get strings NOT containing any pattern
#' filter_strings(strings, patterns, invert = TRUE)
#' # Returns: "banana bread" "date cake"
#'
#' @export
filter_strings <- function(strings, patterns, ignore_case = FALSE,
                           invert = FALSE) {
  hit <- multi_grepl(strings, patterns, match_any = TRUE,
                     ignore_case = ignore_case)

  # Flip the mask when the caller asked for the non-matching strings.
  keep <- if (invert) !hit else hit
  strings[keep]
}

build_string_functions.R

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/usr/bin/env Rscript
# Quick build script: reinstall graphfast from the working directory so the
# new C++ string functions are compiled, then smoke-test them.

cat("Building GraphFast package with new string functions...\n")

# Remove old package if installed. A failure (typically "not installed") is
# reported rather than silently discarded, and the build continues.
tryCatch(
  remove.packages("graphfast"),
  error = function(e) {
    message("Skipping package removal: ", conditionMessage(e))
  }
)

# Clean: drop the stale compiled library so it is rebuilt. The extension is
# taken from the platform (".dll" on Windows, ".so" elsewhere).
stale_lib <- file.path("src", paste0("graphfast", .Platform$dynlib.ext))
if (file.exists(stale_lib)) {
  file.remove(stale_lib)
}

# Build and install
devtools::document()
devtools::install(upgrade = "never")

cat("Package rebuilt. Testing string functions...\n")

# Quick test: "hello" contains the pattern, "world" does not. Stop if the
# freshly built function disagrees instead of reporting success regardless.
library(graphfast)
test_result <- multi_grepl(c("hello", "world"), c("hello"))
if (!identical(test_result, c(TRUE, FALSE))) {
  stop(
    "multi_grepl smoke test failed; got: ",
    paste(test_result, collapse = " ")
  )
}
cat("Test successful:", test_result, "\n")
cat("Ready to run string_benchmark.R\n")

compare_with_igraph.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ cat("=== Data Preparation (Your Code) ===\n")
6565
# Generate massive dataset: 180M edges using your sampling approach
6666
cat("Generating 50 million edges using sample(1E8, 5E7)...\n")
6767
edges_dt <- data.table(
68-
x = sample(1E8, 5E7), # 50M edges from 100M possible node IDs
69-
y = sample(1E8, 5E7)
68+
x = sample(1E9, 180E6), # 180M edges from 1B possible node IDs
69+
y = sample(1E9, 180E6)
7070
)
7171

7272
cat("Generated", nrow(edges_dt), "edges\n")

examples/basic.R

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Load required libraries. library() stops immediately when a package is
# missing; require() would only return FALSE and let the script fail later
# with a less helpful error.
library(graphfast)
library(data.table)

# Create a large data.table with random edges: each column holds 180 million
# node ids sampled without replacement from 1..1e9.
edges_dt <- data.table(
  x = sample(1E9, 180E6),
  y = sample(1E9, 180E6)
)

# Measure performance of edge_components on large data.table.
# gcFirst = FALSE skips the garbage collection system.time() would otherwise
# run before timing.
system.time({
  edges_dt[, component := edge_components(.SD, "x", "y")]
}, gcFirst = FALSE)

src/graph_algorithms.cpp

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,4 +298,144 @@ Rcpp::List get_edge_components_cpp(const Rcpp::IntegerMatrix& edges, int n_nodes
298298
Rcpp::Named("to_components") = to_components,
299299
Rcpp::Named("n_components") = next_component_id
300300
);
301+
}
302+
303+
//' Multi-Pattern Fixed String Matching
304+
//'
305+
//' Fast C++ implementation for finding multiple fixed patterns in strings.
306+
//' Equivalent to multiple grepl(pattern, x, fixed=TRUE) calls but much faster.
307+
//'
308+
//' @param strings Character vector of strings to search in
309+
//' @param patterns Character vector of fixed patterns to search for
310+
//' @param match_any Logical. If TRUE, returns TRUE if ANY pattern matches.
311+
//' If FALSE, returns a matrix showing which pattern matches which string.
312+
//' @param ignore_case Logical. Whether to ignore case when matching. Default FALSE.
313+
//'
314+
//' @return If match_any=TRUE: Logical vector same length as strings.
315+
//' If match_any=FALSE: Logical matrix with nrow=length(strings), ncol=length(patterns).
316+
//'
317+
//' @examples
318+
//' strings <- c("hello world", "goodbye", "hello there", "world peace")
319+
//' patterns <- c("hello", "world")
320+
//'
321+
//' # Check if ANY pattern matches each string
322+
//' multi_grepl_cpp(strings, patterns, match_any = TRUE)
323+
//' # Returns: TRUE FALSE TRUE TRUE
324+
//'
325+
//' # Get detailed matrix of which patterns match which strings
326+
//' multi_grepl_cpp(strings, patterns, match_any = FALSE)
327+
//' # Returns 4x2 matrix showing hello/world matches for each string
328+
//'
329+
// [[Rcpp::export]]
330+
Rcpp::LogicalMatrix multi_grepl_cpp(const Rcpp::CharacterVector& strings,
331+
const Rcpp::CharacterVector& patterns,
332+
bool match_any = true,
333+
bool ignore_case = false) {
334+
335+
int n_strings = strings.size();
336+
int n_patterns = patterns.size();
337+
338+
// Convert patterns to std::string for easier manipulation
339+
std::vector<std::string> pattern_vec(n_patterns);
340+
for (int p = 0; p < n_patterns; p++) {
341+
pattern_vec[p] = Rcpp::as<std::string>(patterns[p]);
342+
if (ignore_case) {
343+
// Convert pattern to lowercase
344+
std::transform(pattern_vec[p].begin(), pattern_vec[p].end(),
345+
pattern_vec[p].begin(), ::tolower);
346+
}
347+
}
348+
349+
if (match_any) {
350+
// Return vector indicating if ANY pattern matches each string
351+
Rcpp::LogicalVector result(n_strings);
352+
353+
for (int i = 0; i < n_strings; i++) {
354+
std::string str = Rcpp::as<std::string>(strings[i]);
355+
if (ignore_case) {
356+
std::transform(str.begin(), str.end(), str.begin(), ::tolower);
357+
}
358+
359+
bool found_match = false;
360+
for (int p = 0; p < n_patterns && !found_match; p++) {
361+
if (str.find(pattern_vec[p]) != std::string::npos) {
362+
found_match = true;
363+
}
364+
}
365+
result[i] = found_match;
366+
}
367+
368+
// Convert to matrix format for consistent return type
369+
Rcpp::LogicalMatrix result_matrix(n_strings, 1);
370+
for (int i = 0; i < n_strings; i++) {
371+
result_matrix(i, 0) = result[i];
372+
}
373+
return result_matrix;
374+
375+
} else {
376+
// Return matrix showing which patterns match which strings
377+
Rcpp::LogicalMatrix result(n_strings, n_patterns);
378+
379+
for (int i = 0; i < n_strings; i++) {
380+
std::string str = Rcpp::as<std::string>(strings[i]);
381+
if (ignore_case) {
382+
std::transform(str.begin(), str.end(), str.begin(), ::tolower);
383+
}
384+
385+
for (int p = 0; p < n_patterns; p++) {
386+
result(i, p) = (str.find(pattern_vec[p]) != std::string::npos);
387+
}
388+
}
389+
390+
return result;
391+
}
392+
}
393+
394+
//' Multi-Pattern Fixed String Matching (Any Match)
395+
//'
396+
//' Simplified version that returns TRUE if any pattern matches each string.
397+
//' Optimized for the common use case of "does this string contain any of these patterns?"
398+
//'
399+
//' @param strings Character vector of strings to search in
400+
//' @param patterns Character vector of fixed patterns to search for
401+
//' @param ignore_case Logical. Whether to ignore case. Default FALSE.
402+
//'
403+
//' @return Logical vector same length as strings
404+
//'
405+
// [[Rcpp::export]]
406+
Rcpp::LogicalVector multi_grepl_any_cpp(const Rcpp::CharacterVector& strings,
407+
const Rcpp::CharacterVector& patterns,
408+
bool ignore_case = false) {
409+
410+
int n_strings = strings.size();
411+
int n_patterns = patterns.size();
412+
413+
// Convert patterns to std::string
414+
std::vector<std::string> pattern_vec(n_patterns);
415+
for (int p = 0; p < n_patterns; p++) {
416+
pattern_vec[p] = Rcpp::as<std::string>(patterns[p]);
417+
if (ignore_case) {
418+
std::transform(pattern_vec[p].begin(), pattern_vec[p].end(),
419+
pattern_vec[p].begin(), ::tolower);
420+
}
421+
}
422+
423+
Rcpp::LogicalVector result(n_strings);
424+
425+
for (int i = 0; i < n_strings; i++) {
426+
std::string str = Rcpp::as<std::string>(strings[i]);
427+
if (ignore_case) {
428+
std::transform(str.begin(), str.end(), str.begin(), ::tolower);
429+
}
430+
431+
bool found_match = false;
432+
for (int p = 0; p < n_patterns && !found_match; p++) {
433+
if (str.find(pattern_vec[p]) != std::string::npos) {
434+
found_match = true;
435+
}
436+
}
437+
result[i] = found_match;
438+
}
439+
440+
return result;
301441
}

string_benchmark.R

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env Rscript
# Simple Multi-Pattern String Matching Benchmark
#
# Times three ways of answering "does each string contain any of these fixed
# patterns?" on the same random data and checks that they agree.

suppressPackageStartupMessages({
  library(graphfast)
  library(stringi)
})

# Evaluates `expr` once and returns its value with the elapsed wall-clock
# time. The unit is pinned to seconds: a bare `Sys.time() - start` picks its
# unit automatically (secs, mins, ...), which would make a run longer than a
# minute print as a small number of "seconds".
time_it <- function(expr) {
  start_time <- Sys.time()
  value <- force(expr)
  elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  list(value = value, seconds = elapsed)
}

# Generate test data
cat("Generating 5M random strings...\n")

n <- 5e6 # number of strings
min_len <- 1
max_len <- 30
set.seed(123)
lengths <- sample(min_len:max_len, n, replace = TRUE)
# Generate random strings efficiently
random_strings <- stri_rand_strings(n, lengths, pattern = "[A-Za-z0-9]")

patterns <- c(
  "log", "tmp", "test", "ca", "da", "longer stringgggg",
  "far", "foo", "bar", "baz"
)
cat("Testing", length(random_strings), "strings with", length(patterns),
    "patterns\n\n")

# Benchmark 1: Multiple grepl calls
cat("1. Multiple grepl(fixed=TRUE) calls:\n")
run1 <- time_it(
  Reduce(`|`, lapply(patterns, function(p) {
    grepl(p, random_strings, fixed = TRUE)
  }))
)
result1 <- run1$value
time1 <- run1$seconds
cat("   Time:", round(time1, 3), "seconds\n")
cat("   Matches:", sum(result1), "\n\n")

# Benchmark 2: Single regex with alternation
cat("2. Single grepl with regex alternation:\n")
# \Q...\E quotes each pattern literally, so the regex stays equivalent to the
# fixed-string methods even if a pattern contains regex metacharacters.
regex_pattern <- paste0("\\Q", patterns, "\\E", collapse = "|")
run2 <- time_it(grepl(regex_pattern, random_strings, perl = TRUE))
result2 <- run2$value
time2 <- run2$seconds
cat("   Time:", round(time2, 3), "seconds\n")
cat("   Matches:", sum(result2), "\n\n")

# Benchmark 3: Our multi_grepl function
cat("3. multi_grepl() function:\n")
run3 <- time_it(multi_grepl(random_strings, patterns, match_any = TRUE))
result3 <- run3$value
time3 <- run3$seconds
cat("   Time:", round(time3, 3), "seconds\n")
cat("   Matches:", sum(result3), "\n\n")

# Results comparison
cat("Results verification:\n")
cat("   Multiple grepl == regex:", identical(result1, result2), "\n")
cat("   Multiple grepl == multi_grepl:", identical(result1, result3), "\n")
cat("   Regex == multi_grepl:", identical(result2, result3), "\n\n")

# Performance summary. Each figure is that method's time divided by the
# fastest time, so 1 marks the fastest method and larger means slower.
cat("Performance summary (time relative to fastest; 1 = fastest):\n")
fastest_time <- min(time1, time2, time3)
cat("   Multiple grepl:", round(time1 / fastest_time, 2), "x\n")
cat("   Regex:", round(time2 / fastest_time, 2), "x\n")
cat("   multi_grepl:", round(time3 / fastest_time, 2), "x\n\n")

winner <- which.min(c(time1, time2, time3))
methods <- c("Multiple grepl", "Regex alternation", "multi_grepl")
cat("Winner:", methods[winner], "\n")

0 commit comments

Comments
 (0)