Commits (63)
973a433
First attempt at genericizing data source
jcheng5 Apr 4, 2025
8de0ac7
Unify prompts by adding chevron Python dependency
jcheng5 Apr 4, 2025
53c7df3
Make prompt aware of what engine is being used
jcheng5 Apr 18, 2025
a2122f2
Replace SQLite support with SQLAlchemy support
jcheng5 Apr 18, 2025
a218fb9
Don't fail when given table name's case differs from SQLAlchemy Inspe…
jcheng5 Apr 23, 2025
dc0814e
Forgot import
jcheng5 May 1, 2025
9d95d1d
Have server() return proper class with typed methods, instead of dict
jcheng5 Jun 2, 2025
aeb87dd
Auto-create sqlite database for example
jcheng5 Jun 2, 2025
c38b567
Have init() take data frame or sqlalchemy engine directly
jcheng5 Jun 2, 2025
e7972e8
Merge remote-tracking branch 'origin/main' into generic-datasource-im…
jcheng5 Jun 3, 2025
57922b3
Use GPT-4.1 by default, not GPT-4, yuck
jcheng5 Jun 3, 2025
84d30ad
Merge remote-tracking branch 'origin/generic-datasource' into generic…
jcheng5 Jun 3, 2025
a08764b
Update README
jcheng5 Jun 3, 2025
374bdfb
this should significantly speed up schema generation
npelikan Jun 6, 2025
e294b1b
another speedup
npelikan Jun 6, 2025
b179ea6
ruff formatting
npelikan Jun 6, 2025
2cbe199
updating so formatting checks pass
npelikan Jun 6, 2025
8f59aa7
adding a generic r datasource
npelikan Jun 7, 2025
2ececf5
critical change: should return a lazy table rather than executing by …
npelikan Jun 7, 2025
f4ca445
edits to test suite and devtools::check() passing
npelikan Jun 7, 2025
c9b03da
Merge pull request #1 from posit-dev/main
npelikan Jun 7, 2025
48503f0
example update
npelikan Jun 7, 2025
4809615
error message for a footgun
npelikan Jun 9, 2025
a1ae3b6
Merge branch 'main' into r-generic-datasource
npelikan Jun 12, 2025
24ef182
Merge pull request #4 from npelikan/r-generic-datasource
npelikan Jun 12, 2025
3b289c7
update to use s3 classes to simplify the code
npelikan Jun 19, 2025
7052d6e
Merge pull request #5 from npelikan/r-generic-datasource
npelikan Jun 19, 2025
146777a
README update
npelikan Jun 19, 2025
9911965
added injection of SQL dialect into prompt. Also cleaned up test naming
npelikan Jun 19, 2025
8d05d7f
more simplification
npelikan Jun 19, 2025
b18b570
Merge branch 'main' into main
npelikan Jun 25, 2025
41c9e1e
merge fix
npelikan Jun 25, 2025
e347110
small dep edit
npelikan Jun 26, 2025
753c5af
Code review
jcheng5 Jun 26, 2025
1ee065b
more tests, and code review edits
npelikan Jun 26, 2025
5492b0f
testing changes
npelikan Jun 27, 2025
1ff4fe5
more test passing
npelikan Jun 27, 2025
eb9104c
cleaning up gitignores
npelikan Jun 27, 2025
09231fa
updating python datasource to prevent collisions
npelikan Jun 27, 2025
9e53ca3
Merge remote-tracking branch 'posit-dev/main'
npelikan Jul 1, 2025
150e550
fix for github actions
npelikan Jul 1, 2025
c589444
adding tests to python github action (as we have some tests now!)
npelikan Jul 1, 2025
98b2f29
edits for gha
npelikan Jul 1, 2025
3fd17e4
makefile edit
npelikan Jul 1, 2025
e6731be
air format
npelikan Jul 8, 2025
d45820f
code cleanup, better tests, and dropping `glue` dependency
npelikan Jul 9, 2025
3f55974
Fix error in qc.df() when no query is active
jcheng5 Jul 16, 2025
395e116
Adding dplyr::sql() identifier to get_lazy_query() to fix failing tests.
npelikan Jul 17, 2025
d86888d
adding more tests to cover the empty execute_data query use case and …
npelikan Jul 17, 2025
765250e
description edit to pass routine test
npelikan Jul 17, 2025
6432fa1
edit to remove `tbl` output per discussion on #28
npelikan Jul 28, 2025
de0a31e
better data source nested identifier handling
npelikan Jul 29, 2025
b6eeb4a
fixing a missing quote identifier
npelikan Jul 29, 2025
32a65fc
doc cleanup
npelikan Jul 29, 2025
1325ed1
a bit more helpful error message
npelikan Jul 29, 2025
0d01d82
even more helpful erroring
npelikan Jul 29, 2025
8503f66
dplyr lazy query!
npelikan Jul 30, 2025
7bd3208
adding query chaining example
npelikan Aug 1, 2025
4cb93bf
more examples!
npelikan Aug 1, 2025
982c58f
Merge remote-tracking branch 'posit-dev/main'
npelikan Aug 1, 2025
a836857
Merge branch 'main' into r-lazy-queries
npelikan Aug 1, 2025
c9ab8bf
formatting updates
npelikan Aug 1, 2025
5b4bdf4
Merge remote-tracking branch 'posit-dev/main' into r-lazy-queries
npelikan Aug 27, 2025
2 changes: 2 additions & 0 deletions pkg-r/DESCRIPTION
@@ -17,6 +17,8 @@ Depends:
Imports:
bslib,
DBI,
dbplyr,
dplyr,
duckdb,
ellmer (>= 0.3.0),
htmltools,
2 changes: 2 additions & 0 deletions pkg-r/NAMESPACE
@@ -5,6 +5,7 @@ S3method(create_system_prompt,querychat_data_source)
S3method(execute_query,dbi_source)
S3method(get_db_type,data_frame_source)
S3method(get_db_type,dbi_source)
S3method(get_lazy_data,dbi_source)
S3method(get_schema,dbi_source)
S3method(querychat_data_source,DBIConnection)
S3method(querychat_data_source,data.frame)
@@ -13,6 +14,7 @@ export(cleanup_source)
export(create_system_prompt)
export(execute_query)
export(get_db_type)
export(get_lazy_data)
export(get_schema)
export(querychat_app)
export(querychat_data_source)
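For orientation, the exports above define the generic surface a data-source backend implements. A rough, hypothetical sketch of what a method for some other connection class could look like (the class and field names here are invented for illustration and are not part of this PR):

# Hypothetical backend: any object carrying a DBI-compatible connection can
# participate by providing methods for the exported generics, e.g.:
get_lazy_data.my_backend_source <- function(source, query = NULL, ...) {
  if (is.null(query) || query == "") {
    # No query: expose the whole table lazily
    dplyr::tbl(source$conn, source$table_name)
  } else {
    # Otherwise wrap the SQL so dbplyr treats it as a subquery
    dplyr::tbl(source$conn, dbplyr::sql(query))
  }
}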
327 changes: 39 additions & 288 deletions pkg-r/R/data_source.R
@@ -120,108 +120,64 @@ execute_query.dbi_source <- function(source, query, ...) {
DBI::dbGetQuery(source$conn, query)
}

#' Test a SQL query on a data source.
#'
#' @param source A querychat_data_source object
#' @param query SQL query string
#' @param ... Additional arguments passed to methods
#' @return Result of the query, limited to one row of data.
#' @export
test_query <- function(source, query, ...) {
UseMethod("test_query")
}

#' @export
test_query.dbi_source <- function(source, query, ...) {
rs <- DBI::dbSendQuery(source$conn, query)
df <- DBI::dbFetch(rs, n = 1)
DBI::dbClearResult(rs)
df
}


#' Get type information for a data source
#' Get a lazy representation of a data source
#'
#' @param source A querychat_data_source object
#' @param query SQL query string
#' @param ... Additional arguments passed to methods
#' @return A character string containing the type information
#' @return A lazy representation (typically a dbplyr tbl)
#' @export
get_db_type <- function(source, ...) {
UseMethod("get_db_type")
get_lazy_data <- function(source, query, ...) {
UseMethod("get_lazy_data")
}

#' @export
get_db_type.data_frame_source <- function(source, ...) {
# Local dataframes are always duckdb!
return("DuckDB")
}

#' @export
get_db_type.dbi_source <- function(source, ...) {
conn <- source$conn
conn_info <- DBI::dbGetInfo(conn)
# default to 'POSIX' if dbms name not found
dbms_name <- purrr::pluck(conn_info, "dbms.name", .default = "POSIX")
# Special handling for known database types
if (inherits(conn, "SQLiteConnection")) {
return("SQLite")
get_lazy_data.dbi_source <- function(
source,
query = NULL,
...
) {
if (is.null(query) || query == "") {
# For a null or empty query, default to returning the whole table (ie SELECT *)
dplyr::tbl(source$conn, source$table_name)
} else {
# Clean the SQL query to avoid dbplyr issues with syntax problems
cleaned_query <- clean_sql(query, enforce_select = TRUE)

if (is.null(cleaned_query)) {
# If cleaning results in an empty query, raise an error
rlang::abort(c(
"Query cleaning resulted in an empty query.",
"i" = "Check the original query for proper syntax.",
"i" = "Query may consist only of comments or invalid SQL."
))
} else {
# Use dbplyr::sql to create a safe SQL query object with the cleaned query
# No fallback to full table on error - let errors propagate to the caller
dplyr::tbl(source$conn, dbplyr::sql(cleaned_query))
}
}
# remove ' SQL', if exists (SQL is already in the prompt)
return(gsub(" SQL", "", dbms_name))
}
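As a quick illustration of the lazy-query path added above, chained with further dplyr verbs (a sketch only: the `source` object, table, and column names are assumptions for the example, not taken from this diff):

# Assume `source` is an existing querychat dbi_source wrapping a DuckDB
# connection to a `penguins` table with `species` and `bill_length_mm` columns.
lazy <- get_lazy_data(source, "SELECT * FROM penguins WHERE bill_length_mm > 40")

# The result is a dbplyr tbl, so subsequent verbs stay lazy and are translated
# to SQL; nothing is pulled into R until collect() is called.
lazy |>
  dplyr::group_by(species) |>
  dplyr::summarise(mean_bill = mean(bill_length_mm, na.rm = TRUE)) |>
  dplyr::collect()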


#' Create a system prompt for the data source
#' Test a SQL query on a data source.
#'
#' @param source A querychat_data_source object
#' @param data_description Optional description of the data
#' @param extra_instructions Optional additional instructions
#' @param query SQL query string
#' @param ... Additional arguments passed to methods
#' @return A string with the system prompt
#' @return Result of the query, limited to one row of data.
#' @export
create_system_prompt <- function(
source,
data_description = NULL,
extra_instructions = NULL,
...
) {
UseMethod("create_system_prompt")
test_query <- function(source, query, ...) {
UseMethod("test_query")
}

#' @export
create_system_prompt.querychat_data_source <- function(
source,
data_description = NULL,
extra_instructions = NULL,
...
) {
if (!is.null(data_description)) {
data_description <- paste(data_description, collapse = "\n")
}
if (!is.null(extra_instructions)) {
extra_instructions <- paste(extra_instructions, collapse = "\n")
}

# Read the prompt file
prompt_path <- system.file("prompt", "prompt.md", package = "querychat")
prompt_content <- readLines(prompt_path, warn = FALSE)
prompt_text <- paste(prompt_content, collapse = "\n")

# Get schema for the data source
schema <- get_schema(source)

# Examine the data source and get the type for the prompt
db_type <- get_db_type(source)

whisker::whisker.render(
prompt_text,
list(
schema = schema,
data_description = data_description,
extra_instructions = extra_instructions,
db_type = db_type
)
)
test_query.dbi_source <- function(source, query, ...) {
rs <- DBI::dbSendQuery(source$conn, query)
df <- DBI::dbFetch(rs, n = 1)
DBI::dbClearResult(rs)
df
}
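And a small, self-contained sketch of exercising the relocated test_query() method (assuming the package is loaded; the bare structure() call below is only a stand-in for however the package normally constructs a dbi_source):

# In-memory DuckDB table used purely for the example.
con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbWriteTable(con, "mtcars", mtcars)

# test_query.dbi_source only reads source$conn, so a minimal stand-in object
# with the right class is enough for S3 dispatch here.
src <- structure(list(conn = con, table_name = "mtcars"), class = "dbi_source")

# Fetches at most one row, which cheaply confirms the SQL parses and runs.
test_query(src, "SELECT cyl, COUNT(*) AS n FROM mtcars GROUP BY cyl")

DBI::dbDisconnect(con, shutdown = TRUE)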

#' Clean up a data source (close connections, etc.)
Expand All @@ -241,208 +197,3 @@ cleanup_source.dbi_source <- function(source, ...) {
}
invisible(NULL)
}


#' Get schema for a data source
#'
#' @param source A querychat_data_source object
#' @param ... Additional arguments passed to methods
#' @return A character string describing the schema
#' @export
get_schema <- function(source, ...) {
UseMethod("get_schema")
}

#' @export
get_schema.dbi_source <- function(source, ...) {
conn <- source$conn
table_name <- source$table_name
categorical_threshold <- source$categorical_threshold

# Get column information
columns <- DBI::dbListFields(conn, table_name)

schema_lines <- c(
paste("Table:", DBI::dbQuoteIdentifier(conn, table_name)),
"Columns:"
)

# Build single query to get column statistics
select_parts <- character(0)
numeric_columns <- character(0)
text_columns <- character(0)

# Get sample of data to determine types
sample_query <- paste0(
"SELECT * FROM ",
DBI::dbQuoteIdentifier(conn, table_name),
" LIMIT 1"
)
sample_data <- DBI::dbGetQuery(conn, sample_query)

for (col in columns) {
col_class <- class(sample_data[[col]])[1]

if (
col_class %in%
c("integer", "numeric", "double", "Date", "POSIXct", "POSIXt")
) {
numeric_columns <- c(numeric_columns, col)
select_parts <- c(
select_parts,
paste0(
"MIN(",
DBI::dbQuoteIdentifier(conn, col),
") as ",
DBI::dbQuoteIdentifier(conn, paste0(col, '__min'))
),
paste0(
"MAX(",
DBI::dbQuoteIdentifier(conn, col),
") as ",
DBI::dbQuoteIdentifier(conn, paste0(col, '__max'))
)
)
} else if (col_class %in% c("character", "factor")) {
text_columns <- c(text_columns, col)
select_parts <- c(
select_parts,
paste0(
"COUNT(DISTINCT ",
DBI::dbQuoteIdentifier(conn, col),
") as ",
DBI::dbQuoteIdentifier(conn, paste0(col, '__distinct_count'))
)
)
}
}

# Execute statistics query
column_stats <- list()
if (length(select_parts) > 0) {
tryCatch(
{
stats_query <- paste0(
"SELECT ",
paste0(select_parts, collapse = ", "),
" FROM ",
DBI::dbQuoteIdentifier(conn, table_name)
)
result <- DBI::dbGetQuery(conn, stats_query)
if (nrow(result) > 0) {
column_stats <- as.list(result[1, ])
}
},
error = function(e) {
# Fall back to no statistics if query fails
}
)
}

# Get categorical values for text columns below threshold
categorical_values <- list()
text_cols_to_query <- character(0)

for (col_name in text_columns) {
distinct_count_key <- paste0(col_name, "__distinct_count")
if (
distinct_count_key %in%
names(column_stats) &&
!is.na(column_stats[[distinct_count_key]]) &&
column_stats[[distinct_count_key]] <= categorical_threshold
) {
text_cols_to_query <- c(text_cols_to_query, col_name)
}
}

# Remove duplicates
text_cols_to_query <- unique(text_cols_to_query)

# Get categorical values
if (length(text_cols_to_query) > 0) {
for (col_name in text_cols_to_query) {
tryCatch(
{
cat_query <- paste0(
"SELECT DISTINCT ",
DBI::dbQuoteIdentifier(conn, col_name),
" FROM ",
DBI::dbQuoteIdentifier(conn, table_name),
" WHERE ",
DBI::dbQuoteIdentifier(conn, col_name),
" IS NOT NULL ORDER BY ",
DBI::dbQuoteIdentifier(conn, col_name)
)
result <- DBI::dbGetQuery(conn, cat_query)
if (nrow(result) > 0) {
categorical_values[[col_name]] <- result[[1]]
}
},
error = function(e) {
# Skip categorical values if query fails
}
)
}
}

# Build schema description
for (col in columns) {
col_class <- class(sample_data[[col]])[1]
sql_type <- r_class_to_sql_type(col_class)

column_info <- paste0("- ", col, " (", sql_type, ")")

# Add range info for numeric columns
if (col %in% numeric_columns) {
min_key <- paste0(col, "__min")
max_key <- paste0(col, "__max")
if (
min_key %in%
names(column_stats) &&
max_key %in% names(column_stats) &&
!is.na(column_stats[[min_key]]) &&
!is.na(column_stats[[max_key]])
) {
range_info <- paste0(
" Range: ",
column_stats[[min_key]],
" to ",
column_stats[[max_key]]
)
column_info <- paste(column_info, range_info, sep = "\n")
}
}

# Add categorical values for text columns
if (col %in% names(categorical_values)) {
values <- categorical_values[[col]]
if (length(values) > 0) {
values_str <- paste0("'", values, "'", collapse = ", ")
cat_info <- paste0(" Categorical values: ", values_str)
column_info <- paste(column_info, cat_info, sep = "\n")
}
}

schema_lines <- c(schema_lines, column_info)
}

paste(schema_lines, collapse = "\n")
}


# Helper function to map R classes to SQL types
r_class_to_sql_type <- function(r_class) {
switch(
r_class,
"integer" = "INTEGER",
"numeric" = "FLOAT",
"double" = "FLOAT",
"logical" = "BOOLEAN",
"Date" = "DATE",
"POSIXct" = "TIMESTAMP",
"POSIXt" = "TIMESTAMP",
"character" = "TEXT",
"factor" = "TEXT",
"TEXT" # default
)
}