Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/pg-extension-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,22 @@ jobs:
python3 scripts/build_pg_ext.py prod --deeplake-static --pg-versions "${PG_VERSION}"
echo -e "${GREEN}Done.${DEFAULT}"

# Install clang-tidy from the distro repos; on yum-based images the binary
# ships in the clang-tools-extra package. The version is printed so the CI
# log records exactly which clang-tidy analyzed the build.
- name: install clang-tidy
  shell: bash
  run: |-
    echo -e "${YELLOW}Installing clang-tidy...${DEFAULT}"
    yum install -y clang-tools-extra
    clang-tidy --version
    echo -e "${GREEN}Clang-tidy installed.${DEFAULT}"

# Run static analysis against the prod build tree (run_clang_tidy.sh takes
# the build directory as its argument — presumably to locate
# compile_commands.json; confirm against the script).
# continue-on-error: lint findings are reported but do not fail the job.
- name: run clang-tidy
  shell: bash
  continue-on-error: true
  run: |-
    echo -e "${YELLOW}Running clang-tidy static analysis...${DEFAULT}"
    bash scripts/run_clang_tidy.sh builds/deeplake-pg-prod
    echo -e "${GREEN}Clang-tidy analysis complete.${DEFAULT}"

- name: extract version
id: extract-version
shell: bash
Expand Down
198 changes: 198 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Taskfile for the pg_deeplake PostgreSQL extension (https://taskfile.dev).
# Run `task --list` (or `task help`) for the catalogue of tasks.
version: '3'

vars:
  # Build flavor interpolated into BUILD_DIR; override per invocation:
  #   task lint BUILD_MODE=prod
  BUILD_MODE: '{{.BUILD_MODE | default "dev"}}'
  # Quoted so YAML keeps it a string — it is only ever interpolated into
  # paths and file names, never used as a number.
  PG_VERSION: "18"
  POSTGRES_SOURCE: "cpp/.ext/postgres-REL_{{.PG_VERSION}}_0"
  POSTGRES_INSTALL: "{{.POSTGRES_SOURCE}}/install"
  POSTGRES_DATA: "{{.POSTGRES_SOURCE}}/data"
  POSTGRES_LOG: "postgres/tests/taskfile_server.log"
  BUILD_DIR: "builds/deeplake-pg-{{.BUILD_MODE}}"
  # Dynamic var: fall back to "postgres" when $USER is unset
  # (e.g. minimal CI containers).
  USER:
    sh: echo ${USER:-postgres}

tasks:
  build:
    desc: "Build the pg_deeplake PostgreSQL extension (incremental)"
    cmds:
      - python3 scripts/build_pg_ext.py

  "build:dev":
    desc: "Full build in dev mode"
    cmds:
      - python3 scripts/build_pg_ext.py dev

  "build:prod":
    desc: "Full build in production mode"
    cmds:
      - python3 scripts/build_pg_ext.py prod

  "build:debug":
    desc: "Full build in debug mode"
    cmds:
      - python3 scripts/build_pg_ext.py debug

  lint:
    desc: "Run clang-tidy linter on cpp/deeplake_pg"
    deps:
      - build
    cmds:
      - bash scripts/run_clang_tidy.sh {{.BUILD_DIR}}

  "lint:file":
    desc: "Run clang-tidy on a specific file"
    dir: cpp/deeplake_pg
    cmds:
      - |
        if [ -z "{{.FILE}}" ]; then
          echo "Error: Please specify FILE parameter"
          echo "Usage: task lint:file FILE=table_am.cpp"
          exit 1
        fi
        clang-tidy -p "../../{{.BUILD_DIR}}" "{{.FILE}}"

  test:
    desc: "Run PostgreSQL extension tests (Python pytest)"
    dir: postgres/tests/py_tests
    cmds:
      - pytest -v {{.CLI_ARGS}}

  "test:parallel":
    desc: "Run tests in parallel using pytest-xdist"
    dir: postgres/tests/py_tests
    cmds:
      - pytest -v -n auto {{.CLI_ARGS}}

  "test:coverage":
    desc: "Run tests with coverage report"
    dir: postgres/tests/py_tests
    cmds:
      - pytest -v --cov=. --cov-report=html --cov-report=term {{.CLI_ARGS}}
      - echo "Coverage report generated in postgres/tests/py_tests/htmlcov/index.html"

  "test:single":
    desc: "Run a single test file"
    dir: postgres/tests/py_tests
    cmds:
      - |
        if [ -z "{{.TEST}}" ]; then
          echo "Error: Please specify TEST parameter"
          echo "Usage: task test:single TEST=test_basic_ops.py"
          exit 1
        fi
        pytest -v "{{.TEST}}" {{.CLI_ARGS}}

  run:
    desc: "Start PostgreSQL server with pg_deeplake extension loaded"
    deps:
      - build
    cmds:
      - |
        echo "Starting PostgreSQL server with pg_deeplake extension..."

        PG_CTL="{{.POSTGRES_INSTALL}}/bin/pg_ctl"
        INITDB="{{.POSTGRES_INSTALL}}/bin/initdb"
        PSQL="{{.POSTGRES_INSTALL}}/bin/psql"
        DATA_DIR="{{.POSTGRES_DATA}}"
        INSTALL_DIR="{{.POSTGRES_INSTALL}}"
        LOG_FILE="{{.POSTGRES_LOG}}"

        # postmaster.pid is PostgreSQL's own lock file; its presence means a
        # server is (or recently was) running on this data directory.
        if [ -f "$DATA_DIR/postmaster.pid" ]; then
          echo "PostgreSQL server appears to be running. Use 'task stop' to stop it first."
          exit 1
        fi

        # First run only: initialise the cluster and preload the extension
        # library so CREATE EXTENSION can succeed below.
        if [ ! -d "$DATA_DIR" ]; then
          echo "Initializing PostgreSQL database cluster..."
          "$INITDB" -D "$DATA_DIR" -U "{{.USER}}"

          echo "shared_preload_libraries = 'pg_deeplake'" >> "$DATA_DIR/postgresql.conf"
          echo "max_connections = 100" >> "$DATA_DIR/postgresql.conf"
        fi

        echo "Installing pg_deeplake extension files..."
        EXT_DIR="$INSTALL_DIR/share/extension"
        LIB_DIR="$INSTALL_DIR/lib"
        mkdir -p "$EXT_DIR" "$LIB_DIR"

        # Best-effort copies: missing artifacts are tolerated so the task can
        # be re-run after partial builds.
        # FIX(review): the original chained `|| grep -v "utils.psql"` after the
        # *.sql copy; when cp failed, grep ran against the shell's stdin (which
        # can hang an interactive session), and the `*.sql` glob never matches
        # "utils.psql" anyway, so a plain best-effort copy is equivalent.
        cp postgres/*.control "$EXT_DIR/" 2>/dev/null || true
        cp postgres/*.sql "$EXT_DIR/" 2>/dev/null || true
        cp postgres/pg_deeplake_{{.PG_VERSION}}.so "$LIB_DIR/pg_deeplake.so" 2>/dev/null || true

        export LD_LIBRARY_PATH="$LIB_DIR:${LD_LIBRARY_PATH:-}"

        echo "Starting PostgreSQL server..."
        # -w: block until the server accepts connections (replaces the
        # previous fixed `sleep 2`, which raced a slow startup).
        "$PG_CTL" -D "$DATA_DIR" -l "$LOG_FILE" -w start

        echo "Creating pg_deeplake extension..."
        "$PSQL" -U "{{.USER}}" -d postgres -c "CREATE EXTENSION IF NOT EXISTS pg_deeplake;"

        echo ""
        echo "✓ PostgreSQL server is running with pg_deeplake extension loaded"
        echo ""
        echo "Connection details:"
        echo "  Host: localhost"
        echo "  Database: postgres"
        echo "  User: {{.USER}}"
        echo ""
        echo "Connect with: psql -U {{.USER}} -d postgres"
        echo "Stop server with: task stop"
        echo "View logs: tail -f {{.POSTGRES_LOG}}"

  stop:
    desc: "Stop the PostgreSQL server"
    cmds:
      - |
        PG_CTL="{{.POSTGRES_INSTALL}}/bin/pg_ctl"
        DATA_DIR="{{.POSTGRES_DATA}}"

        # Not running is not an error for `stop` — exit cleanly.
        if [ ! -f "$DATA_DIR/postmaster.pid" ]; then
          echo "PostgreSQL server is not running"
          exit 0
        fi

        echo "Stopping PostgreSQL server..."
        # -m fast: disconnect clients and roll back open transactions.
        "$PG_CTL" -D "$DATA_DIR" stop -m fast
        echo "✓ PostgreSQL server stopped"

  status:
    desc: "Check PostgreSQL server status"
    cmds:
      - |
        PG_CTL="{{.POSTGRES_INSTALL}}/bin/pg_ctl"
        DATA_DIR="{{.POSTGRES_DATA}}"

        "$PG_CTL" -D "$DATA_DIR" status || echo "PostgreSQL server is not running"

  clean:
    desc: "Clean build artifacts and test outputs"
    cmds:
      - rm -rf builds/
      - rm -rf postgres/tests/logs/*
      - rm -rf postgres/tests/results/*
      - rm -rf postgres/tests/py_tests/htmlcov/
      - rm -rf postgres/tests/py_tests/.pytest_cache/
      - rm -rf postgres/tests/py_tests/.coverage
      - find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
      - echo "✓ Cleaned build artifacts and test outputs"

  "clean:db":
    desc: "Clean PostgreSQL data directory"
    cmds:
      # Stop the server first so the data directory is not removed from
      # under a live postmaster.
      - task: stop
      - rm -rf "{{.POSTGRES_DATA}}"
      - echo "✓ PostgreSQL data directory cleaned"

  dev:
    desc: "Full development workflow (build, lint, test)"
    cmds:
      - task: build
      - task: lint
      - task: test

  help:
    desc: "Show available tasks"
    cmds:
      - task --list
45 changes: 45 additions & 0 deletions cpp/deeplake_pg/.clang-tidy
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# Clang-Tidy configuration for pg_deeplake
#
# Enables the broad check families below, then opts out of individual rules.
# The disabled cppcoreguidelines-pro-* and vararg rules are presumably
# incompatible with PostgreSQL extension code (elog/ereport are varargs,
# page/tuple access relies on pointer arithmetic and casts) — confirm
# against the codebase before re-enabling any of them.
Checks: >
  bugprone-*,
  cert-*,
  cppcoreguidelines-*,
  modernize-*,
  performance-*,
  readability-*,
  clang-analyzer-*,
  misc-*,
  -modernize-use-trailing-return-type,
  -readability-identifier-length,
  -readability-function-cognitive-complexity,
  -cppcoreguidelines-avoid-non-const-global-variables,
  -cppcoreguidelines-pro-type-vararg,
  -cppcoreguidelines-pro-bounds-pointer-arithmetic,
  -cppcoreguidelines-pro-type-reinterpret-cast,
  -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
  -bugprone-easily-swappable-parameters,
  -cppcoreguidelines-pro-type-const-cast

# Empty string: report findings as warnings only; never fail on them.
WarningsAsErrors: ''

# Naming convention: snake_case for namespaces, classes, functions,
# variables and parameters; UPPER_CASE constants; CamelCase structs and
# enum constants.
CheckOptions:
  - key: readability-identifier-naming.NamespaceCase
    value: lower_case
  - key: readability-identifier-naming.ClassCase
    value: lower_case
  - key: readability-identifier-naming.StructCase
    value: CamelCase
  - key: readability-identifier-naming.FunctionCase
    value: lower_case
  - key: readability-identifier-naming.VariableCase
    value: lower_case
  - key: readability-identifier-naming.ConstantCase
    value: UPPER_CASE
  - key: readability-identifier-naming.ParameterCase
    value: lower_case
  - key: readability-identifier-naming.EnumConstantCase
    value: CamelCase
  # Small integers and unit floats are not treated as magic numbers.
  - key: cppcoreguidelines-avoid-magic-numbers.IgnoredIntegerValues
    value: '0;1;2;3;4;-1'
  - key: cppcoreguidelines-avoid-magic-numbers.IgnoredFloatingPointValues
    value: '0.0;1.0;-1.0'
21 changes: 11 additions & 10 deletions cpp/deeplake_pg/deeplake_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void analyze_plan(PlannedStmt* plan)
if (plan->permInfos == nullptr) {
return;
}
ListCell* lc;
ListCell* lc = nullptr;
bool all_tables_are_deeplake = true;
foreach (lc, plan->permInfos) {
RTEPermissionInfo* perminfo = (RTEPermissionInfo*)lfirst(lc);
Expand All @@ -69,7 +69,7 @@ void analyze_plan(PlannedStmt* plan)
while ((col = bms_next_member(perminfo->selectedCols, col)) >= 0) {
// bms_next_member returns 0-based index, but AttrNumber is 1-based
// The bitmapset stores (attnum - FirstLowInvalidHeapAttributeNumber)
AttrNumber attnum = col + FirstLowInvalidHeapAttributeNumber;
AttrNumber attnum = static_cast<AttrNumber>(col + FirstLowInvalidHeapAttributeNumber);
if (attnum <= 0) { // Only positive attribute numbers are real columns
continue;
}
Expand Down Expand Up @@ -104,7 +104,8 @@ struct DeeplakeExecutorState
size_t total_rows = 0;

DeeplakeExecutorState()
: printer("DeeplakeExecutorState")
: css{}
, printer("DeeplakeExecutorState")
{
}

Expand Down Expand Up @@ -133,8 +134,8 @@ struct DeeplakeExecutorState
// Simple executor state for COUNT(*) fast path
struct CountExecutorState
{
CustomScanState css;
int64_t count_value;
CustomScanState css{};
int64_t count_value = 0;
bool returned = false;
};

Expand Down Expand Up @@ -205,10 +206,10 @@ Datum deeplake_sample_to_datum(
try {
if (!type_is_array(target_type) && nd::dtype_is_numeric(samples.dtype())) {
return nd::switch_numeric_dtype(samples.dtype(), [&]<typename T>() {
return pg::utils::pointer_to_datum<T>(samples.data().data(), target_type, attr_typmod, index);
return pg::utils::pointer_to_datum<T>(samples.data().data(), target_type, attr_typmod, static_cast<int64_t>(index));
});
}
nd::array sample = (samples.dimensions() == 0 ? samples : samples[index]);
nd::array sample = (samples.dimensions() == 0 ? samples : samples[static_cast<int64_t>(index)]);
if (sample.is_none()) {
is_null = true;
return (Datum)0;
Expand Down Expand Up @@ -340,8 +341,8 @@ void deeplake_executor_explain(CustomScanState* node, List* ancestors, ExplainSt
DeeplakeExecutorState* state = (DeeplakeExecutorState*)node;
ExplainPropertyText("DeepLake Query", state->query_string.c_str(), es);
if (state->total_rows > 0) {
ExplainPropertyInteger("Rows", nullptr, state->total_rows, es);
ExplainPropertyInteger("Chunks", nullptr, state->duckdb_result.get_chunk_count(), es);
ExplainPropertyInteger("Rows", nullptr, static_cast<int64>(state->total_rows), es);
ExplainPropertyInteger("Chunks", nullptr, static_cast<int64>(state->duckdb_result.get_chunk_count()), es);
}
}

Expand Down Expand Up @@ -429,7 +430,7 @@ void count_executor_explain(CustomScanState* node, List* ancestors, ExplainState
List* create_simple_targetlist(List* original_targetlist)
{
List* new_targetlist = NIL;
ListCell* lc;
ListCell* lc = nullptr;
AttrNumber attno = 1;

foreach (lc, original_targetlist) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/deeplake_pg/duckdb_deeplake_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ nd::array to_deeplake_value(std::shared_ptr<duckdb::Vector>&& vector, size_t tot
const auto dt = nd::dtype_enum<T>::value;
auto data_span = std::span<const uint8_t>(
reinterpret_cast<const uint8_t*>(src_data),
total_rows * sizeof(T)
static_cast<size_t>(total_rows) * sizeof(T)
);
return nd::array(nd::impl::std_span_array_nd(std::move(vector), data_span,
icm::shape{static_cast<int64_t>(total_rows)}, dt));
Expand Down
6 changes: 3 additions & 3 deletions cpp/deeplake_pg/duckdb_deeplake_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ class deeplake_scan_function_helper
// Copy actual data to grandchild vector
auto* data_ptr = duckdb::FlatVector::GetData<T>(grandchild_vec);
const T* array_data = reinterpret_cast<const T*>(sample.data().data());
std::memcpy(data_ptr + grandchild_offset, array_data, nrows * ncols * sizeof(T));
std::memcpy(data_ptr + grandchild_offset, array_data, nrows * static_cast<size_t>(ncols) * sizeof(T));

// Log first few values being written
elog(LOG, " WRITE: copying %ld elements to grandchild at offset %zu",
Expand Down Expand Up @@ -673,7 +673,7 @@ class deeplake_scan_function_helper
if constexpr (std::is_arithmetic_v<T>) {
auto* list_data = duckdb::FlatVector::GetData<T>(list_entry_vec);
const T* array_data = reinterpret_cast<const T*>(sample.data().data());
std::memcpy(list_data + offset, array_data, array_len * sizeof(T));
std::memcpy(list_data + offset, array_data, static_cast<size_t>(array_len) * sizeof(T));
} else if constexpr (std::is_same_v<T, std::span<const uint8_t>>) {
auto* list_data = duckdb::FlatVector::GetData<duckdb::string_t>(list_entry_vec);
for (int64_t i = 0; i < array_len; ++i) {
Expand Down Expand Up @@ -828,7 +828,7 @@ class deeplake_scan_function_helper
}
return;
}
std::memcpy(duckdb::FlatVector::GetData<T>(output_vector), value_ptr, batch_size * sizeof(T));
std::memcpy(duckdb::FlatVector::GetData<T>(output_vector), value_ptr, static_cast<size_t>(batch_size) * sizeof(T));
} else if constexpr (std::is_same_v<T, nd::dict>) {
auto* duckdb_data = duckdb::FlatVector::GetData<duckdb::string_t>(output_vector);
for (duckdb::idx_t row_in_batch = 0; row_in_batch < batch_size; ++row_in_batch) {
Expand Down
8 changes: 4 additions & 4 deletions cpp/deeplake_pg/duckdb_pg_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,8 +411,8 @@ Datum duckdb_value_to_pg_datum(
int total_elements = dim0 * dim1;

// Allocate flat arrays for all elements
Datum* elem_datums = (Datum*)palloc(total_elements * sizeof(Datum));
bool* elem_nulls = (bool*)palloc(total_elements * sizeof(bool));
Datum* elem_datums = (Datum*)palloc(static_cast<size_t>(total_elements) * sizeof(Datum));
bool* elem_nulls = (bool*)palloc(static_cast<size_t>(total_elements) * sizeof(bool));

// Get the data vector (grandchild of the outer list)
auto& data_vec = ListVector::GetEntry(list_child);
Expand Down Expand Up @@ -493,8 +493,8 @@ Datum duckdb_value_to_pg_datum(
}

// Convert each element
Datum* elem_datums = (Datum*)palloc(list_size * sizeof(Datum));
bool* elem_nulls = (bool*)palloc(list_size * sizeof(bool));
Datum* elem_datums = (Datum*)palloc(static_cast<size_t>(list_size) * sizeof(Datum));
bool* elem_nulls = (bool*)palloc(static_cast<size_t>(list_size) * sizeof(bool));

for (idx_t i = 0; i < list_size; i++) {
elem_datums[i] = duckdb_value_to_pg_datum(list_child,
Expand Down
Loading