
Commit a07fc65 ("first commit", 0 parents)

17 files changed: +1650, -0 lines

.github/workflows/ci.yml

Lines changed: 49 additions & 0 deletions
```yaml
name: CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        php: [ '8.2', '8.3' ]

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: ${{ matrix.php }}
          coverage: none
          extensions: sqlite3, pdo_sqlite
          ini-values: memory_limit=-1

      - name: Validate composer.json and composer.lock
        run: composer validate --no-check-publish

      - name: Cache Composer packages
        uses: actions/cache@v4
        with:
          path: |
            ~/.composer/cache/files
            vendor
          key: ${{ runner.os }}-php-${{ matrix.php }}-composer-${{ hashFiles('**/composer.lock') }}
          restore-keys: |
            ${{ runner.os }}-php-${{ matrix.php }}-composer-

      - name: Install dependencies
        run: |
          composer install --no-interaction --no-progress --prefer-dist
          python3 -m pip install --upgrade pip
          python3 -m pip install pyarrow

      - name: Run test suite
        run: vendor/bin/phpunit --no-coverage
```

README.md

Lines changed: 117 additions & 0 deletions
# ParqBridge

Export your Laravel database tables to real Apache Parquet files on any Storage disk (local, S3, etc.) with a simple artisan command.

ParqBridge focuses on zero PHP dependency bloat while still producing spec-compliant Parquet files by delegating the final write step to a tiny, embedded Python script using PyArrow (or any custom CLI you prefer). You keep the full Laravel DX for configuration and Storage; we bridge your data to Parquet.

## Installation

- Require the package in your app (path repo or VCS):

```bash
composer require dgtlss/parqbridge
```

- Laravel will auto-discover the service provider. Alternatively, register `ParqBridge\ParqBridgeServiceProvider` manually.

- Publish the config if you want to customize the defaults:

```bash
php artisan vendor:publish --tag="parqbridge-config"
```

## Configuration

Set your export disk and options in `.env` or `config/parqbridge.php`.

- `PARQUET_DISK`: which filesystem disk to use (e.g., `s3`, `local`).
- `PARQUET_OUTPUT_DIR`: directory prefix within the disk (default `parquet-exports`).
- `PARQUET_CHUNK_SIZE`: rows per DB chunk when exporting (default 1000).
- `PARQUET_INFERENCE`: `database|sample|hybrid` (default `hybrid`).
- `PARQUET_COMPRESSION`: compression codec for Parquet (`UNCOMPRESSED`/`NONE`, `SNAPPY`, `GZIP`, `ZSTD`, `BROTLI`, `LZ4_RAW`) when using the PyArrow backend.
- `PARQBRIDGE_WRITER`: `pyarrow` (default) or `custom`. If `custom`, set `PARQBRIDGE_CUSTOM_CMD`.
- `PARQBRIDGE_PYTHON`: Python executable for PyArrow (default `python3`).

Example `.env`:

```ini
PARQUET_DISK=s3
PARQUET_OUTPUT_DIR=parquet-exports
PARQUET_CHUNK_SIZE=2000
```

Ensure your `filesystems` disk is configured (e.g., `s3`) in `config/filesystems.php`.

## Usage

- List tables:

```bash
php artisan parqbridge:tables
```

- Export a table to the configured disk:

```bash
php artisan parqbridge:export users --where="active = 1" --limit=1000 --output="parquet-exports" --disk=s3
```

On success, the command prints the full path written within the disk. Files are named `{table}-{YYYYMMDD_HHMMSS}.parquet`.
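The naming scheme above can be illustrated with a short sketch (a hypothetical helper for illustration; the package builds the name internally):

```python
from datetime import datetime


def parquet_filename(table: str, when: datetime) -> str:
    """Build a {table}-{YYYYMMDD_HHMMSS}.parquet file name."""
    return f"{table}-{when.strftime('%Y%m%d_%H%M%S')}.parquet"


print(parquet_filename("users", datetime(2024, 5, 1, 13, 30, 5)))
# -> users-20240501_133005.parquet
```

The timestamp component makes repeated exports of the same table non-colliding within one output directory.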

- Export ALL tables into one folder (timestamped subfolder inside `parqbridge.output_directory`):

```bash
php artisan parqbridge:export-all --disk=s3 --output="parquet-exports" --exclude=migrations,password_resets
```

Options:
- `--include=`: comma-separated allowlist of table names
- `--exclude=`: comma-separated denylist of table names
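The allowlist/denylist semantics can be modeled with a minimal sketch (an illustrative re-statement of the filtering, not the package's actual code):

```python
def filter_tables(tables, include=None, exclude=None):
    """Apply the allowlist first, then the denylist, preserving order."""
    if include:
        tables = [t for t in tables if t in include]
    if exclude:
        tables = [t for t in tables if t not in exclude]
    return tables


print(filter_tables(["users", "migrations", "orders"], exclude=["migrations"]))
# -> ['users', 'orders']
```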
## Data types

The schema inferrer maps common DB types to a set of Parquet primitive types and logical annotations. With the PyArrow backend, an Arrow schema is constructed so the types are written faithfully:

- Primitive: `BOOLEAN`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BYTE_ARRAY`, `FIXED_LEN_BYTE_ARRAY`
- Logical: `UTF8`, `DATE`, `TIME_MILLIS`, `TIME_MICROS`, `TIMESTAMP_MILLIS`, `TIMESTAMP_MICROS`, `DECIMAL`

For decimals we write Arrow decimal types (`decimal128`/`decimal256`) with the declared `precision`/`scale`.
## Testing

Run the test suite:

```bash
composer install
vendor/bin/phpunit
```

The tests bootstrap a minimal container, create a SQLite database, and verify that:
- listing tables works on SQLite
- exporting a table writes a Parquet file to the configured disk (magic `PAR1`)
- schema inference on SQLite maps the major type families
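The `PAR1` magic check mentioned above can be sketched like this (an illustrative helper, not the package's test code): a Parquet file begins and ends with the 4-byte magic `PAR1`.

```python
def looks_like_parquet(data: bytes) -> bool:
    """A Parquet file starts and ends with the 4-byte magic b'PAR1'."""
    return len(data) >= 8 and data[:4] == b"PAR1" and data[-4:] == b"PAR1"


print(looks_like_parquet(b"PAR1" + b"\x00" * 16 + b"PAR1"))
# -> True
```

This is a cheap sanity check only; it does not validate the footer metadata, just the framing.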
## Backend requirements

- By default ParqBridge uses Python + PyArrow. Ensure `python3` is available and install PyArrow:

```bash
python3 -m pip install --upgrade pip
python3 -m pip install pyarrow
```

- Alternatively, set a custom converter command via `PARQBRIDGE_WRITER=custom` and `PARQBRIDGE_CUSTOM_CMD` (the command must read the `{input}` CSV and write the `{output}` Parquet file).

You can automate the setup via the included command:

```bash
php artisan parqbridge:setup --write-env
```

Options:
- `--python=`: path/name of Python (default from config `parqbridge.pyarrow_python`)
- `--venv=`: location for the virtualenv (default `./parqbridge-venv`)
- `--no-venv`: install into the global Python instead of a venv
- `--write-env`: append `PARQBRIDGE_PYTHON` and `PARQBRIDGE_WRITER` to `.env`
- `--upgrade`: upgrade pip first
- `--dry-run`: print commands without executing

composer.json

Lines changed: 38 additions & 0 deletions
```json
{
    "name": "dgtlss/parqbridge",
    "description": "Export Laravel database tables to Parquet files using Storage disks (no external deps).",
    "type": "library",
    "license": "MIT",
    "authors": [
        { "name": "ParqBridge", "email": "dev@example.com" }
    ],
    "require": {
        "php": ">=8.2"
    },
    "require-dev": {
        "phpunit/phpunit": "^10.5",
        "illuminate/support": "^12.0",
        "illuminate/container": "^12.0",
        "illuminate/config": "^12.0",
        "illuminate/filesystem": "^12.0",
        "illuminate/database": "^12.0",
        "illuminate/console": "^12.0"
    },
    "autoload": {
        "psr-4": {
            "ParqBridge\\": "src/"
        }
    },
    "autoload-dev": {
        "psr-4": {
            "ParqBridge\\Tests\\": "tests/"
        }
    },
    "extra": {
        "laravel": {
            "providers": [
                "ParqBridge\\ParqBridgeServiceProvider"
            ]
        }
    }
}
```

config/parqbridge.php

Lines changed: 95 additions & 0 deletions
```php
<?php

return [
    /*
    |--------------------------------------------------------------------------
    | Export Disk
    |--------------------------------------------------------------------------
    | The filesystem disk where Parquet files will be written. This uses
    | Laravel's Storage facade under the hood, so any disk configured in
    | config/filesystems.php is supported (e.g., "local", "s3").
    |
    | .env: PARQUET_DISK=s3
    */
    'disk' => env('PARQUET_DISK', env('FILESYSTEM_DISK', 'local')),

    /*
    |--------------------------------------------------------------------------
    | Output Directory
    |--------------------------------------------------------------------------
    | Directory path prefix inside the selected disk. The final path will be
    | {output_directory}/{table}-{timestamp}.parquet
    |
    | .env: PARQUET_OUTPUT_DIR=parquet-exports
    */
    'output_directory' => env('PARQUET_OUTPUT_DIR', 'parquet-exports'),

    /*
    |--------------------------------------------------------------------------
    | Chunk Size
    |--------------------------------------------------------------------------
    | Number of rows fetched per chunk when streaming data out of the database.
    | Larger chunks are faster but use more memory.
    |
    | .env: PARQUET_CHUNK_SIZE=1000
    */
    'chunk_size' => (int) env('PARQUET_CHUNK_SIZE', 1000),

    /*
    |--------------------------------------------------------------------------
    | Date/Time Formatting for Fallbacks
    |--------------------------------------------------------------------------
    | When a database driver returns date/time types as strings or DateTime,
    | these formats are used for the Parquet logical annotations we emit.
    | You usually don't need to change these.
    */
    'date_format' => 'Y-m-d',
    'datetime_format' => \DateTimeInterface::ATOM,
    'time_format' => 'H:i:s',

    /*
    |--------------------------------------------------------------------------
    | Schema Inference Strategy
    |--------------------------------------------------------------------------
    | "database" will use the database column types from the schema to choose
    | Parquet primitive/logical types. "sample" will inspect the first chunk
    | of data to refine types (e.g., booleans stored as tinyint(1)).
    | Options: database | sample | hybrid
    */
    'inference' => env('PARQUET_INFERENCE', 'hybrid'),

    /*
    |--------------------------------------------------------------------------
    | Compression
    |--------------------------------------------------------------------------
    | Compression codec for Parquet files. When using the PyArrow backend you
    | may choose from: NONE (alias UNCOMPRESSED), SNAPPY, GZIP, ZSTD, BROTLI,
    | LZ4_RAW. Default is UNCOMPRESSED.
    */
    'compression' => env('PARQUET_COMPRESSION', 'UNCOMPRESSED'),

    /*
    |--------------------------------------------------------------------------
    | Writer Backend
    |--------------------------------------------------------------------------
    | Controls how ParqBridge produces Apache Parquet files.
    | - pyarrow: Uses Python + PyArrow (requires `python3` with `pyarrow` installed)
    | - custom: Uses a custom shell command template provided below
    |
    | .env: PARQBRIDGE_WRITER=pyarrow
    */
    'writer' => env('PARQBRIDGE_WRITER', 'pyarrow'),

    /*
    | Python executable name/path for the PyArrow backend. E.g., python3 or /usr/bin/python3
    | .env: PARQBRIDGE_PYTHON=python3
    */
    'pyarrow_python' => env('PARQBRIDGE_PYTHON', 'python3'),

    /*
    | Custom command template when writer=custom. Use {input} and {output} placeholders.
    | Example (DuckDB CLI): duckdb -c "COPY (SELECT * FROM read_csv_auto({input})) TO {output} (FORMAT PARQUET)"
    | .env: PARQBRIDGE_CUSTOM_CMD="duckdb -c \"COPY (SELECT * FROM read_csv_auto({input})) TO {output} (FORMAT PARQUET)\""
    */
    'custom_command' => env('PARQBRIDGE_CUSTOM_CMD', ''),
];
```
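The `{input}`/`{output}` placeholder substitution described for `custom_command` can be modeled with a short sketch (illustrative only; the package performs this replacement internally before shelling out):

```python
def render_command(template: str, input_path: str, output_path: str) -> str:
    """Replace the {input}/{output} placeholders in a custom writer template.

    Real code should also shell-escape the paths before executing the result.
    """
    return template.replace("{input}", input_path).replace("{output}", output_path)


print(render_command("converter {input} {output}", "/tmp/users.csv", "/tmp/users.parquet"))
# -> converter /tmp/users.csv /tmp/users.parquet
```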

phpunit.xml

Lines changed: 13 additions & 0 deletions
```xml
<?xml version="1.0" encoding="UTF-8"?>
<phpunit colors="true" bootstrap="tests/bootstrap.php" cacheResultFile=".phpunit.result.cache">
    <testsuites>
        <testsuite name="ParqBridge Test Suite">
            <directory suffix="Test.php">tests</directory>
        </testsuite>
    </testsuites>
    <coverage processUncoveredFiles="true">
        <include>
            <directory suffix=".php">src</directory>
        </include>
    </coverage>
</phpunit>
```
src/Console/ExportAllTablesCommand.php

Lines changed: 76 additions & 0 deletions

```php
<?php

namespace ParqBridge\Console;

use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;

class ExportAllTablesCommand extends Command
{
    protected $signature = 'parqbridge:export-all {--disk=} {--output=} {--exclude=} {--include=}';
    protected $description = 'Export all database tables to Parquet files into a single folder on the chosen disk.';

    public function handle(): int
    {
        $disk = (string) ($this->option('disk') ?: config('parqbridge.disk'));
        $rootOutput = (string) ($this->option('output') ?: config('parqbridge.output_directory'));

        $include = $this->parseCsvOption('include');
        $exclude = $this->parseCsvOption('exclude');

        $tables = $this->getTables();
        if (!empty($include)) {
            $tables = array_values(array_intersect($tables, $include));
        }
        if (!empty($exclude)) {
            $tables = array_values(array_diff($tables, $exclude));
        }

        if (empty($tables)) {
            $this->warn('No tables to export.');
            return self::SUCCESS;
        }

        $subdir = now()->format('Ymd_His');
        $finalOutput = trim($rootOutput, '/').'/'.$subdir;

        $this->info('Exporting '.count($tables).' tables to folder: '.$finalOutput.' on disk '.$disk);

        $ok = 0;
        $fail = 0;
        foreach ($tables as $t) {
            $exit = $this->call('parqbridge:export', [
                'table' => $t,
                '--output' => $finalOutput,
                '--disk' => $disk,
            ]);
            if ($exit === self::SUCCESS) {
                $ok++;
            } else {
                $fail++;
            }
        }

        $this->line("Completed. Success: {$ok}, Failed: {$fail}. Folder: {$finalOutput}");
        $this->line($finalOutput);
        return $fail === 0 ? self::SUCCESS : self::FAILURE;
    }

    private function parseCsvOption(string $name): array
    {
        $raw = (string) ($this->option($name) ?: '');
        if ($raw === '') {
            return [];
        }

        return array_values(array_filter(
            array_map(fn ($v) => trim($v), explode(',', $raw)),
            fn ($v) => $v !== ''
        ));
    }

    private function getTables(): array
    {
        $driver = DB::getDriverName();

        return match ($driver) {
            'mysql', 'mariadb' => collect(DB::select('SHOW TABLES'))->map(fn ($r) => array_values((array) $r)[0])->all(),
            'pgsql' => collect(DB::select("SELECT tablename FROM pg_tables WHERE schemaname = 'public'"))->pluck('tablename')->all(),
            'sqlite' => collect(DB::select("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"))->pluck('name')->all(),
            'sqlsrv' => collect(DB::select("SELECT table_name FROM information_schema.tables WHERE table_type = 'BASE TABLE'"))->pluck('table_name')->all(),
            default => throw new \RuntimeException("Unsupported driver: {$driver}"),
        };
    }
}
```
