Merge pull request #23 from rdhyee/issue-13-parquet-duckdb

rdhyee · web-flow · commit 6c564702e6dd · 2025-09-18T14:45:16.000-07:00
Enhanced parquet analysis with object types and property distribution
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
@@ -1,7 +1,14 @@
 {
   "permissions": {
     "allow": [
-      "Bash(git branch:*)"
+      "Bash(git branch:*)",
+      "WebFetch(domain:localhost)",
+      "Bash(git add:*)",
+      "Read(//Users/raymondyee/dev-journal/daily/**)",
+      "Bash(git commit:*)",
+      "Bash(git push:*)",
+      "Bash(git pull:*)",
+      "Bash(git fetch:*)"
     ],
     "deny": [],
     "ask": []
diff --git a/tutorials/index.qmd b/tutorials/index.qmd
@@ -33,6 +33,7 @@ With iSamples Central currently unavailable, all tutorials now use **geoparquet
 ## Why Geoparquet?
 
 Our tutorials showcase how **geoparquet + DuckDB-WASM** enables:
+
 - ✅ **Universal access**: No software installation required
 - ✅ **Fast analysis**: 5-10x faster than traditional approaches (e.g., downloading full CSV datasets and analyzing them locally). [See benchmark](https://duckdb.org/2023/05/10/duckdb-wasm.html)
 - ✅ **Memory efficient**: Analyze 300MB datasets using <100MB browser memory
diff --git a/tutorials/parquet_cesium.qmd b/tutorials/parquet_cesium.qmd
@@ -8,8 +8,8 @@ One key development of the iSamples project centers on the demonstration of low-
 This page demonstrates how geospatial data can be dynamically accessed from a remote parquet file in cloud storage. The page uses Cesium for browser visualization of these spatial data on a 3D global map. The data in this demonstration comes from [Open Context's](https://opencontext.org/) export of specimen (archaeological artifact and ecofact) records for iSamples. However, this demonstration can also work with any other iSamples compliant parquet data source made publicly accessible on the Web.
 
 
-<script src="https://cesium.com/downloads/cesiumjs/releases/1.127/Build/Cesium/Cesium.js"></script>
-<link href="https://cesium.com/downloads/cesiumjs/releases/1.127/Build/Cesium/Widgets/widgets.css" rel="stylesheet"></link>
+<script src="https://cesium.com/downloads/cesiumjs/releases/1.133/Build/Cesium/Cesium.js"></script>
+<link href="https://cesium.com/downloads/cesiumjs/releases/1.133/Build/Cesium/Widgets/widgets.css" rel="stylesheet">
 <style>
     div.cesium-topleft {
         display: block;
@@ -238,6 +238,8 @@ viewof pointdata = {
 
 :::
 
+The number of locations in the file is: ${pointdata.length}.
+
 The click point ID is "${clickedPointId}".
 
 ```{ojs}
@@ -248,4 +250,196 @@ ${JSON.stringify(selectedGeoRecord, null, 2)}
 `
 ```
 
+## Table Structure Analysis
+
+Understanding the structure and schema of the parquet file:
+
+### Column Schema
+
+```{ojs}
+//| code-fold: true
+tableSchema = {
+    const query = `DESCRIBE nodes`;
+    const data = await loadData(query, [], "loading_schema");
+    return data;
+}
+```
+
+<div id="loading_schema">Loading table schema...</div>
+
+```{ojs}
+//| code-fold: true
+viewof schemaTable = {
+    const data_table = Inputs.table(tableSchema, {
+        header: {
+            column_name: "Column Name",
+            column_type: "Data Type",
+            null: "Nullable",
+            key: "Key",
+            default: "Default",
+            extra: "Extra"
+        }
+    });
+    return data_table;
+}
+```
+
+### Sample Data
+
+First 10 rows of the dataset to understand the data structure:
+
+```{ojs}
+//| code-fold: true
+sampleData = {
+    const query = `SELECT * FROM nodes LIMIT 10`;
+    const data = await loadData(query, [], "loading_sample");
+    return data;
+}
+```
+
+<div id="loading_sample">Loading sample data...</div>
+
+```{ojs}
+//| code-fold: true
+viewof sampleTable = {
+    const data_table = Inputs.table(sampleData, {
+        layout: "auto",
+        width: {
+            pid: 200,
+            otype: 150
+        }
+    });
+    return data_table;
+}
+```
+
+### Sample Data by Object Type
+
+Examples of records for each object type to understand the data semantics:
+
+```{ojs}
+//| code-fold: true
+sampleDataByOtype = {
+    // First get the list of unique object types
+    const otypeQuery = `SELECT DISTINCT otype FROM nodes ORDER BY otype`;
+    const otypes = await loadData(otypeQuery, [], "loading_otype_samples");
+    
+    const results = [];
+    for (const otypeRow of otypes) {
+        const otype = otypeRow.otype;
+        // Get 3 sample records for each otype
+        const sampleQuery = `SELECT * FROM nodes WHERE otype = ? LIMIT 3`;
+        const samples = await db.query(sampleQuery, [otype]);
+        
+        results.push({
+            otype: otype,
+            count: samples.length,
+            samples: samples
+        });
+    }
+    return results;
+}
+```
+
+<div id="loading_otype_samples">Loading sample data by object type...</div>
+
+```{ojs}
+//| code-fold: true
+viewof otypeSamplesDisplay = {
+    const container = html`<div></div>`;
+    
+    for (const otypeData of sampleDataByOtype) {
+        const section = html`<div style="margin-bottom: 2rem;">
+            <h4 style="color: #2563eb; margin-bottom: 0.5rem;">Object Type: ${otypeData.otype}</h4>
+            <p style="margin: 0.5rem 0; font-style: italic;">Sample records (showing up to 3):</p>
+        </div>`;
+        
+        // Create a table for this otype's samples
+        const table = Inputs.table(otypeData.samples, {
+            layout: "auto",
+            width: {
+                pid: 150,
+                otype: 120,
+                latitude: 100,
+                longitude: 100
+            }
+        });
+        
+        section.appendChild(table);
+        container.appendChild(section);
+    }
+    
+    return container;
+}
+```
+
+## Object Type Counts
+
+The distribution of object types (`otype`) in the dataset:
+
+```{ojs}
+//| code-fold: true
+otypeCounts = {
+    const query = `SELECT otype, COUNT(*) as count FROM nodes GROUP BY otype ORDER BY count DESC`;
+    const data = await loadData(query, [], "loading_otype");
+    return data;
+}
+```
+
+<div id="loading_otype">Loading object type counts...</div>
+
+```{ojs}
+//| code-fold: true
+viewof otypeTable = {
+    const data_table = Inputs.table(otypeCounts, {
+        header: {
+            otype: "Object Type",
+            count: "Count"
+        },
+        format: {
+            count: d => d.toLocaleString()
+        }
+    });
+    return data_table;
+}
+```
+
+Total records by object type: ${otypeCounts.reduce((sum, row) => sum + Number(row.count), 0).toLocaleString()}
+
+## Property Distribution Analysis
+
+Understanding the range of properties (predicates) in this graph database structure:
+
+```{ojs}
+//| code-fold: true
+propertyDistribution = {
+    const query = `SELECT p as property, COUNT(*) as count FROM nodes WHERE p IS NOT NULL GROUP BY p ORDER BY count DESC`;
+    const data = await loadData(query, [], "loading_properties");
+    return data;
+}
+```
+
+<div id="loading_properties">Loading property distribution...</div>
+
+```{ojs}
+//| code-fold: true
+viewof propertyTable = {
+    const data_table = Inputs.table(propertyDistribution, {
+        header: {
+            property: "Property (Predicate)",
+            count: "Count"
+        },
+        format: {
+            count: d => d.toLocaleString()
+        },
+        layout: "auto"
+    });
+    return data_table;
+}
+```
+
+Total records with properties: ${propertyDistribution.reduce((sum, row) => sum + Number(row.count), 0).toLocaleString()}
+
+Unique properties in the dataset: ${propertyDistribution.length.toLocaleString()}
+
 
diff --git a/tutorials/zenodo_isamples_analysis.qmd b/tutorials/zenodo_isamples_analysis.qmd