diff --git a/demos/demos_by_use_case/social/brightkite_checkin.ipynb b/demos/demos_by_use_case/social/brightkite_checkin.ipynb new file mode 100644 index 0000000000..0a1a25c7d6 --- /dev/null +++ b/demos/demos_by_use_case/social/brightkite_checkin.ipynb @@ -0,0 +1,905 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Brightkite Location-Based Social Network Dataset\n", + "\n", + "This notebook analyzes the Brightkite dataset from SNAP:\n", + "- **Network**: 58,228 users with 214,078 friendships\n", + "- **Check-ins**: 4.4M location check-ins from April 2008 - October 2010\n", + "\n", + "Source: https://snap.stanford.edu/data/loc-brightkite.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import requests\n", + "import gzip\n", + "from io import BytesIO, StringIO\n", + "import graphistry\n", + "\n", + "# To specify Graphistry account & server, use:\n", + "# graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\",\n", + "# username=\"...\", password=\"...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and Parse Friendship Network" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading friendship network...\n", + "Loaded 428,156 edges\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user1", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user2", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "8ccb118f-b78d-48ba-a603-bd767459322c", + "rows": [ + [ + "0", + "0", + "1" + ], + [ + "1", + "0", + "2" + ], + [ + "2", + "0", + "3" + ], + [ + "3", + "0", + "4" + ], + [ + "4", + "0", + "5" + ] + ], + 
"shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user1user2
001
102
203
304
405
\n", + "
" + ], + "text/plain": [ + " user1 user2\n", + "0 0 1\n", + "1 0 2\n", + "2 0 3\n", + "3 0 4\n", + "4 0 5" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download friendship network\n", + "edges_url = 'https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz'\n", + "print('Downloading friendship network...')\n", + "edges_response = requests.get(edges_url)\n", + "\n", + "# Decompress and parse\n", + "with gzip.GzipFile(fileobj=BytesIO(edges_response.content)) as f:\n", + " edges_content = f.read().decode('utf-8')\n", + "\n", + "# Parse into DataFrame\n", + "edges_df = pd.read_csv(\n", + " StringIO(edges_content),\n", + " sep='\\t',\n", + " comment='#',\n", + " names=['user1', 'user2'],\n", + " dtype={'user1': int, 'user2': int}\n", + ")\n", + "\n", + "print(f'Loaded {len(edges_df):,} edges')\n", + "edges_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and Parse Check-in Data" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading check-in data...\n", + "Loaded 4,491,144 check-ins\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user", + "rawType": "int64", + "type": "integer" + }, + { + "name": "check_in_time", + "rawType": "datetime64[ns, UTC]", + "type": "unknown" + }, + { + "name": "latitude", + "rawType": "float64", + "type": "float" + }, + { + "name": "longitude", + "rawType": "float64", + "type": "float" + }, + { + "name": "location_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "fce04ca6-e596-4c7b-bc34-08805db20113", + "rows": [ + [ + "0", + "0", + "2010-10-17 01:48:53+00:00", + "39.747652", + "-104.99251", + "88c46bf20db295831bd2d1718ad7e6f5" + ], + [ + "1", + "0", + 
"2010-10-16 06:02:04+00:00", + "39.891383", + "-105.070814", + "7a0f88982aa015062b95e3b4843f9ca2" + ], + [ + "2", + "0", + "2010-10-16 03:48:54+00:00", + "39.891077", + "-105.068532", + "dd7cd3d264c2d063832db506fba8bf79" + ], + [ + "3", + "0", + "2010-10-14 18:25:51+00:00", + "39.750469", + "-104.999073", + "9848afcc62e500a01cf6fbf24b797732f8963683" + ], + [ + "4", + "0", + "2010-10-14 00:21:47+00:00", + "39.752713", + "-104.996337", + "2ef143e12038c870038df53e0478cefc" + ] + ], + "shape": { + "columns": 5, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
usercheck_in_timelatitudelongitudelocation_id
002010-10-17 01:48:53+00:0039.747652-104.99251088c46bf20db295831bd2d1718ad7e6f5
102010-10-16 06:02:04+00:0039.891383-105.0708147a0f88982aa015062b95e3b4843f9ca2
202010-10-16 03:48:54+00:0039.891077-105.068532dd7cd3d264c2d063832db506fba8bf79
302010-10-14 18:25:51+00:0039.750469-104.9990739848afcc62e500a01cf6fbf24b797732f8963683
402010-10-14 00:21:47+00:0039.752713-104.9963372ef143e12038c870038df53e0478cefc
\n", + "
" + ], + "text/plain": [ + " user check_in_time latitude longitude \\\n", + "0 0 2010-10-17 01:48:53+00:00 39.747652 -104.992510 \n", + "1 0 2010-10-16 06:02:04+00:00 39.891383 -105.070814 \n", + "2 0 2010-10-16 03:48:54+00:00 39.891077 -105.068532 \n", + "3 0 2010-10-14 18:25:51+00:00 39.750469 -104.999073 \n", + "4 0 2010-10-14 00:21:47+00:00 39.752713 -104.996337 \n", + "\n", + " location_id \n", + "0 88c46bf20db295831bd2d1718ad7e6f5 \n", + "1 7a0f88982aa015062b95e3b4843f9ca2 \n", + "2 dd7cd3d264c2d063832db506fba8bf79 \n", + "3 9848afcc62e500a01cf6fbf24b797732f8963683 \n", + "4 2ef143e12038c870038df53e0478cefc " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download check-in data\n", + "checkins_url = 'https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz'\n", + "print('Downloading check-in data...')\n", + "checkins_response = requests.get(checkins_url)\n", + "\n", + "# Decompress and parse\n", + "with gzip.GzipFile(fileobj=BytesIO(checkins_response.content)) as f:\n", + " checkins_content = f.read().decode('utf-8')\n", + "\n", + "# Parse into DataFrame\n", + "checkins_df = pd.read_csv(\n", + " StringIO(checkins_content),\n", + " sep='\\t',\n", + " comment='#',\n", + " names=['user', 'check_in_time', 'latitude', 'longitude', 'location_id'],\n", + " dtype={'user': int},\n", + " parse_dates=['check_in_time']\n", + ")\n", + "\n", + "# Filter out likely invalid coordinates: (0, 0) or missing values\n", + "checkins_df = checkins_df[\n", + " checkins_df['latitude'].notna() & \n", + " checkins_df['longitude'].notna() & \n", + " ((checkins_df['latitude'] != 0) | (checkins_df['longitude'] != 0))\n", + "]\n", + "\n", + "print(f'Loaded {len(checkins_df):,} check-ins')\n", + "checkins_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filtered edges: 428,156 -> 388,180\n", + 
"Users in network: 58,228\n", + "Users with valid check-ins: 50,686\n", + "Users in filtered network: 50,111\n" + ] + } + ], + "source": [ + "# Filter edges to only include users with valid check-ins\n", + "valid_users = set(checkins_df['user'].unique())\n", + "edges_df_filtered = edges_df[\n", + " edges_df['user1'].isin(valid_users) & \n", + " edges_df['user2'].isin(valid_users)\n", + "]\n", + "\n", + "print(f'Filtered edges: {len(edges_df):,} -> {len(edges_df_filtered):,}')\n", + "print(f'Users in network: {pd.concat([edges_df[\"user1\"], edges_df[\"user2\"]]).nunique():,}')\n", + "print(f'Users with valid check-ins: {len(valid_users):,}')\n", + "print(f'Users in filtered network: {pd.concat([edges_df_filtered[\"user1\"], edges_df_filtered[\"user2\"]]).nunique():,}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize Friendship Network with Graphistry\n", + "\n", + "This visualization shows the social network of Brightkite users. Each node represents a user, positioned at their first check-in location. Edges represent friendships between users.\n", + "\n", + "**What to explore:**\n", + "- Community clusters: Groups of highly connected friends\n", + "- Geographic patterns: Whether friend groups cluster geographically\n", + "- Network hubs: Users with many connections (high degree)\n", + "- Network structure: Identify isolated groups vs. 
the main component" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Visualize friendship network (filtered to users with valid check-ins)\n", + "# Use only first check-in per user for node positioning\n", + "\n", + "g = graphistry.edges(edges_df_filtered, 'user1', 'user2').nodes(checkins_df.groupby('user').first().reset_index(), 'user') \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Hypergraph: Users + Check-ins\n", + "\n", + "This hypergraph combines two types of nodes: **user nodes** (blue, at average location) and **check-in nodes** (red, at actual check-in locations). 
Two types of edges connect them: **friendships** (blue) between users, and **user-to-check-in** edges (red) linking users to their check-ins.\n", + "\n", + "**What to explore:**\n", + "- Mobility patterns: Check-in scatter around user's average location reveals travel behavior\n", + "- Social-spatial correlation: Do friends visit similar locations?\n", + "- Activity levels: Number of red edges from a user shows check-in frequency (capped at 6 per user by the sampling step, so heavier users all show 6)\n", + "- Geographic hotspots: Dense red node clusters indicate popular locations\n", + "- User movement range: Distance between user node and their check-ins shows mobility" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Max check-ins per user: 6\n", + "Original check-ins: 4,491,144\n", + "Sampled check-ins: 231,714\n", + "Users with check-ins: 50,686\n", + "User nodes: 50,686\n", + "Check-in nodes: 231,714\n", + "Friendship edges: 388,180\n", + "User->check-in edges: 231,714\n", + "Total nodes: 282,400\n", + "Total edges: 619,894\n" + ] + } + ], + "source": [ + "# Sample check-ins using per-user cap for fair representation\n", + "# Users with ≤6 check-ins: keep all\n", + "# Users with >6 check-ins: randomly sample 6\n", + "\n", + "max_per_user = 6 # Maximum check-ins per user\n", + "\n", + "checkins_sampled = checkins_df.groupby('user', group_keys=False)[checkins_df.columns].apply(\n", + " lambda x: x if len(x) <= max_per_user else x.sample(n=max_per_user, random_state=42)\n", + ")\n", + "\n", + "print(f'Max check-ins per user: {max_per_user}')\n", + "print(f'Original check-ins: {len(checkins_df):,}')\n", + "print(f'Sampled check-ins: {len(checkins_sampled):,}')\n", + "print(f'Users with check-ins: {checkins_sampled[\"user\"].nunique():,}')\n", + "\n", + "# Create aggregated user nodes with average coordinates (using SAMPLED check-ins for consistency)\n", + "user_nodes = checkins_sampled.groupby('user').agg({\n", + "
'latitude': 'mean',\n", + " 'longitude': 'mean',\n", + " 'check_in_time': 'count'\n", + "}).reset_index()\n", + "user_nodes.columns = ['user', 'avg_latitude', 'avg_longitude', 'checkin_count']\n", + "user_nodes['type'] = 'user'\n", + "user_nodes['node_id'] = 'user_' + user_nodes['user'].astype(str)\n", + "\n", + "# Create check-in nodes from SAMPLED data\n", + "checkin_nodes = checkins_sampled.copy()\n", + "checkin_nodes['type'] = 'checkin'\n", + "checkin_nodes['node_id'] = 'checkin_' + checkin_nodes.index.astype(str)\n", + "\n", + "# Create user->check-in edges\n", + "user_checkin_edges = pd.DataFrame({\n", + " 'source': 'user_' + checkin_nodes['user'].astype(str),\n", + " 'destination': checkin_nodes['node_id'],\n", + " 'type': 'user_to_checkin',\n", + " 'user': checkin_nodes['user'].astype(str)\n", + "})\n", + "\n", + "# Create friendship edges between user nodes\n", + "friendship_edges = pd.DataFrame({\n", + " 'source': 'user_' + edges_df_filtered['user1'].astype(str),\n", + " 'destination': 'user_' + edges_df_filtered['user2'].astype(str),\n", + " 'type': 'friendship'\n", + "})\n", + "\n", + "# Combine all edges\n", + "all_edges = pd.concat([friendship_edges, user_checkin_edges], ignore_index=True)\n", + "\n", + "# Combine all nodes\n", + "all_nodes = pd.concat([\n", + " user_nodes[['node_id', 'user', 'avg_latitude', 'avg_longitude', 'type', 'checkin_count']].rename(\n", + " columns={'avg_latitude': 'latitude', 'avg_longitude': 'longitude'}\n", + " ),\n", + " checkin_nodes[['node_id', 'user', 'latitude', 'longitude', 'type', 'check_in_time', 'location_id']]\n", + "], ignore_index=True)\n", + "\n", + "print(f'User nodes: {len(user_nodes):,}')\n", + "print(f'Check-in nodes: {len(checkin_nodes):,}')\n", + "print(f'Friendship edges: {len(friendship_edges):,}')\n", + "print(f'User->check-in edges: {len(user_checkin_edges):,}')\n", + "print(f'Total nodes: {len(all_nodes):,}')\n", + "print(f'Total edges: {len(all_edges):,}')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create hypergraph visualization\n", + "g_hyper = graphistry.edges(all_edges, 'source', 'destination').nodes(all_nodes, 'node_id') \\\n", + " .encode_point_color(\"type\", as_categorical=True, categorical_mapping={\"checkin\": \"red\", \"user\": \"blue\"}) \\\n", + " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g_hyper.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Choropleth Map Layer\n", + "\n", + "This visualization adds a geographic choropleth layer using Kepler.gl that color-codes countries by the total number of nodes (users + check-ins) within their borders. The choropleth overlays the hypergraph to provide geographic context for network activity.\n", + "\n", + "**What to explore:**\n", + "- Country-level aggregation: Total node count per country shown via color intensity\n", + "- Color gradient interpretation: Darker (black/dark green) = minimal activity, brighter (vibrant green) = high activity\n", + "- Logarithmic binning: Each color step represents order-of-magnitude increases (1, 10, 100, 1K, 5K, 10K, 15K+)\n", + "- Geographic patterns: Compare regional concentration vs. 
global distribution\n", + "- Cross-reference: Match choropleth colors to underlying point clusters on the map\n", + "- Network geography: Identify where users and check-ins are concentrated globally" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding country information to 282,400 nodes with coordinates...\n", + "\n", + "Country distribution:\n", + "country\n", + "US 170507\n", + "JP 18165\n", + "GB 17155\n", + "AU 8031\n", + "CA 7654\n", + "DE 7360\n", + "SE 4748\n", + "NL 4442\n", + "IT 3453\n", + "FR 3157\n", + "NO 3144\n", + "ES 2818\n", + "FI 1941\n", + "CN 1866\n", + "BE 1434\n", + "CL 1322\n", + "IN 1313\n", + "BR 1307\n", + "PT 1270\n", + "CH 1207\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Add country information using reverse_geocoder (fast, offline)\n", + "import reverse_geocoder as rg\n", + "\n", + "# Filter nodes with valid coordinates\n", + "nodes_with_coords = all_nodes[all_nodes['latitude'].notna() & all_nodes['longitude'].notna()].copy()\n", + "\n", + "print(f'Adding country information to {len(nodes_with_coords):,} nodes with coordinates...')\n", + "\n", + "# Prepare coordinates for batch reverse geocoding\n", + "coords = list(zip(nodes_with_coords['latitude'], nodes_with_coords['longitude']))\n", + "\n", + "# Batch reverse geocode (much faster than individual requests)\n", + "results = rg.search(coords)\n", + "\n", + "# Extract country codes\n", + "nodes_with_coords['country'] = [result['cc'] for result in results]\n", + "\n", + "# Merge back to all_nodes\n", + "all_nodes = all_nodes.drop(columns=['country'], errors='ignore')\n", + "all_nodes = all_nodes.merge(\n", + " nodes_with_coords[['node_id', 'country']], \n", + " on='node_id', \n", + " how='left'\n", + ")\n", + "\n", + "print('\\nCountry distribution:')\n", + "print(all_nodes['country'].value_counts().head(20))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from graphistry.kepler import KeplerDataset, KeplerLayer, KeplerEncoding\n", + "\n", + "# Create visualization with countries colored by activity\n", + "kepler_ps_encoding = (\n", + " KeplerEncoding()\n", + "\n", + " # Nodes dataset\n", + " .with_dataset(\n", + " KeplerDataset(\n", + " id=\"nodes\",\n", + " type=\"nodes\",\n", + " label=\"Nodes\"\n", + " )\n", + " )\n", + "\n", + " # Edges dataset with mapped coordinates\n", + " .with_dataset(\n", + " KeplerDataset(\n", + " id=\"edges\",\n", + " type=\"edges\",\n", + " label=\"Edges\"\n", + " )\n", + " )\n", + "\n", + " # Countries dataset\n", + " .with_dataset(\n", + " KeplerDataset(\n", + " id=\"countries\",\n", + " type=\"countries\",\n", + " label=\"Nodes in Countries\",\n", + " resolution=110,\n", + " boundary_lakes=False,\n", + " computed_columns={\n", + " \"nodes_in_countries\": {\n", + " \"type\": \"aggregate\",\n", + " \"computeFromDataset\": \"nodes\",\n", + " \"sourceKey\": \"country\",\n", + " \"targetKey\": \"iso_a2_eh\",\n", + " \"aggregate\": \"count\",\n", + " \"aggregateCol\": \"node_id\",\n", + " \"bins\": [0, 1, 10, 100, 1000, 5000, 10000, 15000, 9999999],\n", + " \"right\": False,\n", + " \"includeLowest\": True\n", + " }\n", + " }\n", + " )\n", + " )\n", + "\n", + " # Countries geojson layer with color encoding\n", + " .with_layer(\n", + " KeplerLayer({\n", + " \"id\": \"countries-ps-layer\",\n", + " \"type\": \"geojson\",\n", + " \"config\": {\n", + " \"dataId\": \"countries\",\n", + " \"label\": \"Countries by Num Users\",\n", + " \"columns\": {\n", + " \"geojson\": \"_geometry\"\n", + " },\n", + " \"isVisible\": True,\n", + " \"visConfig\": {\n", + " \"opacity\": 0.7,\n", + " \"strokeOpacity\": 0.8,\n", + " \"thickness\": 
0.5,\n", + " \"strokeColor\": [60, 60, 60],\n", + " \"colorRange\": {\n", + " \"name\": \"Custom Gradient\",\n", + " \"type\": \"sequential\",\n", + " \"category\": \"Custom\",\n", + " \"colors\": [\n", + " \"#000000\", # Black for lowest bin (0 nodes)\n", + " \"#001a0a\", # Very dark green (1-9)\n", + " \"#003314\", # Dark green (10-99)\n", + " \"#004d1f\", # Green (100-999)\n", + " \"#00802d\", # Dark lime green (1,000-4,999)\n", + " \"#00b340\", # Medium green (5,000-9,999)\n", + " \"#00e65c\", # Bright green (10,000-14,999)\n", + " \"#1aff8c\" # Vibrant green for highest bin (15,000+)\n", + " ]\n", + " },\n", + " \"filled\": True,\n", + " \"outline\": True,\n", + " \"extruded\": False,\n", + " \"wireframe\": False\n", + " }\n", + " },\n", + " \"visualChannels\": {\n", + " \"colorField\": {\n", + " \"name\": \"nodes_in_countries\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"colorScale\": \"ordinal\",\n", + " \"sizeField\": None,\n", + " \"sizeScale\": \"linear\"\n", + " }\n", + " })\n", + " )\n", + ")\n", + "\n", + "# Create hypergraph visualization\n", + "g_hyper = graphistry.edges(all_edges, 'source', 'destination').nodes(all_nodes, 'node_id') \\\n", + " .encode_point_color(\"type\", as_categorical=True, categorical_mapping={\"checkin\": \"red\", \"user\": \"blue\"}) \\\n", + " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", + " .encode_kepler(kepler_ps_encoding) \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g_hyper.plot()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rapids-24.08", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, +
"nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/source/api/kepler/dataset.rst b/docs/source/api/kepler/dataset.rst index 817b7d5eda..246078ef7b 100644 --- a/docs/source/api/kepler/dataset.rst +++ b/docs/source/api/kepler/dataset.rst @@ -82,6 +82,7 @@ Example See Also -------- +- :doc:`preloaded_datasets`: Complete column documentation for preloaded geographic datasets - :ref:`kepler-dataset-format`: Native Kepler.gl dataset format reference - :ref:`kepler-layer-api`: Layer configuration - :ref:`kepler-encoding-api`: Complete Kepler configuration diff --git a/docs/source/api/kepler/preloaded_datasets.rst b/docs/source/api/kepler/preloaded_datasets.rst new file mode 100644 index 0000000000..285de5fc7b --- /dev/null +++ b/docs/source/api/kepler/preloaded_datasets.rst @@ -0,0 +1,806 @@ +.. _kepler_preloaded_datasets: + +Kepler.gl Preloaded Datasets +================================= + +PyGraphistry provides preloaded Natural Earth geographic datasets for use with Kepler.gl visualizations. +These datasets include administrative boundaries at different levels with comprehensive attribute data. + +Admin Region Hierarchy +---------------------- + +The Natural Earth data is organized into administrative levels: + +* **0th Order (Countries)**: National boundaries - ``countries`` or ``zeroOrderAdminRegions`` +* **1st Order (States/Provinces)**: Sub-national divisions - ``states``, ``provinces``, or ``firstOrderAdminRegions`` + +Countries Dataset (0th Order Admin Regions) +-------------------------------------------- + +The countries dataset contains 168 columns of data for each country. All column names are lowercase. + +Example Usage +^^^^^^^^^^^^^ + +.. 
code-block:: python + + from graphistry import KeplerDataset + + # Create a countries dataset + countries_ds = KeplerDataset( + type="countries", + resolution=10, # 10=high, 50=medium, 110=low + include_countries=["United States of America", "Canada", "Mexico"] + ) + + # Get list of available columns + columns = KeplerDataset.get_available_columns('countries') + +Complete Column List with Example Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All 168 columns with example values from United States (displayed in groups for clarity): + +**Geographic and Administrative Columns** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``featurecla`` + - "Admin-0 country" + * - ``scalerank`` + - 1 + * - ``labelrank`` + - 2 + * - ``sovereignt`` + - "United States of America" + * - ``sov_a3`` + - "US1" + * - ``adm0_dif`` + - 1 + * - ``level`` + - 2 + * - ``type`` + - "Country" + * - ``tlc`` + - 1 + * - ``admin`` + - "United States of America" + * - ``adm0_a3`` + - "USA" + * - ``geou_dif`` + - 0 + * - ``geounit`` + - "United States of America" + * - ``gu_a3`` + - "USA" + * - ``su_dif`` + - 0 + * - ``subunit`` + - "United States" + * - ``su_a3`` + - "USA" + * - ``brk_diff`` + - 0 + +**Names and Identifiers** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name`` + - "United States of America" + * - ``name_long`` + - "United States" + * - ``brk_a3`` + - "USA" + * - ``brk_name`` + - "United States" + * - ``brk_group`` + - "" + * - ``abbrev`` + - "U.S.A." + * - ``postal`` + - "US" + * - ``formal_en`` + - "United States of America" + * - ``formal_fr`` + - "" + * - ``name_ciawf`` + - "United States" + * - ``note_adm0`` + - "" + * - ``note_brk`` + - "" + * - ``name_sort`` + - "United States of America" + * - ``name_alt`` + - "" + +**Map Display Properties** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``mapcolor7`` + - 4 + * - ``mapcolor8`` + - 5 + * - ``mapcolor9`` + - 1 + * - ``mapcolor13`` + - 1 + * - ``min_zoom`` + - 0.0 + * - ``min_label`` + - 1.7 + * - ``max_label`` + - 5.7 + * - ``label_x`` + - -97.482602 + * - ``label_y`` + - 39.538479 + * - ``latitude`` + - 42.31380089200132 + * - ``longitude`` + - -105.33907490650022 + +**Demographics and Economics** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``pop_est`` + - 328239523.0 + * - ``pop_rank`` + - 17 + * - ``pop_year`` + - 2019 + * - ``gdp_md`` + - 21433226 + * - ``gdp_year`` + - 2019 + * - ``economy`` + - "1. Developed region: G7" + * - ``income_grp`` + - "1. High income: OECD" + +**ISO and International Codes** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fips_10`` + - "US" + * - ``iso_a2`` + - "US" + * - ``iso_a2_eh`` + - "US" + * - ``iso_a3`` + - "USA" + * - ``iso_a3_eh`` + - "USA" + * - ``iso_n3`` + - "840" + * - ``iso_n3_eh`` + - "840" + * - ``un_a3`` + - "840" + * - ``wb_a2`` + - "US" + * - ``wb_a3`` + - "USA" + * - ``woe_id`` + - 23424977 + * - ``woe_id_eh`` + - 23424977 + * - ``woe_note`` + - "Exact WOE match as country" + * - ``adm0_iso`` + - "USA" + * - ``adm0_diff`` + - "" + * - ``adm0_tlc`` + - "USA" + +**Regional Classifications** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``continent`` + - "North America" + * - ``region_un`` + - "Americas" + * - ``subregion`` + - "Northern America" + * - ``region_wb`` + - "North America" + +**Country-Specific Admin Codes** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``adm0_a3_us`` + - "USA" + * - ``adm0_a3_fr`` + - "USA" + * - ``adm0_a3_ru`` + - "USA" + * - ``adm0_a3_es`` + - "USA" + * - ``adm0_a3_cn`` + - "USA" + * - ``adm0_a3_tw`` + - "USA" + * - ``adm0_a3_in`` + - "USA" + * - ``adm0_a3_np`` + - "USA" + * - ``adm0_a3_pk`` + - "USA" + * - ``adm0_a3_de`` + - "USA" + * - ``adm0_a3_gb`` + - "USA" + * - ``adm0_a3_br`` + - "USA" + * - ``adm0_a3_il`` + - "USA" + * - ``adm0_a3_ps`` + - "USA" + * - ``adm0_a3_sa`` + - "USA" + * - ``adm0_a3_eg`` + - "USA" + * - ``adm0_a3_ma`` + - "USA" + * - ``adm0_a3_pt`` + - "USA" + * - ``adm0_a3_ar`` + - "USA" + * - ``adm0_a3_jp`` + - "USA" + * - ``adm0_a3_ko`` + - "USA" + * - ``adm0_a3_vn`` + - "USA" + * - ``adm0_a3_tr`` + - "USA" + * - ``adm0_a3_id`` + - "USA" + * - ``adm0_a3_pl`` + - "USA" + * - ``adm0_a3_gr`` + - "USA" + * - ``adm0_a3_it`` + - "USA" + * - ``adm0_a3_nl`` + - "USA" + * - ``adm0_a3_se`` + - "USA" + * - ``adm0_a3_bd`` + - "USA" + * - ``adm0_a3_ua`` + - "USA" + * - ``adm0_a3_un`` + - -99 + * - ``adm0_a3_wb`` + - -99 + +**Metadata Fields** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name_len`` + - 24 + * - ``long_len`` + - 13 + * - ``abbrev_len`` + - 6 + * - ``tiny`` + - -99 + * - ``homepart`` + - 1 + * - ``ne_id`` + - 1159321369 + * - ``wikidataid`` + - "Q30" + +**Multilingual Names** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name_ar`` + - "الولايات المتحدة" + * - ``name_bn`` + - "মার্কিন যুক্তরাষ্ট্র" + * - ``name_de`` + - "Vereinigte Staaten" + * - ``name_en`` + - "United States of America" + * - ``name_es`` + - "Estados Unidos" + * - ``name_fa`` + - "ایالات متحده آمریکا" + * - ``name_fr`` + - "États-Unis" + * - ``name_el`` + - "Ηνωμένες Πολιτείες Αμερικής" + * - ``name_he`` + - "ארצות הברית" + * - ``name_hi`` + - "संयुक्त राज्य अमेरिका" + * - ``name_hu`` + - "Amerikai Egyesült Államok" + * - ``name_id`` + - "Amerika Serikat" + * - ``name_it`` + - "Stati Uniti d'America" + * - ``name_ja`` + - "アメリカ合衆国" + * - ``name_ko`` + - "미국" + * - ``name_nl`` + - "Verenigde Staten van Amerika" + * - ``name_pl`` + - "Stany Zjednoczone" + * - ``name_pt`` + - "Estados Unidos" + * - ``name_ru`` + - "США" + * - ``name_sv`` + - "USA" + * - ``name_tr`` + - "Amerika Birleşik Devletleri" + * - ``name_uk`` + - "Сполучені Штати Америки" + * - ``name_ur`` + - "ریاستہائے متحدہ امریکا" + * - ``name_vi`` + - "Hoa Kỳ" + * - ``name_zh`` + - "美国" + * - ``name_zht`` + - "美國" + +**Feature Classification Fields** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fclass_iso`` + - "Admin-0 country" + * - ``tlc_diff`` + - "" + * - ``fclass_tlc`` + - "Admin-0 country" + * - ``fclass_us`` + - "" + * - ``fclass_fr`` + - "" + * - ``fclass_ru`` + - "" + * - ``fclass_es`` + - "" + * - ``fclass_cn`` + - "" + * - ``fclass_tw`` + - "" + * - ``fclass_in`` + - "" + * - ``fclass_np`` + - "" + * - ``fclass_pk`` + - "" + * - ``fclass_de`` + - "" + * - ``fclass_gb`` + - "" + * - ``fclass_br`` + - "" + * - ``fclass_il`` + - "" + * - ``fclass_ps`` + - "" + * - ``fclass_sa`` + - "" + * - ``fclass_eg`` + - "" + * - ``fclass_ma`` + - "" + * - ``fclass_pt`` + - "" + * - ``fclass_ar`` + - "" + * - ``fclass_jp`` + - "" + * - ``fclass_ko`` + - "" + * - ``fclass_vn`` + - "" + * - ``fclass_tr`` + - "" + * - ``fclass_id`` + - "" + * - ``fclass_pl`` + - "" + * - ``fclass_gr`` + - "" + * - ``fclass_it`` + - "" + * - ``fclass_nl`` + - "" + * - ``fclass_se`` + - "" + * - ``fclass_bd`` + - "" + * - ``fclass_ua`` + - "" + +**Geometry** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``_geometry`` + - MULTIPOLYGON(...) + +States/Provinces Dataset (1st Order Admin Regions) +--------------------------------------------------- + +The states/provinces dataset contains administrative subdivisions for countries worldwide. + +Example Usage +^^^^^^^^^^^^^ + +.. code-block:: python + + from graphistry import KeplerDataset + + # Create a states dataset for US states + states_ds = KeplerDataset( + type="states", + include_countries=["United States of America"], + include_1st_order_regions=["California", "Texas", "New York"] + ) + +Complete Column List with Example Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All 115 columns with example values from California: + +**Geographic and Administrative Columns** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``featurecla`` + - "Admin-1 states provinces" + * - ``scalerank`` + - 2 + * - ``adm1_code`` + - "USA-3521" + * - ``diss_me`` + - 3521 + * - ``iso_3166_2`` + - "US-CA" + * - ``wikipedia`` + - "http://en.wikipedia.org/wiki/California" + * - ``iso_a2`` + - "US" + * - ``adm0_sr`` + - 8 + +**Names and Identifiers** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name`` + - "California" + * - ``name_alt`` + - "CA|Calif." + * - ``name_local`` + - "" + * - ``type`` + - "State" + * - ``type_en`` + - "State" + * - ``code_local`` + - "US06" + * - ``code_hasc`` + - "US.CA" + * - ``note`` + - "" + * - ``hasc_maybe`` + - "" + +**Regional Information** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``region`` + - "West" + * - ``region_cod`` + - "" + * - ``region_sub`` + - "Pacific" + * - ``sub_code`` + - "" + * - ``provnum_ne`` + - 0.0 + * - ``gadm_level`` + - 1 + +**Administrative Details** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``check_me`` + - 20 + * - ``datarank`` + - 1 + * - ``abbrev`` + - "Calif." + * - ``postal`` + - "CA" + * - ``area_sqkm`` + - 0.0 + * - ``sameascity`` + - -99 + * - ``labelrank`` + - 0 + * - ``name_len`` + - 10 + +**Map Display Properties** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``mapcolor9`` + - 1 + * - ``mapcolor13`` + - 1 + * - ``min_label`` + - 3.5 + * - ``max_label`` + - 7.5 + * - ``min_zoom`` + - 2.0 + +**External References** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fips`` + - "US06" + * - ``fips_alt`` + - "" + * - ``woe_id`` + - 2347563.0 + * - ``woe_label`` + - "California, US, United States" + * - ``woe_name`` + - "California" + * - ``wikidataid`` + - "Q99" + * - ``ne_id`` + - 1159308415 + +**Geographic Coordinates** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``latitude`` + - 37.1259483770762 + * - ``longitude`` + - -119.44202946142391 + +**Parent Country Information** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``sov_a3`` + - "US1" + * - ``adm0_a3`` + - "USA" + * - ``adm0_label`` + - 2 + * - ``admin`` + - "United States of America" + * - ``geonunit`` + - "United States of America" + * - ``gu_a3`` + - "USA" + +**GeoNames Integration** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``gn_id`` + - 5332921.0 + * - ``gn_name`` + - "California" + * - ``gns_id`` + - -1.0 + * - ``gns_name`` + - "" + * - ``gn_level`` + - 1.0 + * - ``gn_region`` + - "" + * - ``gn_a1_code`` + - "US.CA" + * - ``gns_level`` + - -1.0 + * - ``gns_lang`` + - "" + * - ``gns_adm1`` + - "" + * - ``gns_region`` + - "" + +**Multilingual Names** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name_ar`` + - "كاليفورنيا" + * - ``name_bn`` + - "ক্যালিফোর্নিয়া" + * - ``name_de`` + - "Kalifornien" + * - ``name_en`` + - "California" + * - ``name_es`` + - "California" + * - ``name_fr`` + - "Californie" + * - ``name_el`` + - "Καλιφόρνια" + * - ``name_hi`` + - "कैलिफ़ोर्निया" + * - ``name_hu`` + - "Kalifornia" + * - ``name_id`` + - "California" + * - ``name_it`` + - "California" + * - ``name_ja`` + - "カリフォルニア州" + * - ``name_ko`` + - "캘리포니아" + * - ``name_nl`` + - "Californië" + * - ``name_pl`` + - "Kalifornia" + * - ``name_pt`` + - "Califórnia" + * - ``name_ru`` + - "Калифорния" + * - ``name_sv`` + - "Kalifornien" + * - ``name_tr`` + - "Kaliforniya" + * - ``name_vi`` + - "California" + * - ``name_zh`` + - "加利福尼亚州" + * - ``name_he`` + - "קליפורניה" + * - ``name_uk`` + - "Каліфорнія" + * - ``name_ur`` + - "کیلی فورنیا" + * - ``name_fa`` + - "کالیفرنیا" + * - ``name_zht`` + - "加利福尼亞州" + +**Feature Classification Fields** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fclass_iso`` to ``fclass_tlc`` + - "" (empty for all) + +**Geometry** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``_geometry`` + - MULTIPOLYGON(...) + +Data Source +----------- + +These datasets are derived from Natural Earth (https://www.naturalearthdata.com/), a public domain map dataset available at 1:10m, 1:50m, and 1:110m scales. The data is updated periodically to reflect political and demographic changes. 
+ +See Also +-------- + +* :doc:`/api/plotter` - Main plotting interface with Kepler support +* Natural Earth documentation: https://www.naturalearthdata.com/ \ No newline at end of file diff --git a/docs/source/notebooks/gfql.rst b/docs/source/notebooks/gfql.rst index 99b18c033f..8e5f16378c 100644 --- a/docs/source/notebooks/gfql.rst +++ b/docs/source/notebooks/gfql.rst @@ -12,4 +12,4 @@ GFQL Graph queries GPU Benchmarking <../demos/gfql/benchmark_hops_cpu_gpu.ipynb> GFQL Remote mode <../demos/gfql/gfql_remote.ipynb> Python Remote mode <../demos/gfql/python_remote.ipynb> - # ICIJ FinCEN Files Visualization <../demos/demos_by_user_case/icij_fincen_viz.ipynb> + ICIJ FinCEN Files Visualization <../demos/demos_by_use_case/icij_fincen_viz.ipynb>