diff --git a/notebook_cost_estimator/cost_estimator.ipynb b/notebook_cost_estimator/cost_estimator.ipynb new file mode 100644 index 0000000..19a78da --- /dev/null +++ b/notebook_cost_estimator/cost_estimator.ipynb @@ -0,0 +1,584 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9701fa84", + "metadata": {}, + "source": [ + "# Verily Workbench Notebook Cost Estimator\n", + "\n", + "This tool helps you estimate the approximate GCP cost of running a specified JupyterLab notebook on a standard Verily Workbench JupyterLab app. \n", + "\n", + "Refer to the [Verily Workbench Cloud Apps documentation](https://support.workbench.verily.com/docs/guides/cloud_apps/apps_intro/) for details on default app configurations and pricing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15450aaa", + "metadata": {}, + "outputs": [], + "source": [ + "# Import Required Libraries\n", + "import os\n", + "import math\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "from IPython.display import display, Markdown" + ] + }, + { + "cell_type": "markdown", + "id": "ea71a703", + "metadata": {}, + "source": [ + "## User Input: Notebook and Data Specifications\n", + "\n", + "Please provide the following information to estimate your cost:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a20058dd", + "metadata": {}, + "outputs": [], + "source": [ + "# User Input: Notebook and Data Specifications\n", + "from ipywidgets import widgets\n", + "\n", + "notebook_file = widgets.Text(\n", + "    value='',\n", + "    placeholder='Enter notebook filename (e.g., analysis.ipynb)',\n", + "    description='Notebook:',\n", + "    disabled=False\n", + ")\n", + "runtime_hours = widgets.FloatText(\n", + "    value=1.0,\n", + "    description='Runtime (hrs):',\n", + "    disabled=False\n", + ")\n", + "\n", + "# Workspace resource inputs\n", + "resource_type = widgets.Dropdown(\n", + "    options=[\n", + "        ('BigQuery Dataset', 'bq_dataset'),\n", + "        ('BigQuery Table', 'bq_table'), \n", + "        ('Cloud Storage Bucket', 'gcs_bucket'),\n", + "        ('Cloud Storage Object/File', 'gcs_object'),\n", + "        ('Mixed Resources', 'mixed')\n", + "    ],\n", + "    value='gcs_bucket',\n", + "    description='Resource Type:',\n", + "    disabled=False\n", + ")\n", + "\n", + "data_size_gb = widgets.FloatText(\n", + "    value=1.0,\n", + "    description='Data Size (GB):',\n", + "    disabled=False\n", + ")\n", + "output_size_gb = widgets.FloatText(\n", + "    value=0.5,\n", + "    description='Output Size (GB):',\n", + "    disabled=False\n", + ")\n", + "\n", + "# BigQuery-specific inputs\n", + "bq_queries = widgets.IntText(\n", + "    value=1,\n", + "    description='# BQ Queries:',\n", + "    disabled=False\n", + ")\n", + "bq_data_processed_gb = widgets.FloatText(\n", + "    value=1.0,\n", + "    description='BQ Data Processed (GB):',\n", + "    disabled=False\n", + ")\n", + "\n", + "special_resources = widgets.Text(\n", + "    value='',\n", + "    placeholder='e.g., GPU, highmem',\n", + "    description='Special Resources:',\n", + "    disabled=False\n", + ")\n", + "\n", + "ui = widgets.VBox([\n", + "    notebook_file, \n", + "    runtime_hours, \n", + "    resource_type,\n", + "    data_size_gb, \n", + "    output_size_gb,\n", + "    bq_queries,\n", + "    bq_data_processed_gb,\n", + "    special_resources\n", + "])\n", + "display(ui)" + ] + }, + { + "cell_type": "markdown", + "id": "980e6f0a", + "metadata": {}, + "source": [ + "## Estimate Compute Resource 
Usage\n", + "\n", + "This section estimates the compute resources required based on your input and the default JupyterLab app specs from Verily Workbench documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5cf6acd", + "metadata": {}, + "outputs": [], + "source": [ + "# Estimate Compute Resource Usage\n", + "# Default JupyterLab app specs (as of Nov 2025, see Verily docs):\n", + "# n1-standard-4 (4 vCPU, 15 GB RAM), $0.158/hr (us-central1)\n", + "default_machine_type = 'n1-standard-4'\n", + "default_vcpu = 4\n", + "default_ram_gb = 15\n", + "compute_price_per_hour = 0.158  # USD/hr (update if pricing changes)\n", + "\n", + "print(f\"Default machine: {default_machine_type} ({default_vcpu} vCPU, {default_ram_gb} GB RAM)\")\n", + "print(f\"Compute price: ${compute_price_per_hour}/hr (us-central1, Nov 2025)\")" + ] + }, + { + "cell_type": "markdown", + "id": "31d73559", + "metadata": {}, + "source": [ + "## Estimate Storage Usage Based on Workspace Resources\n", + "\n", + "Estimate storage usage based on the data resources in your Verily Workbench workspace. Workbench supports different types of data resources:\n", + "\n", + "- **BigQuery Datasets/Tables**: Query-based pricing\n", + "- **Cloud Storage Buckets/Objects**: Storage + data transfer pricing  \n", + "- **Referenced Resources**: Point to external data (no workspace storage cost)\n", + "- **Controlled Resources**: Managed within your workspace\n", + "\n", + "For details, see the [Workbench data resources documentation](https://support.workbench.verily.com/docs/guides/research_data/resource_intro/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4e9f34", + "metadata": {}, + "outputs": [], + "source": [ + "# Estimate Storage Usage Based on Workspace Resources\n", + "# GCP Pricing (us-central1, Nov 2025 - update as needed)\n", + "storage_price_per_gb_month = 0.04  # Cloud Storage Standard\n", + "bq_query_price_per_tb = 6.25  # BigQuery on-demand pricing per TB processed\n", + "bq_storage_price_per_gb_month = 0.02  # BigQuery storage per GB/month\n", + "\n", + "# Convert monthly to hourly rates\n", + "storage_price_per_gb_hour = storage_price_per_gb_month / (30 * 24)\n", + "bq_storage_price_per_gb_hour = bq_storage_price_per_gb_month / (30 * 24)\n", + "\n", + "def estimate_storage_costs(resource_type, data_gb, output_gb, queries, processed_gb, runtime_hrs):\n", + "    storage_cost = 0\n", + "    query_cost = 0\n", + "    \n", + "    if resource_type in ['gcs_bucket', 'gcs_object', 'mixed']:\n", + "        # Cloud Storage costs for data + output\n", + "        total_storage_gb = data_gb + output_gb\n", + "        storage_cost = storage_price_per_gb_hour * total_storage_gb * runtime_hrs\n", + "        \n", + "    elif resource_type in ['bq_dataset', 'bq_table']:\n", + "        # BigQuery storage (for controlled datasets) + query costs\n", + "        storage_cost = bq_storage_price_per_gb_hour * data_gb * runtime_hrs\n", + "        query_cost = (processed_gb / 1000) * bq_query_price_per_tb * queries  # Convert GB to TB\n", + "        \n", + "    elif resource_type == 'mixed':\n", + "        # Combination of storage and BigQuery\n", + "        gcs_storage = storage_price_per_gb_hour * data_gb * runtime_hrs\n", + "        bq_queries = (processed_gb / 1000) * bq_query_price_per_tb * queries\n", + "        storage_cost = gcs_storage + bq_queries\n", + "    \n", + "    return storage_cost, query_cost\n", + "    \n", + "storage_cost, query_cost = estimate_storage_costs(\n", + "    
resource_type.value,\n", + "    data_size_gb.value,\n", + "    output_size_gb.value, \n", + "    bq_queries.value,\n", + "    bq_data_processed_gb.value,\n", + "    runtime_hours.value\n", + ")\n", + "\n", + "total_data_cost = storage_cost + query_cost\n", + "\n", + "print(f\"Resource type: {resource_type.value}\")\n", + "print(f\"Estimated storage cost: ${storage_cost:.4f}\")\n", + "if query_cost > 0:\n", + "    print(f\"Estimated BigQuery query cost: ${query_cost:.4f}\")\n", + "print(f\"Total data-related cost: ${total_data_cost:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "760fe16c", + "metadata": {}, + "source": [ + "## Calculate Approximate Cost\n", + "\n", + "This section calculates the estimated cost for compute and storage resources based on your inputs and current pricing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b900007f", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate Approximate Cost\n", + "def estimate_total_cost(runtime_hours, storage_cost, query_cost):\n", + "    compute_cost = compute_price_per_hour * runtime_hours\n", + "    total_data_cost = storage_cost + query_cost\n", + "    total_cost = compute_cost + total_data_cost\n", + "    return compute_cost, total_data_cost, total_cost\n", + "\n", + "compute_cost, total_data_cost, total_cost = estimate_total_cost(\n", + "    runtime_hours.value, \n", + "    storage_cost, \n", + "    query_cost\n", + ")\n", + "\n", + "print(f\"Estimated compute cost: ${compute_cost:.2f}\")\n", + "print(f\"Estimated data cost (storage + queries): ${total_data_cost:.4f}\")\n", + "print(f\"Total estimated cost: ${total_cost:.2f}\")\n", + "\n", + "# Cost breakdown by component\n", + "if resource_type.value in ['bq_dataset', 'bq_table'] and query_cost > 0:\n", + "    print(f\"\\nData cost breakdown:\")\n", + "    print(f\"  - Storage: ${storage_cost:.4f}\")\n", + "    print(f\"  - BigQuery queries: ${query_cost:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c3146df", + "metadata": {}, + "source": [ + "## Display Cost Breakdown\n", + "\n", + "Below is a detailed breakdown of your estimated costs for running the specified notebook on Verily Workbench." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bac0444", + "metadata": {}, + "outputs": [], + "source": [ + "# Display Cost Breakdown\n", + "import pandas as pd\n", + "\n", + "# Create cost breakdown table\n", + "cost_components = ['Compute', 'Data (Storage + Queries)', 'Total']\n", + "cost_values = [f\"${compute_cost:.2f}\", f\"${total_data_cost:.4f}\", f\"${total_cost:.2f}\"]\n", + "\n", + "cost_breakdown = pd.DataFrame({\n", + "    'Component': cost_components,\n", + "    'Estimated Cost (USD)': cost_values\n", + "})\n", + "\n", + "display(cost_breakdown)\n", + "\n", + "# Detailed explanation\n", + "resource_explanation = {\n", + "    'gcs_bucket': 'Cloud Storage bucket',\n", + "    'gcs_object': 'Cloud Storage object/file', \n", + "    'bq_dataset': 'BigQuery dataset',\n", + "    'bq_table': 'BigQuery table',\n", + "    'mixed': 'Mixed resources (Cloud Storage + BigQuery)'\n", + "}\n", + "\n", + "explanation = f\"\"\"\n", + "**Cost Estimation Details:**\n", + "\n", + "**Compute:**\n", + "- Default JupyterLab app: {default_machine_type} ({default_vcpu} vCPU, {default_ram_gb} GB RAM)\n", + "- Compute price: ${compute_price_per_hour}/hr (us-central1, Nov 2025)\n", + "- Runtime: {runtime_hours.value} hours\n", + "\n", + "**Data Resources:**\n", + "- Resource type: {resource_explanation.get(resource_type.value, resource_type.value)}\n", + "- Storage price: ${storage_price_per_gb_month}/GB/month (Cloud Storage Standard)\"\"\"\n", + "\n", + "if resource_type.value in ['bq_dataset', 'bq_table']:\n", + "    explanation += f\"\"\"\n", + "- BigQuery storage: ${bq_storage_price_per_gb_month}/GB/month\n", + "- BigQuery queries: ${bq_query_price_per_tb}/TB processed\n", + "- Estimated queries: {bq_queries.value}\n", + "- Data processed per query: {bq_data_processed_gb.value} GB\"\"\"\n", + "\n", + "explanation += f\"\"\"\n", + "\n", + "**Notes:**\n", + "- Costs are estimated for us-central1 region\n", + "- Referenced resources (external data) may have no workspace storage cost\n", + "- Controlled resources are managed within your workspace\n", + "- Actual costs may vary based on region, usage patterns, and discounts\n", + "- See [Workbench data resources](https://support.workbench.verily.com/docs/guides/research_data/resource_intro/) for more details\n", + "\"\"\"\n", + "\n", + "display(Markdown(explanation))" + ] + }, + { + "cell_type": "markdown", + "id": "2df01784", + "metadata": {}, + "source": [ + "## Discover Your Workspace Resources with wb CLI\n", + "\n", + "The Verily Workbench CLI is available in this environment. We'll automatically discover and analyze the data resources in your workspace to provide accurate size estimates and resource types for cost calculation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa4b96a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Discover Your Workspace Resources using Workbench CLI\n", + "import subprocess\n", + "import json\n", + "import pandas as pd\n", + "from IPython.display import display, HTML\n", + "\n", + "def run_wb_command(command):\n", + "    \"\"\"Run a wb CLI command and return the result\"\"\"\n", + "    result = subprocess.run(command, shell=True, capture_output=True, text=True)\n", + "    if result.returncode == 0:\n", + "        return result.stdout.strip()\n", + "    else:\n", + "        raise Exception(f\"Command failed: {command}\\nError: {result.stderr}\")\n", + "\n", + "def list_workspace_resources():\n", + "    \"\"\"List all resources in the current workspace\"\"\"\n", + "    print(\"🔍 Discovering workspace resources...\")\n", + "    \n", + "    # List all resources in JSON format for easier parsing\n", + "    resources_json = run_wb_command(\"wb resource list --format=JSON\")\n", + "    \n", + "    resources = json.loads(resources_json)\n", + "    if not resources:\n", + "        print(\"No resources found in this workspace.\")\n", + "        return pd.DataFrame()\n", + "    \n", + "    # Create a summary table\n", + "    resource_data = []\n", + "    for resource in resources:\n", + "        resource_data.append({\n", + "            'Name': resource.get('name', 'Unknown'),\n", + "            'Type': resource.get('resourceType', 'Unknown'),\n", + "            'Stewardship': resource.get('stewardshipType', 'Unknown'),\n", + "            'Description': resource.get('description', '')[:50] + '...' if len(resource.get('description', '')) > 50 else resource.get('description', ''),\n", + "            'Cloud Resource': resource.get('cloudName', 'Unknown')\n", + "        })\n", + "    \n", + "    df = pd.DataFrame(resource_data)\n", + "    print(f\"\\n✅ Found {len(resources)} resources in your workspace:\")\n", + "    display(df)\n", + "    return df\n", + "    \n", + "def get_resource_details(resource_name):\n", + "    \"\"\"Get detailed information about a specific resource\"\"\"\n", + "    print(f\"📋 Getting details for resource: {resource_name}\")\n", + "    \n", + "    details = run_wb_command(f\"wb resource describe --name={resource_name} --format=JSON\")\n", + "    resource_info = json.loads(details)\n", + "    \n", + "    print(f\"\\n**Resource:** {resource_info.get('name', 'Unknown')}\")\n", + "    print(f\"**Type:** {resource_info.get('resourceType', 'Unknown')}\")\n", + "    print(f\"**Stewardship:** {resource_info.get('stewardshipType', 'Unknown')}\")\n", + "    print(f\"**Cloud Resource:** {resource_info.get('cloudName', 'Unknown')}\")\n", + "    \n", + "    # For GCS buckets, we can try to get size info\n", + "    if resource_info.get('resourceType') == 'GCS_BUCKET':\n", + "        bucket_name = resource_info.get('cloudName', '')\n", + "        if bucket_name:\n", + "            print(f\"**Bucket:** {bucket_name}\")\n", + "            print(f\"💡 Use get_gcs_bucket_size('{bucket_name}') to get actual size\")\n", + "    \n", + "    # For BigQuery datasets, show how to get table info\n", + "    elif resource_info.get('resourceType') == 'BQ_DATASET':\n", + "        dataset_name = resource_info.get('cloudName', '')\n", + "        if dataset_name:\n", + "            print(f\"**Dataset:** {dataset_name}\")\n", + "            print(f\"💡 Use get_bigquery_dataset_size('{dataset_name}') to get actual size\")\n", + "    \n", + "    return resource_info\n", + "  
  \n", + "# Automatically run the resource discovery\n", + "resources_df = list_workspace_resources()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccdf1bc7", + "metadata": {}, + "outputs": [], + "source": [ + "# Get detailed information for specific resources\n", + "print(\"📋 Available resources in your workspace:\")\n", + "if not resources_df.empty:\n", + "    for i, name in enumerate(resources_df['Name'], 1):\n", + "        resource_type = resources_df.iloc[i-1]['Type']\n", + "        stewardship = resources_df.iloc[i-1]['Stewardship']\n", + "        print(f\"   {i}. {name} ({resource_type}, {stewardship})\")\n", + "    \n", + "    print(f\"\\n💡 To get detailed information about a resource, use:\")\n", + "    print(f\"   resource_details = get_resource_details('resource-name')\")\n", + "    print(f\"\\n💡 Example: get_resource_details('{resources_df.iloc[0]['Name']}')\")\n", + "else:\n", + "    print(\"   No resources found in this workspace.\")\n", + "    print(\"   Make sure you're running this in a Verily Workbench JupyterLab environment with resources configured.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed3bd12a", + "metadata": {}, + "outputs": [], + "source": [ + "# Get Cloud Storage bucket sizes and BigQuery dataset information\n", + "def get_gcs_bucket_size(bucket_name):\n", + "    \"\"\"Get the size of a GCS bucket using gsutil via wb CLI\"\"\"\n", + "    print(f\"📏 Getting size for bucket: {bucket_name}\")\n", + "    \n", + "    # Get bucket size in bytes\n", + "    size_output = run_wb_command(f\"wb gsutil du -s gs://{bucket_name}\")\n", + "    \n", + "    # Parse the size (gsutil du returns: size_bytes gs://bucket_name)\n", + "    if size_output:\n", + "        size_bytes = int(size_output.split()[0])\n", + "        size_gb = size_bytes / (1024**3)  # Convert to GB\n", + "        \n", + "        print(f\"Bucket size: {size_gb:.2f} GB ({size_bytes:,} bytes)\")\n", + "        return size_gb\n", + "    else:\n", + "        print(f\"Could not determine bucket size\")\n", + "        return 0\n", + "\n", + "def get_bigquery_dataset_size(project_dataset):\n", + "    \"\"\"Get BigQuery dataset size and table information\"\"\"\n", + "    print(f\"📊 Getting BigQuery dataset info: {project_dataset}\")\n", + "    \n", + "    # Split project.dataset if needed\n", + "    if '.' 
+ "        print(\"⚠️ Dataset name should be in format 'project.dataset' or provide the full dataset reference\")\n",
+ "        return 0\n",
+ "    \n",
+ "    project, dataset = project_dataset.split('.', 1)\n",
+ "    \n",
+ "    # Query table sizes from the dataset's __TABLES__ metadata\n",
+ "    # (INFORMATION_SCHEMA.TABLES does not expose size_bytes or row_count)\n",
+ "    sql_query = f\"\"\"\n",
+ "    SELECT\n",
+ "        table_id AS table_name,\n",
+ "        ROUND(size_bytes/1024/1024/1024, 2) AS size_gb,\n",
+ "        row_count,\n",
+ "        DATE(TIMESTAMP_MILLIS(creation_time)) AS creation_time\n",
+ "    FROM `{project}.{dataset}.__TABLES__`\n",
+ "    WHERE type = 1  -- 1 = table, 2 = view\n",
+ "    ORDER BY size_bytes DESC\n",
+ "    \"\"\"\n",
+ "    \n",
+ "    print(\"Running BigQuery size analysis...\")\n",
+ "    # bq takes the query as a positional argument; single quotes keep the\n",
+ "    # backticks in the SQL from being interpreted by the shell\n",
+ "    query_result = run_wb_command(f\"wb bq query --nouse_legacy_sql --format=json '{sql_query}'\")\n",
+ "    \n",
+ "    tables = json.loads(query_result) if query_result else []\n",
+ "    \n",
+ "    if tables:\n",
+ "        print(f\"\\n📋 Tables in {project_dataset}:\")\n",
+ "        table_data = []\n",
+ "        total_size_gb = 0\n",
+ "        \n",
+ "        for table in tables:\n",
+ "            size_gb = float(table.get('size_gb', 0))\n",
+ "            total_size_gb += size_gb\n",
+ "            table_data.append({\n",
+ "                'Table': table.get('table_name', ''),\n",
+ "                'Size (GB)': size_gb,\n",
+ "                'Rows': table.get('row_count', 0),\n",
+ "                'Created': table.get('creation_time', '')[:10]  # Just date\n",
+ "            })\n",
+ "        \n",
+ "        df = pd.DataFrame(table_data)\n",
+ "        display(df)\n",
+ "        print(f\"\\n**Total dataset size: {total_size_gb:.2f} GB**\")\n",
+ "        return total_size_gb\n",
+ "    else:\n",
+ "        print(\"No tables found in dataset\")\n",
+ "        return 0\n",
+ "\n",
+ "# Auto-detect and analyze resources with actual sizes, then total up the costs\n",
+ "def estimate_costs_from_resources():\n",
+ "    \"\"\"Estimate compute + data costs from the discovered workspace resources.\"\"\"\n",
+ "    total_storage_gb = 0.0\n",
+ "    total_storage_cost = 0.0\n",
+ "    total_query_costs = 0.0\n",
+ "\n",
+ "    print(\"🔍 Analyzing discovered resources for accurate size information...\")\n",
+ "\n",
+ "    for _, resource in resources_df.iterrows():\n",
+ "        resource_name = resource['Name']\n",
+ "        res_type = resource['Type']\n",
+ "        cloud_resource = resource['Cloud Resource']\n",
+ "\n",
+ "        print(f\"\\n🔹 **{resource_name}** ({res_type})\")\n",
+ "\n",
+ "        try:\n",
+ "            if res_type == 'GCS_BUCKET' and cloud_resource != 'Unknown':\n",
+ "                actual_size = get_gcs_bucket_size(cloud_resource)\n",
+ "                total_storage_gb += actual_size\n",
+ "                total_storage_cost += storage_price_per_gb_hour * actual_size * runtime_hours.value\n",
+ "                print(f\"   ✅ Actual size determined: {actual_size:.2f} GB\")\n",
+ "\n",
+ "            elif res_type == 'BQ_DATASET' and cloud_resource != 'Unknown':\n",
+ "                actual_size = get_bigquery_dataset_size(cloud_resource)\n",
+ "                total_storage_gb += actual_size\n",
+ "                total_storage_cost += bq_storage_price_per_gb_hour * actual_size * runtime_hours.value\n",
+ "                # Query cost reuses the per-query inputs from the widgets above\n",
+ "                total_query_costs += (bq_data_processed_gb.value / 1000) * bq_query_price_per_tb * bq_queries.value\n",
+ "                print(f\"   ✅ Actual size determined: {actual_size:.2f} GB\")\n",
+ "\n",
+ "            else:\n",
+ "                print(f\"   ℹ️ Size analysis not available for {res_type}\")\n",
+ "\n",
+ "        except Exception as e:\n",
+ "            print(f\"   ⚠️ Could not determine size: {str(e)}\")\n",
+ "\n",
+ "    # Add compute costs for the requested runtime\n",
+ "    compute_cost = compute_price_per_hour * runtime_hours.value\n",
+ "    total_estimated_cost = compute_cost + total_storage_cost + total_query_costs\n",
+ "\n",
+ "    print(f\"\\n\" + \"=\" * 60)\n",
+ "    print(f\"📋 **TOTAL COST ESTIMATE FROM WORKSPACE RESOURCES**\")\n",
+ "    print(f\"   💻 Compute cost ({runtime_hours.value}h): ${compute_cost:.2f}\")\n",
+ "    print(f\"   💾 Storage cost ({total_storage_gb:.1f}GB): ${total_storage_cost:.4f}\")\n",
+ "    if total_query_costs > 0:\n",
+ "        print(f\"   🔍 Query costs: ${total_query_costs:.4f}\")\n",
+ "    print(f\"   🎯 **TOTAL: ${total_estimated_cost:.2f}**\")\n",
+ "\n",
+ "    return total_estimated_cost\n",
+ "\n",
+ "# Run automated estimation with discovered resources\n",
+ "print(\"\\n🚀 Running automated cost estimation...\")\n",
+ "automated_cost = estimate_costs_from_resources()"
+ ]
+ },
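+ {
+ "cell_type": "markdown",
+ "id": "a1b2c3d4",
+ "metadata": {},
+ "source": [
+ "### Optional: feed a measured size back into the manual estimator\n",
+ "\n",
+ "A minimal sketch showing how a size discovered above can be plugged back into the `estimate_storage_costs` helper from the manual estimator. The resource name in the next cell is a placeholder; substitute one of the names listed by the discovery step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1b2c3d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: re-estimate data costs with a measured bucket size.\n",
+ "# 'my-bucket-resource' is a placeholder; replace it with a GCS_BUCKET resource\n",
+ "# name from the discovery output above before running.\n",
+ "example_resource = 'my-bucket-resource'\n",
+ "\n",
+ "try:\n",
+ "    bucket = get_resource_details(example_resource).get('cloudName', '')\n",
+ "    measured_gb = get_gcs_bucket_size(bucket) if bucket else 0\n",
+ "    est_storage, est_query = estimate_storage_costs(\n",
+ "        'gcs_bucket',\n",
+ "        measured_gb,\n",
+ "        output_size_gb.value,\n",
+ "        bq_queries.value,\n",
+ "        bq_data_processed_gb.value,\n",
+ "        runtime_hours.value\n",
+ "    )\n",
+ "    print(f\"Re-estimated data cost using measured size: ${est_storage + est_query:.4f}\")\n",
+ "except Exception as e:\n",
+ "    print(f\"Skipping example: {e}\")"
+ ]
+ },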
+ {
+ "cell_type": "markdown",
+ "id": "ab2333c2",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}