diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 40902c36f..c0567f642 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -23,6 +23,7 @@ sdk/python/foundation-models/cohere/command_faiss_langchain.ipynb @stewart-co @k
 sdk/python/foundation-models/cohere/command_tools-langchain.ipynb @stewart-co @kseniia-cohere
 /sdk/python/foundation-models/nixtla/ @AzulGarza
 /sdk/python/foundation-models/healthcare-ai/ @jmerkow @ivantarapov
+/sdk/python/foundation-models/rf3-modelforge/ @pabhatia-ms
 /sdk/python/assets/data/versioning.ipynb @ShakutaiGit
 /sdk/python/jobs/finetuning @amltres @sasum @marici
 /sdk/python/jobs/grpo @sharvin2187 @rtanase @gpenumetsa-msft @yeshsurya @babu-namburi
diff --git a/sdk/python/foundation-models/rf3-modelforge/webrequest-rf3.ipynb b/sdk/python/foundation-models/rf3-modelforge/webrequest-rf3.ipynb
new file mode 100644
index 000000000..e1160ca42
--- /dev/null
+++ b/sdk/python/foundation-models/rf3-modelforge/webrequest-rf3.ipynb
@@ -0,0 +1,478 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a6320f26",
+   "metadata": {},
+   "source": [
+    "# Extract Files from JSON\n",
+    "\n",
+    "This notebook processes a JSON file containing base64-encoded data and extracts the contents as individual files in a specified output directory. It handles .gz compressed files (such as .cif.gz), CSV files, and any other file types found in the response.\n",
+    "\n",
+    "## Overview\n",
+    "The JSON structure is expected to have an \"outputs\" section with file names as keys and base64-encoded file contents as values."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9bd9270a",
+   "metadata": {},
+   "source": [
+    "## 1. Import Required Libraries\n",
+    "\n",
+    "Import the libraries needed for JSON parsing, base64 decoding, and file operations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "562ffbd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import base64\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78bfecf4",
+   "metadata": {},
+   "source": [
+    "## 2. Configuration\n",
+    "\n",
+    "Set up file paths and configuration parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "36ffcbad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "JSON file path: out.temp\n",
+      "Output directory: extracted_files\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Configuration parameters\n",
+    "JSON_FILE_PATH = \"out.temp\"  # Update this path to your JSON file\n",
+    "OUTPUT_DIRECTORY = \"extracted_files\"  # Output directory for extracted files\n",
+    "\n",
+    "print(f\"JSON file path: {JSON_FILE_PATH}\")\n",
+    "print(f\"Output directory: {OUTPUT_DIRECTORY}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96c2a60c",
+   "metadata": {},
+   "source": [
+    "## 3. Load JSON Data\n",
+    "\n",
+    "Load and parse the JSON file containing the base64-encoded data.\n",
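+    "\n",
+    "For reference, the response is expected to have roughly the following shape. The values here are illustrative placeholders; only the key names (`request_id`, `status`, `outputs`) and the file names are taken from the output shown below.\n",
+    "\n",
+    "```json\n",
+    "{\n",
+    "  \"request_id\": \"req_...\",\n",
+    "  \"status\": \"success\",\n",
+    "  \"outputs\": {\n",
+    "    \"pb-msa_model_0.cif.gz\": \"<base64-encoded bytes>\",\n",
+    "    \"pb-msa_metrics.csv\": \"<base64-encoded bytes>\"\n",
+    "  }\n",
+    "}\n",
+    "```"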
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fbfc1dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully loaded JSON file: out.temp\n",
+      "Found 'outputs' section with 7 items\n",
+      "\n",
+      "JSON structure:\n",
+      "Request ID: req_6b3e11172ce141b6924b2dddc5ef8efe\n",
+      "Status: success\n",
+      "Output files found: ['pb-msa_model_0.cif.gz', 'pb-msa_model_3.cif.gz', 'pb-msa_model_4.cif.gz', 'pb-msa_model_2.cif.gz', 'pb-msa_model_1.cif.gz', 'pb-msa.score', 'pb-msa_metrics.csv']\n"
+     ]
+    }
+   ],
+   "source": [
+    "def load_json_data(json_file_path):\n",
+    "    \"\"\"\n",
+    "    Load JSON data from a file and return the parsed content.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        with open(json_file_path, 'r') as file:\n",
+    "            data = json.load(file)\n",
+    "        print(f\"Successfully loaded JSON file: {json_file_path}\")\n",
+    "\n",
+    "        # Check whether the JSON has the expected structure\n",
+    "        if 'outputs' in data:\n",
+    "            print(f\"Found 'outputs' section with {len(data['outputs'])} items\")\n",
+    "        else:\n",
+    "            print(\"Warning: 'outputs' section not found in JSON\")\n",
+    "        return data\n",
+    "\n",
+    "    except FileNotFoundError:\n",
+    "        print(f\"Error: File {json_file_path} not found\")\n",
+    "        return None\n",
+    "    except json.JSONDecodeError as e:\n",
+    "        print(f\"Error: Invalid JSON format in {json_file_path}: {e}\")\n",
+    "        return None\n",
+    "\n",
+    "# Load the JSON data\n",
+    "json_data = load_json_data(JSON_FILE_PATH)\n",
+    "\n",
+    "if json_data:\n",
+    "    print(\"\\nJSON structure:\")\n",
+    "    print(f\"Request ID: {json_data.get('request_id', 'Not found')}\")\n",
+    "    print(f\"Status: {json_data.get('status', 'Not found')}\")\n",
+    "    if 'outputs' in json_data:\n",
+    "        print(f\"Output files found: {list(json_data['outputs'].keys())}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b010695",
+   "metadata": {},
+   "source": [
+    "## 4. Setup Output Directory\n",
+    "\n",
+    "Create the output directory if it doesn't exist."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ff0937ba",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Output directory ready: /home/mekshirs/azurefiles/UW_IPD/extracted_files\n"
+     ]
+    }
+   ],
+   "source": [
+    "def setup_output_directory(output_dir):\n",
+    "    \"\"\"\n",
+    "    Create the output directory if it doesn't exist.\n",
+    "    \"\"\"\n",
+    "    output_path = Path(output_dir)\n",
+    "    output_path.mkdir(parents=True, exist_ok=True)\n",
+    "    print(f\"Output directory ready: {output_path.absolute()}\")\n",
+    "    return output_path\n",
+    "\n",
+    "# Set up the output directory\n",
+    "output_path = setup_output_directory(OUTPUT_DIRECTORY)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "245ddcc6",
+   "metadata": {},
+   "source": [
+    "## 5. File Extraction Functions\n",
+    "\n",
+    "Define functions to handle base64 decoding and file saving for different file types.\n",
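+    "\n",
+    "One caveat: the file names come straight from the JSON response and are joined onto the output path unchanged, so a hostile name such as `../escape.txt` could write outside the output directory. A minimal guard, shown as a sketch (the `safe_name` helper is illustrative, not part of this notebook):\n",
+    "\n",
+    "```python\n",
+    "from pathlib import Path\n",
+    "\n",
+    "def safe_name(filename):\n",
+    "    # Keep only the final path component, discarding any directory parts\n",
+    "    return Path(filename).name\n",
+    "\n",
+    "assert safe_name(\"../../etc/passwd\") == \"passwd\"\n",
+    "```"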
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d96f300d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def decode_and_save_gz_file(filename, base64_data, output_path):\n",
+    "    \"\"\"\n",
+    "    Decode base64 data and save it as a .gz file.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Decode the base64 payload\n",
+    "        decoded_data = base64.b64decode(base64_data)\n",
+    "\n",
+    "        # Build the full file path\n",
+    "        file_path = output_path / filename\n",
+    "\n",
+    "        # Write the decoded bytes directly; the payload is already gzip-compressed\n",
+    "        with open(file_path, 'wb') as f:\n",
+    "            f.write(decoded_data)\n",
+    "\n",
+    "        print(f\"✓ Saved .gz file: {filename} ({len(decoded_data)} bytes)\")\n",
+    "        return True\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(f\"✗ Error saving {filename}: {e}\")\n",
+    "        return False\n",
+    "\n",
+    "def decode_and_save_csv_file(filename, base64_data, output_path):\n",
+    "    \"\"\"\n",
+    "    Decode base64 data and save it as a CSV file.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Decode the base64 payload\n",
+    "        decoded_data = base64.b64decode(base64_data)\n",
+    "\n",
+    "        # Build the full file path\n",
+    "        file_path = output_path / filename\n",
+    "\n",
+    "        # Write the decoded bytes as-is; binary mode avoids newline translation\n",
+    "        with open(file_path, 'wb') as f:\n",
+    "            f.write(decoded_data)\n",
+    "\n",
+    "        print(f\"✓ Saved CSV file: {filename} ({len(decoded_data)} bytes)\")\n",
+    "        return True\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(f\"✗ Error saving {filename}: {e}\")\n",
+    "        return False\n",
+    "\n",
+    "def decode_and_save_generic_file(filename, base64_data, output_path):\n",
+    "    \"\"\"\n",
+    "    Decode base64 data and save it as a generic file.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        # Decode the base64 payload\n",
+    "        decoded_data = base64.b64decode(base64_data)\n",
+    "\n",
+    "        # Build the full file path\n",
+    "        file_path = output_path / filename\n",
+    "\n",
+    "        # Write the decoded bytes\n",
+    "        with open(file_path, 'wb') as f:\n",
+    "            f.write(decoded_data)\n",
+    "\n",
+    "        print(f\"✓ Saved file: {filename} ({len(decoded_data)} bytes)\")\n",
+    "        return True\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(f\"✗ Error saving {filename}: {e}\")\n",
+    "        return False"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb0fc671",
+   "metadata": {},
+   "source": [
+    "## 6. Process and Extract Files\n",
+    "\n",
+    "Extract all files from the JSON data, dispatching on each file's extension.\n",
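+    "\n",
+    "Since the three save functions differ only in their log label, the `if/elif` chain below could also be written as a suffix-to-handler table. A sketch of that alternative (`HANDLERS` and `pick_handler` are illustrative names, not used elsewhere in this notebook):\n",
+    "\n",
+    "```python\n",
+    "# Table-driven dispatch: first matching suffix wins, with a generic fallback\n",
+    "HANDLERS = [\n",
+    "    ('.gz', decode_and_save_gz_file),\n",
+    "    ('.csv', decode_and_save_csv_file),\n",
+    "]\n",
+    "\n",
+    "def pick_handler(filename):\n",
+    "    for suffix, handler in HANDLERS:\n",
+    "        if filename.endswith(suffix):\n",
+    "            return handler\n",
+    "    return decode_and_save_generic_file\n",
+    "```"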
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ed646eb0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Processing 7 files...\n",
+      "--------------------------------------------------\n",
+      "✓ Saved .gz file: pb-msa_model_0.cif.gz (60745 bytes)\n",
+      "✓ Saved .gz file: pb-msa_model_3.cif.gz (60828 bytes)\n",
+      "✓ Saved .gz file: pb-msa_model_4.cif.gz (61190 bytes)\n",
+      "✓ Saved .gz file: pb-msa_model_2.cif.gz (60931 bytes)\n",
+      "✓ Saved .gz file: pb-msa_model_1.cif.gz (61087 bytes)\n",
+      "✓ Saved file: pb-msa.score (2299 bytes)\n",
+      "✓ Saved CSV file: pb-msa_metrics.csv (904 bytes)\n",
+      "--------------------------------------------------\n",
+      "Extraction Summary:\n",
+      "  .gz files saved: 5\n",
+      "  .csv files saved: 1\n",
+      "  Other files saved: 1\n",
+      "  Total successful: 7/7\n"
+     ]
+    }
+   ],
+   "source": [
+    "def process_files(json_data, output_path):\n",
+    "    \"\"\"\n",
+    "    Process all files in the JSON data and save them to the output directory.\n",
+    "    \"\"\"\n",
+    "    if not json_data or 'outputs' not in json_data:\n",
+    "        print(\"No outputs found in JSON data\")\n",
+    "        return\n",
+    "\n",
+    "    outputs = json_data['outputs']\n",
+    "\n",
+    "    # Counters for the different file types\n",
+    "    gz_files = 0\n",
+    "    csv_files = 0\n",
+    "    other_files = 0\n",
+    "    successful_saves = 0\n",
+    "\n",
+    "    print(f\"\\nProcessing {len(outputs)} files...\")\n",
+    "    print(\"-\" * 50)\n",
+    "\n",
+    "    for filename, base64_data in outputs.items():\n",
+    "        # Check the file extension and process accordingly\n",
+    "        if filename.endswith('.gz'):\n",
+    "            # Handle .gz files (including .cif.gz)\n",
+    "            if decode_and_save_gz_file(filename, base64_data, output_path):\n",
+    "                gz_files += 1\n",
+    "                successful_saves += 1\n",
+    "\n",
+    "        elif filename.endswith('.csv'):\n",
+    "            # Handle CSV files\n",
+    "            if decode_and_save_csv_file(filename, base64_data, output_path):\n",
+    "                csv_files += 1\n",
+    "                successful_saves += 1\n",
+    "\n",
+    "        else:\n",
+    "            # Handle all other file types\n",
+    "            if decode_and_save_generic_file(filename, base64_data, output_path):\n",
+    "                other_files += 1\n",
+    "                successful_saves += 1\n",
+    "\n",
+    "    # Print a summary\n",
+    "    print(\"-\" * 50)\n",
+    "    print(\"Extraction Summary:\")\n",
+    "    print(f\"  .gz files saved: {gz_files}\")\n",
+    "    print(f\"  .csv files saved: {csv_files}\")\n",
+    "    print(f\"  Other files saved: {other_files}\")\n",
+    "    print(f\"  Total successful: {successful_saves}/{len(outputs)}\")\n",
+    "\n",
+    "# Process the files if the JSON data is available\n",
+    "if json_data:\n",
+    "    process_files(json_data, output_path)\n",
+    "else:\n",
+    "    print(\"Cannot process files - JSON data not loaded\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b909e7ac",
+   "metadata": {},
+   "source": [
+    "## 7. Verify Extracted Files\n",
+    "\n",
+    "Check the output directory and verify that the files were extracted correctly.\n",
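+    "\n",
+    "As an optional spot check, one of the extracted `.cif.gz` files can be decompressed to confirm the payload is valid gzip. A minimal sketch, assuming the extraction above succeeded and the listed file exists:\n",
+    "\n",
+    "```python\n",
+    "import gzip\n",
+    "\n",
+    "# Print the first few lines of the decompressed CIF file as text\n",
+    "with gzip.open(output_path / \"pb-msa_model_0.cif.gz\", \"rt\") as f:\n",
+    "    for _ in range(3):\n",
+    "        print(f.readline().rstrip())\n",
+    "```"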
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "3289bc5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Files in output directory (extracted_files):\n",
+      "------------------------------------------------------------\n",
+      "\n",
+      "  Compressed (.gz) files:\n",
+      "    pb-msa_model_0.cif.gz                        60,745 bytes\n",
+      "    pb-msa_model_1.cif.gz                        61,087 bytes\n",
+      "    pb-msa_model_2.cif.gz                        60,931 bytes\n",
+      "    pb-msa_model_3.cif.gz                        60,828 bytes\n",
+      "    pb-msa_model_4.cif.gz                        61,190 bytes\n",
+      "\n",
+      "  CSV files:\n",
+      "    pb-msa_metrics.csv                              904 bytes\n",
+      "\n",
+      "  Other files:\n",
+      "    pb-msa.score                                  2,299 bytes\n",
+      "\n",
+      "Total files extracted: 7\n"
+     ]
+    }
+   ],
+   "source": [
+    "def verify_extracted_files(output_path):\n",
+    "    \"\"\"\n",
+    "    Verify the extracted files in the output directory.\n",
+    "    \"\"\"\n",
+    "    if not output_path.exists():\n",
+    "        print(f\"Output directory {output_path} does not exist\")\n",
+    "        return\n",
+    "\n",
+    "    files = list(output_path.iterdir())\n",
+    "\n",
+    "    if not files:\n",
+    "        print(f\"No files found in {output_path}\")\n",
+    "        return\n",
+    "\n",
+    "    print(f\"\\nFiles in output directory ({output_path}):\")\n",
+    "    print(\"-\" * 60)\n",
+    "\n",
+    "    # Group files by extension for better organization\n",
+    "    gz_files = [f for f in files if f.name.endswith('.gz')]\n",
+    "    csv_files = [f for f in files if f.name.endswith('.csv')]\n",
+    "    other_files = [f for f in files if not f.name.endswith(('.gz', '.csv'))]\n",
+    "\n",
+    "    # Display .gz files\n",
+    "    if gz_files:\n",
+    "        print(\"\\n  Compressed (.gz) files:\")\n",
+    "        for file in sorted(gz_files):\n",
+    "            size = file.stat().st_size\n",
+    "            print(f\"    {file.name:<40} {size:>10,} bytes\")\n",
+    "\n",
+    "    # Display CSV files\n",
+    "    if csv_files:\n",
+    "        print(\"\\n  CSV files:\")\n",
+    "        for file in sorted(csv_files):\n",
+    "            size = file.stat().st_size\n",
+    "            print(f\"    {file.name:<40} {size:>10,} bytes\")\n",
+    "\n",
+    "    # Display other files\n",
+    "    if other_files:\n",
+    "        print(\"\\n  Other files:\")\n",
+    "        for file in sorted(other_files):\n",
+    "            size = file.stat().st_size\n",
+    "            print(f\"    {file.name:<40} {size:>10,} bytes\")\n",
+    "\n",
+    "    print(f\"\\nTotal files extracted: {len(files)}\")\n",
+    "\n",
+    "# Verify the extracted files\n",
+    "verify_extracted_files(output_path)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "trace",
+   "language": "python",
+   "name": "trace"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.14.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}