|
5 | 5 | import logging |
6 | 6 | from typing import TYPE_CHECKING, Any, Dict, Optional |
7 | 7 |
|
| 8 | +from amp.utils.manifest_inspector import describe_manifest, print_schema |
| 9 | + |
8 | 10 | from . import models |
9 | 11 |
|
10 | 12 | if TYPE_CHECKING: |
@@ -197,6 +199,72 @@ def get_manifest(self, namespace: str, name: str, version: str) -> dict: |
197 | 199 | response = self._registry._request('GET', path) |
198 | 200 | return response.json() |
199 | 201 |
|
def describe(self, namespace: str, name: str, version: str = 'latest') -> Dict[str, list[Dict[str, str | bool]]]:
    """Return a table-by-table column summary for one dataset version.

    The manifest is fetched with ``get_manifest`` and handed unchanged to
    ``describe_manifest``; whatever that helper returns is returned as-is.

    Args:
        namespace: Dataset namespace
        name: Dataset name
        version: Version tag (default: 'latest')

    Returns:
        dict: Table names mapped to lists of column descriptions. Each
            column description is a dict with the keys:
            - name: Column name (str)
            - type: Arrow type (str, simplified representation)
            - nullable: Whether the column allows NULL values (bool)

    Example:
        >>> client = RegistryClient()
        >>> schema = client.datasets.describe('edgeandnode', 'ethereum-mainnet', 'latest')
        >>> for table_name, columns in schema.items():
        ...     print(f"\\nTable: {table_name}")
        ...     for col in columns:
        ...         nullable = "NULL" if col['nullable'] else "NOT NULL"
        ...         print(f"  {col['name']}: {col['type']} {nullable}")
    """
    # Single expression: fetch the manifest, then summarise it.
    return describe_manifest(self.get_manifest(namespace, name, version))
| 230 | + |
def inspect(self, namespace: str, name: str, version: str = 'latest') -> None:
    """Print a human-readable overview of a dataset's tables and columns.

    Intended for interactive exploration. A header naming the dataset is
    built (with the dataset description on a second line when one is set),
    and the schema from ``describe`` is passed with that header to
    ``print_schema``, which does the actual rendering.

    Args:
        namespace: Dataset namespace
        name: Dataset name
        version: Version tag (default: 'latest')

    Example:
        >>> client = RegistryClient()
        >>> client.datasets.inspect('graphops', 'ethereum-mainnet')
        Dataset: graphops/ethereum-mainnet@latest
        <BLANKLINE>
        blocks (4 columns)
          block_num    UInt64               NOT NULL
          timestamp    Timestamp            NOT NULL
          ...
    """
    # Dataset metadata is looked up first; only its description is used here.
    info = self.get(namespace, name)

    # Collect header lines and join them once at the end.
    header_lines = [f'Dataset: {namespace}/{name}@{version}']
    if info.description:
        header_lines.append(f'Description: {info.description}')

    # Fetch the column summary, then delegate formatting and output.
    tables = self.describe(namespace, name, version)
    print_schema(tables, header='\n'.join(header_lines))
| 267 | + |
200 | 268 | # Write Operations (Require Authentication) |
201 | 269 |
|
202 | 270 | def publish( |
|
0 commit comments