diff --git a/docs/user-guide/draft-validation.qmd b/docs/user-guide/draft-validation.qmd index fccbeffa2..2f8e1b7d3 100644 --- a/docs/user-guide/draft-validation.qmd +++ b/docs/user-guide/draft-validation.qmd @@ -66,7 +66,7 @@ data = pb.load_dataset(dataset="global_sales", tbl_type="polars") # Generate a validation plan pb.DraftValidation( data=data, - model="anthropic:claude-3-7-sonnet-latest", + model="anthropic:claude-sonnet-4-5", api_key="your_api_key_here" # Replace with your actual API key ) ``` @@ -159,7 +159,7 @@ api_key = os.getenv("ANTHROPIC_API_KEY") draft_validation = pb.DraftValidation( data=data, - model="anthropic:claude-3-7-sonnet-latest", + model="anthropic:claude-sonnet-4-5", api_key=api_key ) ``` @@ -179,7 +179,7 @@ If your API keys have standard names (like `ANTHROPIC_API_KEY` or `OPENAI_API_KE # No API key needed if stored in .env with standard names draft_validation = pb.DraftValidation( data=data, - model="anthropic:claude-3-7-sonnet-latest" + model="anthropic:claude-sonnet-4-5" ) ``` @@ -191,7 +191,7 @@ Here's an example of a validation plan that might be generated by `DraftValidati ```python pb.DraftValidation( pb.load_dataset(dataset="nycflights", tbl_type="duckdb", - model="anthropic:claude-3-7-sonnet-latest" + model="anthropic:claude-sonnet-4-5" ) ``` @@ -269,7 +269,7 @@ When using `DraftValidation`, you specify the model in the format `"provider:mod ```python # Using Anthropic's Claude model -pb.DraftValidation(data=data, model="anthropic:claude-3-7-sonnet-latest") +pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") # Using OpenAI's GPT model pb.DraftValidation(data=data, model="openai:gpt-4-turbo") @@ -285,7 +285,7 @@ pb.DraftValidation(data=data, model="bedrock:anthropic.claude-3-sonnet-20240229- Different models have different capabilities when it comes to generating validation plans: -- Anthropic Claude 3.7 Sonnet generally provides the most comprehensive and accurate validation +- Anthropic Claude Sonnet 4.5 generally provides the most comprehensive and accurate validation plans - OpenAI GPT-4 models also perform well - Local models through Ollama can be useful for private data but they currently have reduced diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 1ceff1f55..09d9d8333 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -1895,6 +1895,7 @@ def interrogate_prompt(tbl: FrameT, columns_subset: list[str] | None, ai_config: provider=llm_provider, model=llm_model, api_key=None, # Will be loaded from environment variables + verify_ssl=True, # Default to verifying SSL certificates ) # Set up batch configuration diff --git a/pointblank/_utils_ai.py b/pointblank/_utils_ai.py index 05a22e240..a94c87939 100644 --- a/pointblank/_utils_ai.py +++ b/pointblank/_utils_ai.py @@ -28,17 +28,22 @@ class _LLMConfig: provider LLM provider name (e.g., 'anthropic', 'openai', 'ollama', 'bedrock'). model - Model name (e.g., 'claude-3-sonnet-20240229', 'gpt-4'). + Model name (e.g., 'claude-sonnet-4-5', 'gpt-4'). api_key API key for the provider. If None, will be read from environment. + verify_ssl + Whether to verify SSL certificates when making requests. Defaults to True. """ provider: str model: str api_key: Optional[str] = None + verify_ssl: bool = True -def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] = None): +def _create_chat_instance( + provider: str, model_name: str, api_key: Optional[str] = None, verify_ssl: bool = True +): """ Create a chatlas chat instance for the specified provider. @@ -50,6 +55,8 @@ def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] The model name for the provider. api_key Optional API key. If None, will be read from environment. + verify_ssl + Whether to verify SSL certificates when making requests. Defaults to True. Returns ------- @@ -89,6 +96,17 @@ def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] {"index": 2, "result": true} ]""" + # Create httpx client with SSL verification settings + try: + import httpx # noqa + except ImportError: # pragma: no cover + raise ImportError( # pragma: no cover + "The `httpx` package is required for SSL configuration. " + "Please install it using `pip install httpx`." + ) + + http_client = httpx.AsyncClient(verify=verify_ssl) + # Create provider-specific chat instance if provider == "anthropic": # pragma: no cover # Check that the anthropic package is installed @@ -106,6 +124,7 @@ def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] model=model_name, api_key=api_key, system_prompt=system_prompt, + kwargs={"http_client": http_client}, ) elif provider == "openai": # pragma: no cover @@ -124,6 +143,7 @@ def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] model=model_name, api_key=api_key, system_prompt=system_prompt, + kwargs={"http_client": http_client}, ) elif provider == "ollama": # pragma: no cover @@ -141,6 +161,7 @@ def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] chat = ChatOllama( model=model_name, system_prompt=system_prompt, + kwargs={"http_client": http_client}, ) elif provider == "bedrock": # pragma: no cover @@ -149,6 +170,7 @@ def _create_chat_instance(provider: str, model_name: str, api_key: Optional[str] chat = ChatBedrockAnthropic( model=model_name, system_prompt=system_prompt, + kwargs={"http_client": http_client}, ) else: @@ -722,7 +744,10 @@ def __init__(self, llm_config: _LLMConfig): """ self.llm_config = llm_config self.chat = _create_chat_instance( - provider=llm_config.provider, model_name=llm_config.model, api_key=llm_config.api_key + provider=llm_config.provider, + model_name=llm_config.model, + api_key=llm_config.api_key, + verify_ssl=llm_config.verify_ssl, ) def validate_batches( diff --git a/pointblank/assistant.py b/pointblank/assistant.py index 21a8eee85..52eae95f0 100644 --- a/pointblank/assistant.py +++ b/pointblank/assistant.py @@ -55,7 +55,7 @@ def assistant( ---------- model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. data An optional data table to focus on during discussion with the PbA, which could be a diff --git a/pointblank/data/api-docs.txt b/pointblank/data/api-docs.txt index d3f3e5e70..fb57594e7 100644 --- a/pointblank/data/api-docs.txt +++ b/pointblank/data/api-docs.txt @@ -1157,7 +1157,7 @@ Definition of a schema object. `Schema` object is used in a validation workflow. -DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None) -> None +DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None Draft a validation plan for a given table using an LLM. @@ -1180,10 +1180,15 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None The data to be used for drafting a validation plan. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. api_key The API key to be used for the model. + verify_ssl + Whether to verify SSL certificates when making requests to the LLM provider. Set to `False` + to disable SSL verification (e.g., when behind a corporate firewall with self-signed + certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose + security risks. Returns ------- @@ -1225,6 +1230,33 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None There's no need to have the `python-dotenv` package installed when using `.env` files in this way. + Notes on SSL Certificate Verification + -------------------------------------- + By default, SSL certificate verification is enabled for all requests to LLM providers. However, + in certain network environments (such as corporate networks with self-signed certificates or + firewall proxies), you may encounter SSL certificate verification errors. + + To disable SSL verification, set the `verify_ssl` parameter to `False`: + + ```python + import pointblank as pb + + data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") + + # Disable SSL verification for networks with self-signed certificates + pb.DraftValidation( + data=data, + model="anthropic:claude-sonnet-4-5", + verify_ssl=False + ) + ``` + + :::{.callout-warning} + Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to + man-in-the-middle attacks. Only use this option in trusted network environments and when + absolutely necessary. + ::: + Notes on Data Sent to the Model Provider ---------------------------------------- The data sent to the model provider is a JSON summary of the table. This data summary is @@ -1251,7 +1283,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None Let's look at how the `DraftValidation` class can be used to draft a validation plan for a table. The table to be used is `"nycflights"`, which is available here via the [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is - `"anthropic:claude-3-5-sonnet-latest"` (which performs very well compared to other LLMs). The + `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`. ```python @@ -1261,7 +1293,7 @@ DraftValidation(data: 'FrameT | Any', model: 'str', api_key: 'str | None' = None data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") # Draft a validation plan for the "nycflights" table - pb.DraftValidation(data=data, model="anthropic:claude-3-5-sonnet-latest") + pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") ``` The output will be a drafted validation plan for the `"nycflights"` table and this will appear @@ -5853,10 +5885,10 @@ prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | Non so try to include only the columns necessary for the validation. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, - `"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to - be used from the provider. Model names are subject to change so consult the provider's - documentation for the most up-to-date model names. + `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from + the provider. Model names are subject to change so consult the provider's documentation + for the most up-to-date model names. batch_size Number of rows to process in each batch. Larger batches are more efficient but may hit API limits. Default is `1000`. @@ -9927,7 +9959,7 @@ assistant(model: 'str', data: 'FrameT | Any | None' = None, tbl_name: 'str | Non ---------- model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. data An optional data table to focus on during discussion with the PbA, which could be a diff --git a/pointblank/draft.py b/pointblank/draft.py index c69b37ab8..8531befda 100644 --- a/pointblank/draft.py +++ b/pointblank/draft.py @@ -38,10 +38,15 @@ class DraftValidation: The data to be used for drafting a validation plan. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. api_key The API key to be used for the model. + verify_ssl + Whether to verify SSL certificates when making requests to the LLM provider. Set to `False` + to disable SSL verification (e.g., when behind a corporate firewall with self-signed + certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose + security risks. Returns ------- @@ -83,6 +88,33 @@ class DraftValidation: There's no need to have the `python-dotenv` package installed when using `.env` files in this way. + Notes on SSL Certificate Verification + -------------------------------------- + By default, SSL certificate verification is enabled for all requests to LLM providers. However, + in certain network environments (such as corporate networks with self-signed certificates or + firewall proxies), you may encounter SSL certificate verification errors. + + To disable SSL verification, set the `verify_ssl` parameter to `False`: + + ```python + import pointblank as pb + + data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") + + # Disable SSL verification for networks with self-signed certificates + pb.DraftValidation( + data=data, + model="anthropic:claude-sonnet-4-5", + verify_ssl=False + ) + ``` + + :::{.callout-warning} + Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to + man-in-the-middle attacks. Only use this option in trusted network environments and when + absolutely necessary. + ::: + Notes on Data Sent to the Model Provider ---------------------------------------- The data sent to the model provider is a JSON summary of the table. This data summary is @@ -109,7 +141,7 @@ class DraftValidation: Let's look at how the `DraftValidation` class can be used to draft a validation plan for a table. The table to be used is `"nycflights"`, which is available here via the [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is - `"anthropic:claude-3-5-sonnet-latest"` (which performs very well compared to other LLMs). The + `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`. ```python @@ -119,7 +151,7 @@ class DraftValidation: data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") # Draft a validation plan for the "nycflights" table - pb.DraftValidation(data=data, model="anthropic:claude-3-5-sonnet-latest") + pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") ``` The output will be a drafted validation plan for the `"nycflights"` table and this will appear @@ -194,6 +226,7 @@ class DraftValidation: data: FrameT | Any model: str api_key: str | None = None + verify_ssl: bool = True response: str = field(init=False) def __post_init__(self): @@ -280,6 +313,18 @@ def __post_init__(self): " per line)" ) + # Create httpx client with SSL verification settings + # This will be passed to the LLM provider's chat client + try: + import httpx # noqa + except ImportError: # pragma: no cover + raise ImportError( # pragma: no cover + "The `httpx` package is required for SSL configuration. " + "Please install it using `pip install httpx`." + ) + + http_client = httpx.AsyncClient(verify=self.verify_ssl) + if provider == "anthropic": # pragma: no cover # Check that the anthropic package is installed try: @@ -296,6 +341,7 @@ def __post_init__(self): model=model_name, system_prompt="You are a terse assistant and a Python expert.", api_key=self.api_key, + kwargs={"http_client": http_client}, ) if provider == "openai": # pragma: no cover @@ -314,6 +360,7 @@ def __post_init__(self): model=model_name, system_prompt="You are a terse assistant and a Python expert.", api_key=self.api_key, + kwargs={"http_client": http_client}, ) if provider == "ollama": # pragma: no cover @@ -331,6 +378,7 @@ def __post_init__(self): chat = ChatOllama( # pragma: no cover model=model_name, system_prompt="You are a terse assistant and a Python expert.", + kwargs={"http_client": http_client}, ) if provider == "bedrock": # pragma: no cover @@ -339,6 +387,7 @@ def __post_init__(self): chat = ChatBedrockAnthropic( # pragma: no cover model=model_name, system_prompt="You are a terse assistant and a Python expert.", + kwargs={"http_client": http_client}, ) self.response = str(chat.chat(prompt, stream=False, echo="none")) # pragma: no cover diff --git a/pointblank/validate.py b/pointblank/validate.py index 8843e73e3..5e0343803 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -9396,10 +9396,10 @@ def prompt( so try to include only the columns necessary for the validation. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, - `"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to - be used from the provider. Model names are subject to change so consult the provider's - documentation for the most up-to-date model names. + `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from + the provider. Model names are subject to change so consult the provider's documentation + for the most up-to-date model names. batch_size Number of rows to process in each batch. Larger batches are more efficient but may hit API limits. Default is `1000`. diff --git a/pointblank_mcp_server/pointblank_server.py b/pointblank_mcp_server/pointblank_server.py index 1bda07b7e..5be708726 100644 --- a/pointblank_mcp_server/pointblank_server.py +++ b/pointblank_mcp_server/pointblank_server.py @@ -2429,8 +2429,8 @@ async def draft_validation_plan( dataframe_id: Annotated[str, "ID of the DataFrame to generate validation plan for."], model: Annotated[ str, - "AI model to use in format 'provider:model' (e.g., 'anthropic:claude-3-5-sonnet-latest', 'openai:gpt-4'). Supported providers: anthropic, openai, ollama, bedrock.", - ] = "anthropic:claude-3-5-sonnet-latest", + "AI model to use in format 'provider:model' (e.g., 'anthropic:claude-sonnet-4-5', 'openai:gpt-4'). Supported providers: anthropic, openai, ollama, bedrock.", + ] = "anthropic:claude-sonnet-4-5", api_key: Annotated[ Optional[str], "API key for the model provider. If not provided, will try to load from environment variables or .env file.", diff --git a/pyproject.toml b/pyproject.toml index 5370945ad..3b2850fa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ generate = [ "chatlas>=0.3.0", "anthropic[bedrock]>=0.45.2", "openai>=1.63.0", + "httpx>=0.28.0", "shiny>=1.3.0", ] mcp = [ diff --git a/tests/test__utils_ai.py b/tests/test__utils_ai.py index 81dcc7b21..c394529cc 100644 --- a/tests/test__utils_ai.py +++ b/tests/test__utils_ai.py @@ -50,7 +50,7 @@ def sample_pl_data(): @pytest.fixture def llm_config(): """Sample LLM configuration.""" - return _LLMConfig(provider="anthropic", model="claude-3-sonnet-20240229", api_key="test-key") + return _LLMConfig(provider="anthropic", model="claude-sonnet-4-5", api_key="test-key") @pytest.fixture @@ -84,10 +84,10 @@ def test_llm_config_creation(): def test_llm_config_with_api_key(): """Test LLMConfig with API key.""" - config = _LLMConfig(provider="anthropic", model="claude-3", api_key="test-key") + config = _LLMConfig(provider="anthropic", model="claude-sonnet-4-5", api_key="test-key") assert config.provider == "anthropic" - assert config.model == "claude-3" + assert config.model == "claude-sonnet-4-5" assert config.api_key == "test-key" @@ -529,7 +529,10 @@ def test_ai_validation_engine_init(mock_create_chat, llm_config): assert engine.llm_config is llm_config assert engine.chat is mock_chat mock_create_chat.assert_called_once_with( - provider="anthropic", model_name="claude-3-sonnet-20240229", api_key="test-key" + provider="anthropic", + model_name="claude-sonnet-4-5", + api_key="test-key", + verify_ssl=True, ) @@ -709,7 +712,10 @@ def mock_chat_response(prompt, **kwargs): mock_create_chat.return_value = mock_chat # Setup components - llm_config = _LLMConfig(provider="anthropic", model="claude-3") + llm_config = _LLMConfig( + provider="anthropic", + model="claude-sonnet-4-5", + ) batch_config = _BatchConfig(size=2, max_concurrent=1) # Create batcher and batches