#!/usr/bin/env python3
"""
cli.py - Command Line Interface for PANGAEA GPT

This provides a command-line interface to the PANGAEA GPT system,
allowing users to search datasets and run analyses without the Streamlit UI.
"""
import os
import sys
import argparse
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any
import pandas as pd
from datetime import datetime

# Add the project root to the Python path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# CRITICAL: Set deployment mode BEFORE any imports that use streamlit.
# This prevents src/config.py from trying to access st.secrets.
os.environ["PANGAEA_CLI_MODE"] = "true"
os.environ["DEPLOYMENT_MODE"] = "cli"  # Override deployment mode for CLI

# Import CLI utilities and set up the mock Streamlit BEFORE any other imports
from src.cli_utils import setup_cli_mode, CLICallbackHandler

# Set up CLI mode - this must happen before importing any modules that use streamlit
mock_st = setup_cli_mode()

# Now we can import the rest of the modules
from src.agents import create_search_agent, create_supervisor_agent
from src.search.search_pg_default import pg_search_default, direct_access_doi
from src.memory import CustomMemorySaver
from main import (
    get_datasets_info_for_active_datasets,
    load_selected_datasets_into_cache,
    set_active_datasets_from_selection,
)

def make_json_serializable(obj):
    """
    Recursively convert any object to a JSON-serializable format.
    Handles nested structures, pandas objects, numpy arrays, etc.
    """
    import pandas as pd
    from datetime import datetime, date

    # Handle None
    if obj is None:
        return None
    # Handle basic JSON-serializable types
    if isinstance(obj, (str, int, float, bool)):
        return obj
    # Handle datetime objects
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    # Handle pandas Series
    if isinstance(obj, pd.Series):
        return obj.to_dict()
    # Handle pandas DataFrames
    if isinstance(obj, pd.DataFrame):
        return obj.to_dict(orient='records')
    # Handle numpy arrays and numpy scalars (via duck typing)
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    if hasattr(obj, 'item'):
        return obj.item()
    # Handle dictionaries recursively
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    # Handle lists and tuples recursively
    if isinstance(obj, (list, tuple)):
        return [make_json_serializable(item) for item in obj]
    # Handle sets (convert members recursively, since sets are not JSON-serializable)
    if isinstance(obj, set):
        return [make_json_serializable(item) for item in obj]
    # For any other object, fall back to its __dict__ if available
    try:
        if hasattr(obj, '__dict__'):
            return make_json_serializable(obj.__dict__)
    except Exception:
        pass
    # Last resort: convert to string
    return str(obj)
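
# Illustrative behavior (a sketch, not an exhaustive spec): nested pandas values
# collapse to plain JSON types, datetimes become ISO-8601 strings, and anything
# unrecognized falls back to str(). For example:
#   make_json_serializable({"s": pd.Series([1, 2]), "t": datetime(2024, 1, 1)})
#   # -> {"s": {0: 1, 1: 2}, "t": "2024-01-01T00:00:00"}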

class CLIInterface:
    """Command Line Interface for PANGAEA GPT"""

    def __init__(self, api_key: str, model_name: str = "gpt-4.1",
                 anthropic_key: Optional[str] = None,
                 langchain_key: Optional[str] = None,
                 langchain_project: Optional[str] = None):
        """Initialize the CLI interface with API keys."""
        # Set up environment variables
        os.environ["OPENAI_API_KEY"] = api_key
        if anthropic_key:
            os.environ["ANTHROPIC_API_KEY"] = anthropic_key
        if langchain_key:
            os.environ["LANGCHAIN_API_KEY"] = langchain_key
            os.environ["LANGCHAIN_TRACING_V2"] = "true"
        if langchain_project:
            os.environ["LANGCHAIN_PROJECT_NAME"] = langchain_project

        # Initialize session state properly using the mock from cli_utils
        import streamlit as st

        # Initialize all required session state keys
        default_state = {
            "messages_search": [],
            "messages_data_agent": [],
            "datasets_cache": {},
            "datasets_info": None,
            "active_datasets": [],
            "selected_datasets": set(),
            "show_dataset": True,
            "current_page": "search",
            "dataset_dfs": {},
            "dataset_names": {},
            "saved_plot_paths": {},
            "memory": CustomMemorySaver(),
            "oceanographer_agent_used": False,
            "ecologist_agent_used": False,
            "visualization_agent_used": False,
            "dataframe_agent_used": False,
            "specialized_agent_used": False,
            "search_method": "PANGAEA Search (default)",
            "selected_text": "",
            "new_plot_generated": False,
            "execution_history": [],
            "model_name": model_name,
            "search_mode": "simple",
            "processing": False,
            "intermediate_steps": [],
            "thread_id": None,
            "viz_datasets_text": "",
            "search_results_cache": {}
        }
        # Update the mock session state
        for key, value in default_state.items():
            st.session_state[key] = value
        # Keep a reference to the session state
        self.session_state = st.session_state

        # Also mock secrets when API keys are provided
        if api_key:
            mock_st.secrets.data["general"]["openai_api_key"] = api_key
        if anthropic_key:
            mock_st.secrets.data["general"]["anthropic_api_key"] = anthropic_key
        if langchain_key:
            mock_st.secrets.data["general"]["langchain_api_key"] = langchain_key
        if langchain_project:
            mock_st.secrets.data["general"]["langchain_project_name"] = langchain_project

        # Set up logging
        self.setup_logging()

        # Create the output directory
        self.output_dir = Path("cli_output") / datetime.now().strftime("%Y%m%d_%H%M%S")
        self.output_dir.mkdir(parents=True, exist_ok=True)
        logging.info(f"CLI Interface initialized. Output directory: {self.output_dir}")
    def setup_logging(self):
        """Set up logging configuration."""
        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        logging.basicConfig(
            level=logging.INFO,
            format=log_format,
            handlers=[
                logging.FileHandler('pangaea_cli.log'),
                logging.StreamHandler(sys.stdout)
            ]
        )
    def search_datasets(self, query: str, count: int = 15,
                        search_mode: str = "simple",
                        mindate: Optional[str] = None,
                        maxdate: Optional[str] = None,
                        minlat: Optional[float] = None,
                        maxlat: Optional[float] = None,
                        minlon: Optional[float] = None,
                        maxlon: Optional[float] = None) -> pd.DataFrame:
        """Search for datasets using the search agent.

        Note: count and the date/bounding-box filters are accepted for CLI
        parity but are not currently forwarded to the search agent.
        """
        print(f"\n🔍 Searching for: '{query}' (mode: {search_mode})")
        self.session_state["search_mode"] = search_mode

        # Create the search agent
        search_agent = create_search_agent(
            datasets_info=self.session_state["datasets_info"],
            search_mode=search_mode
        )

        # Process the search query
        from main import process_search_query
        result = process_search_query(query, search_agent, self.session_state)
        print(f"\n✅ {result}")

        # Get the datasets info
        datasets_info = self.session_state.get("datasets_info")
        if datasets_info is not None and not datasets_info.empty:
            # Save results to CSV
            output_file = self.output_dir / "search_results.csv"
            datasets_info.to_csv(output_file, index=False)
            print(f"\n📄 Search results saved to: {output_file}")

            # Display results
            print("\n📊 Search Results:")
            for idx, row in datasets_info.iterrows():
                print(f"\n{idx + 1}. {row['Name']}")
                print(f"   DOI: {row['DOI']}")
                print(f"   Description: {row['Short Description'][:200]}...")
                print(f"   Parameters: {row['Parameters'][:100]}...")
            return datasets_info
        else:
            print("\n⚠️ No datasets found.")
            return pd.DataFrame()
    def direct_access_datasets(self, dois: List[str]) -> pd.DataFrame:
        """Access datasets directly by DOI."""
        print(f"\n📥 Directly accessing DOIs: {', '.join(dois)}")

        # Join DOIs with commas for the access function
        doi_string = ', '.join(dois)
        datasets_info, prompt_text = direct_access_doi(doi_string)

        if not datasets_info.empty:
            # Save results
            output_file = self.output_dir / "direct_access_results.csv"
            datasets_info.to_csv(output_file, index=False)
            print(f"\n📄 Results saved to: {output_file}")

            # Display results
            print("\n📊 Direct Access Results:")
            for idx, row in datasets_info.iterrows():
                print(f"\n{idx + 1}. {row['Name']}")
                print(f"   DOI: {row['DOI']}")
                print(f"   Parameters: {row['Parameters']}")
            return datasets_info
        else:
            print("\n⚠️ Failed to access datasets.")
            return pd.DataFrame()
    def analyze_datasets(self, dataset_indices: List[int], query: str,
                         datasets_info: Optional[pd.DataFrame] = None):
        """Analyze the selected datasets with the data agent."""
        if datasets_info is None:
            datasets_info = self.session_state.get("datasets_info")

        if datasets_info is None or datasets_info.empty:
            print("\n⚠️ No datasets available. Please search first.")
            return

        # CRITICAL FIX: Clear all agent-related session state before a new analysis
        print("\n🔄 Clearing previous analysis state...")

        # Clear agent usage flags
        self.session_state["oceanographer_agent_used"] = False
        self.session_state["ecologist_agent_used"] = False
        self.session_state["visualization_agent_used"] = False
        self.session_state["dataframe_agent_used"] = False
        self.session_state["specialized_agent_used"] = False

        # Clear messages and conversation history
        self.session_state["messages_data_agent"] = []
        self.session_state["intermediate_steps"] = []

        # Generate a new thread ID for this analysis
        import uuid
        self.session_state["thread_id"] = str(uuid.uuid4())

        # Clear any cached plot info
        self.session_state["new_plot_generated"] = False
        self.session_state["saved_plot_paths"] = {}

        # Reset processing flags
        self.session_state["processing"] = False

        print(f"✓ Session state cleared for new analysis: '{query}'")

        # Select datasets (dataset_indices are 1-based)
        selected_dois = set()
        for idx in dataset_indices:
            if 0 <= idx - 1 < len(datasets_info):
                doi = datasets_info.iloc[idx - 1]['DOI']
                selected_dois.add(doi)
                print(f"✓ Selected dataset {idx}: {datasets_info.iloc[idx - 1]['Name']}")

        if not selected_dois:
            print("\n⚠️ No valid datasets selected.")
            return

        self.session_state["selected_datasets"] = selected_dois

        # Load datasets
        print("\n📥 Loading selected datasets...")
        load_selected_datasets_into_cache(selected_dois, self.session_state)
        set_active_datasets_from_selection(self.session_state)

        # Get dataset info for the active datasets
        active_datasets_info = get_datasets_info_for_active_datasets(self.session_state)
        if not active_datasets_info:
            print("\n⚠️ Failed to load datasets.")
            return

        # Create and invoke the supervisor agent
        print(f"\n🤖 Analyzing with query: '{query}'")

        # FIX: Add the user's query to the message history before invoking the agent
        self.session_state["messages_data_agent"].append({"role": "user", "content": query})

        logging.info(f"CLI Analysis Query: {query}")
        logging.info(f"Selected DOIs: {', '.join(selected_dois)}")
        logging.info(f"Active datasets count: {len(active_datasets_info)}")
        logging.info(f"Thread ID: {self.session_state.get('thread_id', 'None')}")

        # Verify we have a clean session state
        agent_flags = {
            "oceanographer_agent_used": self.session_state.get("oceanographer_agent_used", False),
            "ecologist_agent_used": self.session_state.get("ecologist_agent_used", False),
            "visualization_agent_used": self.session_state.get("visualization_agent_used", False),
            "dataframe_agent_used": self.session_state.get("dataframe_agent_used", False)
        }
        logging.info(f"Agent flags before analysis: {agent_flags}")

        # Use the standard create_and_invoke_supervisor_agent - no monkey-patching
        from main import create_and_invoke_supervisor_agent
        print("✓ Using supervisor agent with planning tool")

        # Call the standard function without any modifications
        response = create_and_invoke_supervisor_agent(
            user_query=query,
            datasets_info=active_datasets_info,
            memory=self.session_state["memory"],
            session_data=self.session_state,
            st_callback=None
        )

        if response:
            # Extract the final message
            messages = response.get('messages', [])
            if messages:
                last_message = messages[-1]
                logging.info(f"Final message from agent: {last_message.name if hasattr(last_message, 'name') else 'Unknown'}")
                logging.info(f"Message content preview: {last_message.content[:200]}...")

                print("\n📊 Analysis Result:\n")
                print(last_message.content)

                # Save the result
                output_file = self.output_dir / "analysis_result.txt"
                with open(output_file, 'w') as f:
                    f.write(f"Query: {query}\n")
                    f.write(f"Selected DOIs: {', '.join(selected_dois)}\n\n")
                    f.write("Analysis Result:\n")
                    f.write(last_message.content)
                print(f"\n📄 Analysis saved to: {output_file}")

            # Check for generated plots
            plot_images = response.get('plot_images', [])
            if plot_images:
                print(f"\n🎨 Generated {len(plot_images)} plots:")
                for plot_path in plot_images:
                    if os.path.exists(plot_path):
                        print(f"  - {plot_path}")

            # Save the execution history
            history_file = self.output_dir / "execution_history.json"
            try:
                serializable_history = make_json_serializable(self.session_state.get("execution_history", []))
                with open(history_file, 'w') as f:
                    json.dump(serializable_history, f, indent=2, default=str)
                print(f"\n📄 Execution history saved to: {history_file}")
            except Exception as e:
                logging.error(f"Could not save execution history: {e}")
        else:
            print("\n⚠️ Analysis failed.")
    def _print_help(self):
        """Print the interactive-mode banner and command reference."""
        print("\n🌊 PANGAEA GPT - Interactive CLI Mode")
        print("=" * 50)
        print("Commands:")
        print("  search <query>      - Search for datasets")
        print("  direct <doi1,doi2>  - Direct access to DOIs")
        print("  select <1,2,3>      - Select datasets by number")
        print("  analyze <query>     - Analyze selected datasets")
        print("  list                - List current datasets")
        print("  clear               - Clear current session")
        print("  help                - Show this help")
        print("  exit                - Exit the program")
        print("=" * 50)

    def interactive_mode(self):
        """Run in interactive mode."""
        self._print_help()

        # Store the original model name for session resets
        original_model = self.session_state.get("model_name", "gpt-4.1")

        while True:
            try:
                command = input("\n> ").strip()
                if not command:
                    continue

                parts = command.split(maxsplit=1)
                cmd = parts[0].lower()
                args = parts[1] if len(parts) > 1 else ""

                if cmd == "exit":
                    print("\n👋 Goodbye!")
                    break

                elif cmd == "help":
                    self._print_help()

                elif cmd == "search":
                    if not args:
                        print("⚠️ Please provide a search query.")
                        continue
                    self.search_datasets(args)

                elif cmd == "direct":
                    if not args:
                        print("⚠️ Please provide DOIs separated by commas.")
                        continue
                    dois = [doi.strip() for doi in args.split(',')]
                    self.direct_access_datasets(dois)

                elif cmd == "select":
                    if not args:
                        print("⚠️ Please provide dataset numbers separated by commas.")
                        continue
                    try:
                        indices = [int(x.strip()) for x in args.split(',')]
                        # Mark the chosen datasets as selected
                        datasets_info = self.session_state.get("datasets_info")
                        if datasets_info is not None and not datasets_info.empty:
                            for idx in indices:
                                if 0 <= idx - 1 < len(datasets_info):
                                    doi = datasets_info.iloc[idx - 1]['DOI']
                                    self.session_state["selected_datasets"].add(doi)
                            print(f"✓ Selected {len(indices)} datasets")
                        else:
                            print("⚠️ No datasets available to select from.")
                    except ValueError:
                        print("⚠️ Invalid dataset numbers.")

                elif cmd == "analyze":
                    if not args:
                        print("⚠️ Please provide an analysis query.")
                        continue
                    selected = list(self.session_state.get("selected_datasets", set()))
                    if not selected:
                        print("⚠️ No datasets selected. Use 'select' command first.")
                        continue
                    # Map the selected DOIs back to 1-based dataset indices
                    datasets_info = self.session_state.get("datasets_info")
                    if datasets_info is not None:
                        indices = []
                        for idx, row in datasets_info.iterrows():
                            if row['DOI'] in selected:
                                indices.append(idx + 1)
                        self.analyze_datasets(indices, args, datasets_info)

                elif cmd == "list":
                    datasets_info = self.session_state.get("datasets_info")
                    selected = self.session_state.get("selected_datasets", set())
                    if datasets_info is not None and not datasets_info.empty:
                        print("\n📊 Current Datasets:")
                        for idx, row in datasets_info.iterrows():
                            marker = "✓" if row['DOI'] in selected else " "
                            print(f"\n[{marker}] {idx + 1}. {row['Name']}")
                            print(f"    DOI: {row['DOI']}")
                    else:
                        print("\n⚠️ No datasets loaded.")

                elif cmd == "clear":
                    # Reset the session state while preserving the model choice
                    import streamlit as st
                    import uuid
                    preserved_model = st.session_state.get("model_name", original_model)

                    # Reinitialize with default values
                    default_state = {
                        "messages_search": [],
                        "messages_data_agent": [],
                        "datasets_cache": {},
                        "datasets_info": None,
                        "active_datasets": [],
                        "selected_datasets": set(),
                        "show_dataset": True,
                        "current_page": "search",
                        "dataset_dfs": {},
                        "dataset_names": {},
                        "saved_plot_paths": {},
                        "memory": CustomMemorySaver(),
                        "oceanographer_agent_used": False,
                        "ecologist_agent_used": False,
                        "visualization_agent_used": False,
                        "dataframe_agent_used": False,
                        "specialized_agent_used": False,
                        "search_method": "PANGAEA Search (default)",
                        "selected_text": "",
                        "new_plot_generated": False,
                        "execution_history": [],
                        "model_name": preserved_model,
                        "search_mode": "simple",
                        "processing": False,
                        "intermediate_steps": [],
                        "thread_id": str(uuid.uuid4()),  # Generate a new thread ID
                        "viz_datasets_text": "",
                        "search_results_cache": {}
                    }
                    # Clear and repopulate the mock session state
                    st.session_state.clear()
                    st.session_state.update(default_state)
                    self.session_state = st.session_state
                    print("✓ Session cleared with fresh thread ID.")

                else:
                    print(f"⚠️ Unknown command: '{cmd}'. Type 'help' for available commands.")

            except KeyboardInterrupt:
                print("\n\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"\n❌ Error: {str(e)}")
                logging.error(f"Error in interactive mode: {str(e)}", exc_info=True)

def main():
    """Main entry point for the CLI."""
    parser = argparse.ArgumentParser(
        description="PANGAEA GPT - Command Line Interface",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Search for datasets
  python cli.py --search "temperature arctic ocean" --count 20

  # Direct access to DOIs
  python cli.py --direct-doi "10.1594/PANGAEA.123456,10.1594/PANGAEA.789012"

  # Search and analyze in one command
  python cli.py --search "salinity fram strait" --analyze "Plot salinity distribution" --select 1,2,3

  # Interactive mode
  python cli.py --interactive

  # Use a different model
  python cli.py --model gpt-4o-mini --search "biodiversity data"
"""
    )

    # API keys
    parser.add_argument('--api-key', type=str, help='OpenAI API key (or set OPENAI_API_KEY env var)')
    parser.add_argument('--anthropic-key', type=str, help='Anthropic API key (optional)')
    parser.add_argument('--langchain-key', type=str, help='LangChain API key (optional)')
    parser.add_argument('--langchain-project', type=str, help='LangChain project name (optional)')

    # Model selection
    parser.add_argument('--model', type=str, default='gpt-4.1-mini',
                        choices=['gpt-5.2', 'gpt-5', 'gpt-4.1-mini', 'gpt-4.1', 'gpt-4.1-nano',
                                 'gpt-4o', 'gpt-4o-mini', 'o1-mini', 'o3-mini'],
                        help='Model to use (default: gpt-4.1-mini)')

    # Search options
    parser.add_argument('--search', type=str, help='Search query for datasets')
    parser.add_argument('--search-mode', type=str, default='simple', choices=['simple', 'deep'],
                        help='Search mode: simple (fast) or deep (comprehensive)')
    parser.add_argument('--count', type=int, default=15, help='Number of search results (default: 15)')
    parser.add_argument('--mindate', type=str, help='Minimum date (YYYY-MM-DD)')
    parser.add_argument('--maxdate', type=str, help='Maximum date (YYYY-MM-DD)')
    parser.add_argument('--minlat', type=float, help='Minimum latitude')
    parser.add_argument('--maxlat', type=float, help='Maximum latitude')
    parser.add_argument('--minlon', type=float, help='Minimum longitude')
    parser.add_argument('--maxlon', type=float, help='Maximum longitude')

    # Direct DOI access
    parser.add_argument('--direct-doi', type=str, help='Direct access to DOIs (comma-separated)')

    # Analysis options
    parser.add_argument('--select', type=str, help='Select datasets by number (comma-separated, e.g., 1,2,3)')
    parser.add_argument('--analyze', type=str, help='Analysis query for selected datasets')

    # Interactive mode
    parser.add_argument('--interactive', '-i', action='store_true', help='Run in interactive mode')

    # Output options
    parser.add_argument('--output-dir', type=str, help='Output directory (default: cli_output/<timestamp>)')

    args = parser.parse_args()

    # Get the OpenAI API key
    api_key = args.api_key or os.environ.get('OPENAI_API_KEY')
    if not api_key:
        print("❌ Error: OpenAI API key required. Set via --api-key or the OPENAI_API_KEY environment variable.")
        sys.exit(1)

    # Get the other API keys
    anthropic_key = args.anthropic_key or os.environ.get('ANTHROPIC_API_KEY')
    langchain_key = args.langchain_key or os.environ.get('LANGCHAIN_API_KEY')
    langchain_project = args.langchain_project or os.environ.get('LANGCHAIN_PROJECT_NAME')

    # Create the CLI interface
    cli = CLIInterface(
        api_key=api_key,
        model_name=args.model,
        anthropic_key=anthropic_key,
        langchain_key=langchain_key,
        langchain_project=langchain_project
    )

    # Override the output directory if specified
    if args.output_dir:
        cli.output_dir = Path(args.output_dir)
        cli.output_dir.mkdir(parents=True, exist_ok=True)

    # Run in interactive mode
    if args.interactive:
        cli.interactive_mode()
        return

    # Execute commands
    datasets_info = None

    # Search or direct DOI access
    if args.search:
        datasets_info = cli.search_datasets(
            query=args.search,
            count=args.count,
            search_mode=args.search_mode,
            mindate=args.mindate,
            maxdate=args.maxdate,
            minlat=args.minlat,
            maxlat=args.maxlat,
            minlon=args.minlon,
            maxlon=args.maxlon
        )
    elif args.direct_doi:
        dois = [doi.strip() for doi in args.direct_doi.split(',')]
        datasets_info = cli.direct_access_datasets(dois)

    # Select and analyze
    if args.select and args.analyze:
        if datasets_info is None or datasets_info.empty:
            print("\n⚠️ No datasets available. Please use --search or --direct-doi first.")
            sys.exit(1)
        try:
            indices = [int(x.strip()) for x in args.select.split(',')]
            cli.analyze_datasets(indices, args.analyze, datasets_info)
        except ValueError:
            print("❌ Error: Invalid dataset selection. Use comma-separated numbers (e.g., 1,2,3)")
            sys.exit(1)
    elif args.analyze:
        print("\n⚠️ --analyze requires --select to specify which datasets to analyze.")
        sys.exit(1)

    # If no analysis was requested but datasets were found, just show a summary
    if datasets_info is not None and not datasets_info.empty and not args.analyze:
        print(f"\n✅ Found {len(datasets_info)} datasets. Use --select and --analyze to analyze them.")


if __name__ == "__main__":
    main()