# LLM-VEDHA/app.py at master · Nandhan-Golla/LLM-VEDHA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
"""
Streamlit web interface for SMILES molecule generation.
Interactive application for generating and visualizing molecules.
"""
import streamlit as st
import torch
import os
import tempfile
import base64
from io import BytesIO
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import numpy as np
from typing import List, Dict, Optional

# Import our modules
from inference import MoleculeGenerator
from model import GPTModel
from tokenizer import SMILESTokenizer
from utils import get_model_config, format_number

# Try to import RDKit for molecule visualization.
# RDKIT_AVAILABLE records whether the import succeeded; the drawing helpers
# and the viewer tab in this file check it before using Chem / Draw.
try:
    from rdkit import Chem
    from rdkit.Chem import Draw
    RDKIT_AVAILABLE = True
except ImportError:
    RDKIT_AVAILABLE = False
    # NOTE(review): this warning is emitted at import time, i.e. before main()
    # calls st.set_page_config(). Streamlit releases that require
    # set_page_config to be the first Streamlit command would raise here when
    # RDKit is missing -- confirm against the deployed Streamlit version.
    st.warning("RDKit not available. Molecule visualization will be limited.")


def load_model(checkpoint_path: str, model_size: str) -> Optional[MoleculeGenerator]:
    """
    Build a MoleculeGenerator from a checkpoint on disk.

    Progress, success and failure are all reported through Streamlit
    widgets; no exception escapes this function.

    Args:
        checkpoint_path: Location of the checkpoint file
        model_size: Name of the model size configuration

    Returns:
        The loaded MoleculeGenerator, or None when the file is missing or
        loading raised an exception
    """
    try:
        checkpoint_missing = not os.path.exists(checkpoint_path)
        if checkpoint_missing:
            st.error(f"Checkpoint file not found: {checkpoint_path}")
            return None

        loader_kwargs = {
            'checkpoint_path': checkpoint_path,
            'model_size': model_size,
            'device': 'cpu',  # Use CPU for Streamlit deployment
        }
        with st.spinner("Loading model..."):
            loaded = MoleculeGenerator.from_checkpoint(**loader_kwargs)

        st.success("Model loaded successfully!")
        return loaded

    except Exception as err:
        # Any failure is shown in the UI instead of crashing the app.
        st.error(f"Error loading model: {err}")
        return None


def create_molecule_image(smiles: str, img_size: tuple = (300, 300)) -> Optional[Image.Image]:
    """
    Render a single SMILES string as a 2D structure image.

    Args:
        smiles: SMILES string to draw
        img_size: Output size as (width, height)

    Returns:
        PIL Image, or None when RDKit is unavailable, the SMILES cannot be
        parsed, or drawing raised an exception (which is reported via
        st.error)
    """
    if RDKIT_AVAILABLE:
        try:
            parsed = Chem.MolFromSmiles(smiles)
            # MolFromSmiles signals an unparsable string by returning None.
            return None if parsed is None else Draw.MolToImage(parsed, size=img_size)
        except Exception as err:
            st.error(f"Error creating molecule image: {err}")

    return None


def create_molecules_grid(smiles_list: List[str], max_mols: int = 12) -> Optional[Image.Image]:
    """
    Render several SMILES strings as one grid image.

    Only the first ``max_mols`` entries are considered; entries RDKit cannot
    parse are dropped. Each cell is captioned with its SMILES string.

    Args:
        smiles_list: SMILES strings to draw
        max_mols: Upper bound on how many entries are considered

    Returns:
        PIL Image, or None when RDKit is unavailable, nothing could be
        parsed, or drawing raised an exception (which is reported via
        st.error)
    """
    if not RDKIT_AVAILABLE:
        return None

    try:
        candidates = smiles_list[:max_mols]

        # Pair every parsed molecule with its source string so the legends
        # stay aligned after unparsable entries are filtered out.
        parsed_pairs = [(Chem.MolFromSmiles(text), text) for text in candidates]
        drawable = [(mol, text) for mol, text in parsed_pairs if mol is not None]

        if not drawable:
            return None

        return Draw.MolsToGridImage(
            [mol for mol, _ in drawable],
            molsPerRow=4,
            subImgSize=(200, 200),
            legends=[text for _, text in drawable],
        )

    except Exception as err:
        st.error(f"Error creating molecule grid: {err}")
        return None


def plot_evaluation_metrics(results: Dict) -> None:
    """
    Plot evaluation metrics in two side-by-side Streamlit columns.

    The left column shows validity and uniqueness as percentage bars. The
    right column shows the mean (with std error bars) of every entry in
    ``results['molecular_properties']`` and is skipped when that entry is
    missing or empty.

    Args:
        results: Evaluation results dictionary. Reads 'validity_ratio',
            'diversity_all' -> 'unique_ratio' and, optionally,
            'molecular_properties' -> {name: {'mean': ..., 'std': ...}}.
    """
    col1, col2 = st.columns(2)

    with col1:
        # Validity and uniqueness metrics
        quality_fig, ax = plt.subplots(figsize=(8, 6))

        metrics = ['Validity', 'Uniqueness']
        values = [
            results['validity_ratio'] * 100,
            results['diversity_all']['unique_ratio'] * 100
        ]

        bars = ax.bar(metrics, values, color=['#2E86AB', '#A23B72'])
        ax.set_ylabel('Percentage (%)')
        ax.set_title('Generation Quality Metrics')
        ax.set_ylim(0, 100)

        # Add value labels on bars
        for bar, value in zip(bars, values):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                   f'{value:.1f}%', ha='center', va='bottom')

        quality_fig.tight_layout()
        st.pyplot(quality_fig)
        # Close this exact figure rather than pyplot's "current" one, which
        # is shared global state and may not be ours.
        plt.close(quality_fig)

    with col2:
        # Molecular properties (if available)
        properties = results.get('molecular_properties')
        if properties:
            props_fig, ax = plt.subplots(figsize=(8, 6))

            props = list(properties.keys())
            means = [properties[prop]['mean'] for prop in props]
            stds = [properties[prop]['std'] for prop in props]

            x_pos = np.arange(len(props))
            ax.bar(x_pos, means, yerr=stds, capsize=5, color='#F18F01')
            ax.set_xlabel('Molecular Properties')
            ax.set_ylabel('Value')
            ax.set_title('Molecular Properties Distribution')
            ax.set_xticks(x_pos)
            ax.set_xticklabels([prop.replace('_', ' ').title() for prop in props], rotation=45)

            props_fig.tight_layout()
            st.pyplot(props_fig)
            plt.close(props_fig)


def main():
    """
    Main Streamlit application.

    Configures the page, renders the sidebar (checkpoint selection and model
    loading) and, once a model is loaded, the generation controls and the
    result views. Without a loaded model only usage instructions are shown.
    """
    st.set_page_config(
        page_title="SMILES Molecule Generator",
        page_icon="🧪",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Renders nothing, so it is safe to run before any widget is created.
    _init_session_state()

    st.title("🧪 SMILES Molecule Generator")
    st.markdown("Generate novel molecular structures using a transformer-based language model")

    _render_sidebar()

    if st.session_state.generator is None:
        _render_instructions()
        return

    params = _render_generation_controls()

    if st.button("🚀 Generate Molecules", type="primary"):
        _run_generation(params)

    if st.session_state.generated_molecules:
        _render_results(st.session_state.evaluation_results)


def _init_session_state() -> None:
    """Create the session-state slots used by the app if they are missing."""
    defaults = {
        'generator': None,
        'generated_molecules': [],
        'evaluation_results': None,
        # {'key': upload identity, 'path': temp file} for the last upload
        'uploaded_checkpoint': None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _persist_uploaded_checkpoint(uploaded_file) -> str:
    """
    Write an uploaded checkpoint to a temp file once and return its path.

    Streamlit reruns the whole script on every interaction; without this
    cache each rerun would leave another ``delete=False`` temp file behind.
    """
    upload_key = (
        getattr(uploaded_file, 'file_id', None),
        uploaded_file.name,
        uploaded_file.size,
    )
    cached = st.session_state.get('uploaded_checkpoint')

    if cached is not None:
        if cached['key'] == upload_key and os.path.exists(cached['path']):
            return cached['path']
        # A different file was uploaded: drop the previous copy. Best effort
        # only -- a leftover temp file must not break the app.
        try:
            os.remove(cached['path'])
        except OSError:
            pass

    with tempfile.NamedTemporaryFile(delete=False, suffix='.pt') as tmp_file:
        # getvalue() returns the full content regardless of the buffer's
        # current read position, unlike read().
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name

    st.session_state['uploaded_checkpoint'] = {'key': upload_key, 'path': tmp_path}
    return tmp_path


def _render_sidebar() -> None:
    """Render checkpoint selection, the load button and the model status."""
    st.sidebar.header("Model Configuration")

    # Model checkpoint upload or path
    checkpoint_option = st.sidebar.radio(
        "Checkpoint Source",
        ["Upload File", "File Path"]
    )

    checkpoint_path = None
    if checkpoint_option == "Upload File":
        uploaded_file = st.sidebar.file_uploader(
            "Upload model checkpoint",
            type=['pt', 'pth'],
            help="Upload a trained model checkpoint file"
        )
        if uploaded_file is not None:
            checkpoint_path = _persist_uploaded_checkpoint(uploaded_file)
    else:
        checkpoint_path = st.sidebar.text_input(
            "Checkpoint file path",
            value="checkpoints/best_model.pt",
            help="Path to the model checkpoint file"
        )

    model_size = st.sidebar.selectbox(
        "Model Size",
        ["10M", "25M", "50M"],
        index=0,
        help="Model size configuration"
    )

    # Load model button
    if st.sidebar.button("Load Model", type="primary"):
        if checkpoint_path:
            st.session_state.generator = load_model(checkpoint_path, model_size)
        else:
            st.sidebar.error("Please provide a checkpoint file")

    # Model status
    generator = st.session_state.generator
    if generator is None:
        st.sidebar.warning("⚠️ No model loaded")
        return

    st.sidebar.success("✅ Model loaded and ready")

    with st.sidebar.expander("Model Information"):
        param_count = generator.model.get_num_trainable_params()
        st.write(f"**Parameters:** {format_number(param_count)}")
        st.write(f"**Vocabulary Size:** {generator.tokenizer.get_vocab_size()}")
        st.write(f"**Max Sequence Length:** {generator.tokenizer.max_seq_len}")
        st.write(f"**Device:** {generator.device}")


def _render_generation_controls() -> Dict:
    """
    Render the generation parameter widgets and return their values.

    Returns:
        Dict with keys 'n_samples', 'temperature', 'top_k', 'top_p', 'seed',
        'max_length' and 'prompt'. 'top_k' / 'top_p' are None when the
        corresponding checkbox is unticked.
    """
    st.header("Generation Parameters")

    col1, col2, col3 = st.columns(3)

    with col1:
        n_samples = st.number_input(
            "Number of Molecules",
            min_value=1,
            max_value=100,
            value=10,
            help="Number of molecules to generate"
        )

        temperature = st.slider(
            "Temperature",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Sampling temperature (higher = more random)"
        )

    with col2:
        # Each value widget is only created while its checkbox is ticked, so
        # it never needs a disabled state.
        use_top_k = st.checkbox("Use Top-K Sampling")
        top_k = st.number_input(
            "Top-K",
            min_value=1,
            max_value=100,
            value=50,
            help="Keep only top K tokens for sampling"
        ) if use_top_k else None

        use_top_p = st.checkbox("Use Nucleus Sampling")
        top_p = st.slider(
            "Top-P",
            min_value=0.1,
            max_value=1.0,
            value=0.9,
            step=0.05,
            help="Nucleus sampling threshold"
        ) if use_top_p else None

    with col3:
        seed = st.number_input(
            "Random Seed",
            min_value=0,
            max_value=999999,
            value=42,
            help="Seed for reproducible generation"
        )

        max_length = st.number_input(
            "Max Length",
            min_value=10,
            max_value=256,
            value=128,
            help="Maximum sequence length"
        )

    # SMILES prompt (optional)
    st.subheader("Optional: SMILES Prompt")
    prompt = st.text_input(
        "SMILES Prompt",
        value="",
        placeholder="e.g., CC (to generate molecules starting with CC)",
        help="Optional SMILES string to start generation from"
    )

    return {
        'n_samples': n_samples,
        'temperature': temperature,
        'top_k': top_k,
        'top_p': top_p,
        'seed': seed,
        'max_length': max_length,
        'prompt': prompt,
    }


def _run_generation(params: Dict) -> None:
    """
    Generate molecules with the loaded model and store them with their evaluation.

    Args:
        params: Values returned by _render_generation_controls().
    """
    generator = st.session_state.generator
    sampling = {
        'temperature': params['temperature'],
        'top_k': params['top_k'],
        'top_p': params['top_p'],
        'max_length': params['max_length'],
    }

    with st.spinner("Generating molecules..."):
        try:
            prompt = params['prompt'].strip()
            if prompt:
                # generate() is not given a seed, so seed torch's global RNG
                # to honour the "Random Seed" widget on this path as well.
                # NOTE(review): assumes generate() samples from torch's
                # global RNG -- confirm in inference.py.
                torch.manual_seed(int(params['seed']))
                molecules = [
                    generator.generate(prompt=prompt, **sampling)
                    for _ in range(params['n_samples'])
                ]
            else:
                # Generate without prompt
                molecules = generator.sample(
                    n_samples=params['n_samples'],
                    seed=params['seed'],
                    **sampling
                )

            st.session_state.generated_molecules = molecules
            # Clear the old evaluation first so a failing evaluate() cannot
            # leave stale metrics next to the new molecules.
            st.session_state.evaluation_results = None
            st.session_state.evaluation_results = generator.evaluate(molecules)

            st.success(f"Generated {len(molecules)} molecules!")

        except Exception as e:
            st.error(f"Error during generation: {str(e)}")


def _validity_flags(generator, molecules: List[str]) -> list:
    """
    Return one validity flag per molecule, in order.

    Each molecule is validated on its own, using the first element of the
    first item returned by ``generator.validate_smiles``.
    """
    flags = []
    for smiles in molecules:
        validity, _ = generator.validate_smiles([smiles])
        flags.append(validity[0] if validity else False)
    return flags


def _render_results(results: Optional[Dict]) -> None:
    """Render the summary metrics and the four result tabs."""
    molecules = st.session_state.generated_molecules

    st.header("Generated Molecules")

    if results:
        _render_summary_metrics(results)

    # Validate once per rerun and share the flags between all tabs.
    flags = _validity_flags(st.session_state.generator, molecules)
    valid_smiles = [smiles for smiles, ok in zip(molecules, flags) if ok]

    # Tabs for different views
    tab1, tab2, tab3, tab4 = st.tabs(["📋 SMILES List", "🧪 Molecule Viewer", "📊 Analytics", "💾 Download"])

    with tab1:
        _render_smiles_table(molecules, flags)

    with tab2:
        _render_molecule_viewer(valid_smiles)

    with tab3:
        if results:
            _render_analytics(results)

    with tab4:
        _render_downloads(molecules, flags, results)


def _render_summary_metrics(results: Dict) -> None:
    """Render the four headline metrics above the tabs."""
    col1, col2, col3, col4 = st.columns(4)
    diversity = results['diversity_all']

    with col1:
        st.metric("Total Generated", results['total_count'])

    with col2:
        st.metric(
            "Valid Molecules",
            results['valid_count'],
            f"{results['validity_ratio']:.1%}"
        )

    with col3:
        st.metric(
            "Unique Molecules",
            diversity['unique_count'],
            f"{diversity['unique_ratio']:.1%}"
        )

    with col4:
        st.metric(
            "Avg. Diversity",
            f"{diversity['avg_levenshtein']:.2f}",
            "Levenshtein distance"
        )


def _render_smiles_table(molecules: List[str], flags: list) -> None:
    """Render every generated SMILES with its validity mark and length."""
    rows = [
        {
            'Index': index,
            'SMILES': smiles,
            'Valid': '✅' if is_valid else '❌',
            'Length': len(smiles)
        }
        for index, (smiles, is_valid) in enumerate(zip(molecules, flags), 1)
    ]
    st.dataframe(pd.DataFrame(rows), use_container_width=True)


def _render_molecule_viewer(valid_smiles: List[str]) -> None:
    """Render the structure grid and the single-molecule inspector."""
    if not RDKIT_AVAILABLE:
        st.warning("RDKit not available. Cannot display molecule structures.")
        return

    st.subheader("Molecule Structures")

    if not valid_smiles:
        st.warning("No valid molecules to display")
        return

    # Create molecule grid
    grid_img = create_molecules_grid(valid_smiles[:12])  # Show first 12
    if grid_img:
        st.image(grid_img, caption="Generated Molecules", use_column_width=True)

    # Individual molecule viewer
    st.subheader("Individual Molecule Viewer")
    selected_smiles = st.selectbox(
        "Select a molecule to view:",
        valid_smiles,
        # Every option comes from valid_smiles, so the label is constant.
        format_func=lambda x: f"{x} (Valid)"
    )

    if not selected_smiles:
        return

    col1, col2 = st.columns([1, 1])

    with col1:
        mol_img = create_molecule_image(selected_smiles, (400, 400))
        if mol_img:
            st.image(mol_img, caption=selected_smiles)

    with col2:
        # Molecule properties
        try:
            mol = Chem.MolFromSmiles(selected_smiles)
            if mol:
                from rdkit.Chem import Descriptors

                st.write("**Molecular Properties:**")
                st.write(f"- Molecular Weight: {Descriptors.MolWt(mol):.2f}")
                st.write(f"- LogP: {Descriptors.MolLogP(mol):.2f}")
                st.write(f"- Number of Atoms: {mol.GetNumAtoms()}")
                st.write(f"- Number of Bonds: {mol.GetNumBonds()}")
                st.write(f"- Number of Rings: {Descriptors.RingCount(mol)}")
                st.write(f"- TPSA: {Descriptors.TPSA(mol):.2f}")
        except Exception as e:
            st.error(f"Error calculating properties: {str(e)}")


def _render_analytics(results: Dict) -> None:
    """Render the metric plots, detailed statistics and sample molecules."""
    st.subheader("Generation Analytics")

    # Plot metrics
    plot_evaluation_metrics(results)

    # Detailed statistics
    st.subheader("Detailed Statistics")

    col1, col2 = st.columns(2)
    diversity = results['diversity_all']

    with col1:
        st.write("**Validity Statistics:**")
        st.write(f"- Total molecules: {results['total_count']}")
        st.write(f"- Valid molecules: {results['valid_count']}")
        st.write(f"- Invalid molecules: {results['total_count'] - results['valid_count']}")
        st.write(f"- Validity rate: {results['validity_ratio']:.1%}")

    with col2:
        st.write("**Diversity Statistics:**")
        st.write(f"- Unique molecules: {diversity['unique_count']}")
        st.write(f"- Uniqueness rate: {diversity['unique_ratio']:.1%}")
        st.write(f"- Avg. Levenshtein distance: {diversity['avg_levenshtein']:.2f}")

    # Sample molecules
    if results.get('sample_valid_smiles'):
        st.subheader("Sample Valid Molecules")
        for i, smiles in enumerate(results['sample_valid_smiles'][:5], 1):
            st.code(f"{i}. {smiles}")

    if results.get('sample_invalid_smiles'):
        st.subheader("Sample Invalid Molecules")
        for i, smiles in enumerate(results['sample_invalid_smiles'][:3], 1):
            st.code(f"{i}. {smiles}")


def _render_downloads(molecules: List[str], flags: list, results: Optional[Dict]) -> None:
    """Render the CSV download and, when results exist, the JSON download."""
    import json

    st.subheader("Download Results")

    # Prepare data for download
    molecules_df = pd.DataFrame({
        'SMILES': molecules,
        'Index': range(1, len(molecules) + 1)
    })
    molecules_df['Valid'] = flags

    # Download buttons
    col1, col2 = st.columns(2)

    with col1:
        st.download_button(
            label="📄 Download as CSV",
            data=molecules_df.to_csv(index=False),
            file_name="generated_molecules.csv",
            mime="text/csv"
        )

    with col2:
        if results:
            json_data = st.session_state.generator._convert_for_json(results)
            st.download_button(
                label="📊 Download Analytics",
                data=json.dumps(json_data, indent=2),
                file_name="evaluation_results.json",
                mime="application/json"
            )


def _render_instructions() -> None:
    """Render the usage instructions shown while no model is loaded."""
    st.info("👈 Please load a model from the sidebar to start generating molecules")

    st.markdown("""
        ## How to use this application:

        1. **Load a Model**: Upload a checkpoint file or specify the path to a trained model
        2. **Set Parameters**: Adjust generation parameters like temperature, sampling methods, etc.
        3. **Generate**: Click the generate button to create new molecules
        4. **Explore**: View the generated molecules, their structures, and analytics
        5. **Download**: Export your results for further analysis

        ## Generation Parameters:

        - **Temperature**: Controls randomness (0.1 = deterministic, 2.0 = very random)
        - **Top-K**: Limits sampling to top K most likely tokens
        - **Top-P**: Nucleus sampling - keeps tokens with cumulative probability ≤ P
        - **SMILES Prompt**: Optional starting sequence for conditional generation

        ## Features:

        - 🧪 **Molecule Visualization**: View 2D structures of generated molecules
        - 📊 **Analytics**: Comprehensive evaluation metrics and statistics
        - ✅ **Validation**: Automatic validation using RDKit
        - 💾 **Export**: Download results in CSV and JSON formats
        """)


# Script entry point: build the UI only when this file is executed as the
# main module, not when it is imported by another module.
if __name__ == "__main__":
    main()