diff --git a/.gitignore b/.gitignore index aa25096..17d80df 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ wandb exps* .vscode build -lora_diffusion.egg-info \ No newline at end of file +lora_diffusion.egg-info +training_batch_preview \ No newline at end of file diff --git a/lora_diffusion/cli_lora_add.py b/lora_diffusion/cli_lora_add.py index fc7f7e4..df9303f 100644 --- a/lora_diffusion/cli_lora_add.py +++ b/lora_diffusion/cli_lora_add.py @@ -1,4 +1,9 @@ -from typing import Literal, Union, Dict +import sys +if sys.version_info >= (3,8): + from typing import Literal +else : + from typing_extensions import Literal +from typing import Union, Dict import os import shutil import fire @@ -6,14 +11,28 @@ from safetensors.torch import safe_open, save_file import torch -from .lora import ( - tune_lora_scale, - patch_pipe, - collapse_lora, - monkeypatch_remove_lora, -) -from .lora_manager import lora_join -from .to_ckpt_v2 import convert_to_ckpt + +try: + from .lora import ( + tune_lora_scale, + patch_pipe, + collapse_lora, + monkeypatch_remove_lora, + ) + + from .lora_manager import lora_join + from .to_ckpt_v2 import convert_to_ckpt + +except: # allows running the repo without installing it (can mess up existing dependencies) + from lora_diffusion import ( + tune_lora_scale, + patch_pipe, + collapse_lora, + monkeypatch_remove_lora, + ) + + from lora_diffusion.lora_manager import lora_join + from lora_diffusion.to_ckpt_v2 import convert_to_ckpt def _text_lora_path(path: str) -> str: @@ -185,3 +204,7 @@ def add( def main(): fire.Fire(add) + + +if __name__ == "__main__": + main() diff --git a/lora_diffusion/cli_lora_pti.py b/lora_diffusion/cli_lora_pti.py index 7de4bae..6538b4f 100644 --- a/lora_diffusion/cli_lora_pti.py +++ b/lora_diffusion/cli_lora_pti.py @@ -1,15 +1,16 @@ # Bootstrapped from: # https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py -import argparse -import hashlib -import inspect import itertools import math import os + +import json +import time import random import re from pathlib import Path +import numpy as np from typing import Optional, List, Literal import torch @@ -32,6 +33,14 @@ import wandb import fire +import sys +if sys.version_info >= (3,8): + from typing import Literal +else : + from typing_extensions import Literal + +from typing import Optional, List + from lora_diffusion import ( PivotalTuningDatasetCapation, extract_lora_ups_down, @@ -46,6 +55,33 @@ ) +def preview_training_batch(train_dataloader, mode, n_imgs=40): + outdir = f"training_batch_preview/{mode}" + os.makedirs(outdir, exist_ok=True) + imgs_saved = 0 + + while True: + for batch_i, batch in enumerate(train_dataloader): + imgs = batch["pixel_values"] + for i, img_torch in enumerate(imgs): + img_torch = (img_torch + 1) / 2 + # convert to pil and save to disk: + img = Image.fromarray( + (255.0 * img_torch) + .permute(1, 2, 0) + .detach() + .cpu() + .numpy() + .astype(np.uint8) + ).convert("RGB") + img.save(f"{outdir}/preview_{imgs_saved}.jpg") + imgs_saved += 1 + + if imgs_saved > n_imgs: + print(f"\nSaved {imgs_saved} preview training imgs to {outdir}") + return + + def get_models( pretrained_model_name_or_path, pretrained_vae_name_or_path, @@ -108,6 +144,12 @@ def get_models( initializer_token_id = token_ids[0] token_embeds[placeholder_token_id] = token_embeds[initializer_token_id] + # print some stats about the token embedding: + t = token_embeds[placeholder_token_id] + print( + f"init_token {init_tok} --> mean: {t.mean().item():.3f}, std: 
{t.std().item():.3f}, norm: {t.norm():.4f}" + ) + vae = AutoencoderKL.from_pretrained( pretrained_vae_name_or_path or pretrained_model_name_or_path, subfolder=None if pretrained_vae_name_or_path else "vae", @@ -188,6 +230,7 @@ def collate_fn(examples): train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=train_batch_size, + num_workers=4, shuffle=True, collate_fn=collate_fn, ) @@ -249,6 +292,7 @@ def collate_fn(examples): train_dataloader = torch.utils.data.DataLoader( train_dataset, + num_workers=4, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn, @@ -263,6 +307,7 @@ def loss_step( vae, text_encoder, scheduler, + optimized_embeddings=None, train_inpainting=False, t_mutliplier=1.0, mixed_precision=False, @@ -286,11 +331,11 @@ def loss_step( scale_factor=1 / 8, ) else: - latents = batch["pixel_values"] + latents = batch["pixel_values"].to(dtype=weight_dtype).to(unet.device) if train_inpainting: masked_image_latents = batch["masked_image_latents"] - mask = batch["mask_values"] + mask = batch["mask_values"].to(dtype=weight_dtype).to(unet.device) noise = torch.randn_like(latents) bsz = latents.shape[0] @@ -367,6 +412,12 @@ def loss_step( .mean() ) + if optimized_embeddings is not None: + embedding_norm = optimized_embeddings.norm(dim=1).mean() + target_norm = 0.39 + embedding_norm_loss = (embedding_norm - target_norm) ** 2 + loss += 0.005 * embedding_norm_loss + return loss @@ -396,6 +447,7 @@ def train_inversion( clip_ti_decay: bool = True, ): + print("Performing Inversion....") progress_bar = tqdm(range(num_steps)) progress_bar.set_description("Steps") global_step = 0 @@ -408,6 +460,7 @@ def train_inversion( index_updates = ~index_no_updates loss_sum = 0.0 + losses = [] for epoch in range(math.ceil(num_steps / len(dataloader))): unet.eval() @@ -424,6 +477,7 @@ def train_inversion( vae, text_encoder, scheduler, + optimized_embeddings=None, train_inpainting=train_inpainting, mixed_precision=mixed_precision, cached_latents=cached_latents, @@ -431,6 +485,7 @@ def train_inversion( / accum_iter ) + losses.append(loss.detach().mean().item()) loss.backward() loss_sum += loss.detach().item() @@ -466,19 +521,22 @@ def train_inversion( ) * ( pre_norm + lambda_ * (0.4 - pre_norm) ) - print(pre_norm) + # print(pre_norm) - current_norm = ( - text_encoder.get_input_embeddings() - .weight[index_updates, :] - .norm(dim=-1) - ) + optimizing_embeds = text_encoder.get_input_embeddings().weight[ + index_updates, : + ] + current_norm = optimizing_embeds.norm(dim=-1) + # reset original embeddings (we're only optimizing the new token ones) text_encoder.get_input_embeddings().weight[ index_no_updates ] = orig_embeds_params[index_no_updates] - print(f"Current Norm : {current_norm}") + for i, t in enumerate(optimizing_embeds): + print( + f"token {i} --> mean: {t.mean().item():.3f}, std: {t.std().item():.3f}, norm: {t.norm():.4f}" + ) global_step += 1 progress_bar.update(1) @@ -490,6 +548,7 @@ def train_inversion( progress_bar.set_postfix(**logs) if global_step % save_steps == 0: + plot_loss_curve(losses, "textual_inversion") save_all( unet=unet, text_encoder=text_encoder, @@ -542,6 +601,20 @@ def train_inversion( return +import matplotlib.pyplot as plt + + +def plot_loss_curve(losses, name, moving_avg=20): + losses = np.array(losses) + losses = np.convolve(losses, np.ones(moving_avg) / moving_avg, mode="valid") + plt.plot(losses) + plt.xlabel("Step") + plt.ylabel("Loss") + plt.title(f"Losses during {name} phase:") + plt.savefig(f"{name}.png") + plt.clf() + + def perform_tuning( 
unet, vae, @@ -562,12 +635,13 @@ def perform_tuning( tokenizer, test_image_path: str, cached_latents: bool, + index_no_updates=None, log_wandb: bool = False, wandb_log_prompt_cnt: int = 10, class_token: str = "person", train_inpainting: bool = False, ): - + print("Performing Tuning....") progress_bar = tqdm(range(num_steps)) progress_bar.set_description("Steps") global_step = 0 @@ -577,12 +651,22 @@ def perform_tuning( unet.train() text_encoder.train() + # Save the current token embeddings: + orig_embeds_params = text_encoder.get_input_embeddings().weight.data.clone() + if log_wandb: preped_clip = prepare_clip_model_sets() + print(f"Performing {math.ceil(num_steps / len(dataloader))} epochs of training!") loss_sum = 0.0 + losses = [] for epoch in range(math.ceil(num_steps / len(dataloader))): + if not cached_latents: + dataloader.dataset.tune_h_flip_prob( + epoch / math.ceil(num_steps / len(dataloader)) + ) + for batch in dataloader: lr_scheduler_lora.step() @@ -594,6 +678,7 @@ def perform_tuning( vae, text_encoder, scheduler, + optimized_embeddings=text_encoder.get_input_embeddings().weight[:, :], train_inpainting=train_inpainting, t_mutliplier=0.8, mixed_precision=True, @@ -613,10 +698,21 @@ def perform_tuning( "lr": lr_scheduler_lora.get_last_lr()[0], } progress_bar.set_postfix(**logs) + losses.append(loss.detach().item()) + + if index_no_updates is not None: + with torch.no_grad(): + # reset original embeddings (we're only optimizing the new tokens) + text_encoder.get_input_embeddings().weight[ + index_no_updates + ] = orig_embeds_params[index_no_updates] global_step += 1 if global_step % save_steps == 0: + # plot the loss curve: + plot_loss_curve(losses, "tuning") + save_all( unet, text_encoder, @@ -701,7 +797,7 @@ def train( pretrained_vae_name_or_path: str = None, revision: Optional[str] = None, perform_inversion: bool = True, - use_template: Literal[None, "object", "style"] = None, + use_template: Literal[None, "object", "style", "person"] = None, train_inpainting: bool = False, placeholder_tokens: str = "", placeholder_token_at_data: Optional[str] = None, @@ -750,8 +846,12 @@ def train( enable_xformers_memory_efficient_attention: bool = False, out_name: str = "final_lora", ): + script_start_time = time.time() torch.manual_seed(seed) + # Get a dict with all the arguments: + args_dict = locals() + if log_wandb: wandb.init( project=wandb_project_name, @@ -771,7 +871,6 @@ def train( print("PTI : Placeholder Tokens not given, using null token") else: placeholder_tokens = placeholder_tokens.split("|") - assert ( sorted(placeholder_tokens) == placeholder_tokens ), f"Placeholder tokens should be sorted. 
Use something like {'|'.join(sorted(placeholder_tokens))}'" @@ -886,8 +985,13 @@ def train( if cached_latents: vae = None + # STEP 1 : Perform Inversion if perform_inversion: + if not cached_latents: + preview_training_batch(train_dataloader, "inversion") + + print("PTI : Performing Inversion") ti_optimizer = optim.AdamW( text_encoder.get_input_embeddings().parameters(), lr=ti_lr, @@ -896,6 +1000,14 @@ def train( weight_decay=weight_decay_ti, ) + token_ids_positions_to_update = np.where(index_no_updates.cpu().numpy() == 0) + print( + "Training embedding of size", + text_encoder.get_input_embeddings() + .weight[token_ids_positions_to_update] + .shape, + ) + lr_scheduler = get_scheduler( lr_scheduler, optimizer=ti_optimizer, @@ -930,6 +1042,7 @@ def train( ) del ti_optimizer + print("############### Inversion Done ###############") # Next perform Tuning with LoRA: if not use_extended_lora: @@ -940,17 +1053,16 @@ def train( dropout_p=lora_dropout_p, scale=lora_scale, ) + print("PTI : not use_extended_lora...") else: print("PTI : USING EXTENDED UNET!!!") lora_unet_target_modules = ( lora_unet_target_modules | UNET_EXTENDED_TARGET_REPLACE ) print("PTI : Will replace modules: ", lora_unet_target_modules) - unet_lora_params, _ = inject_trainable_lora_extended( unet, r=lora_rank, target_replace_module=lora_unet_target_modules ) - print(f"PTI : has {len(unet_lora_params)} lora") print("PTI : Before training:") inspect_lora(unet) @@ -980,6 +1092,7 @@ def train( param.requires_grad = False else: text_encoder.requires_grad_(False) + if train_text_encoder: text_encoder_lora_params, _ = inject_trainable_lora( text_encoder, @@ -995,9 +1108,18 @@ def train( inspect_lora(text_encoder) lora_optimizers = optim.AdamW(params_to_optimize, weight_decay=weight_decay_lora) + with torch.no_grad(): + n_optimizable_unet_params = sum( + p.numel() for p in unet.parameters() if p.requires_grad + ) + n_optimizable_unet_params += sum(p.numel() for p in text_encoder.parameters() if p.requires_grad) + + print("PTI : n_optimizable_unet_params: ", n_optimizable_unet_params) + print(f"PTI : has {len(unet_lora_params)} lora") unet.train() if train_text_encoder: + print("Training text encoder!") text_encoder.train() train_dataset.blur_amount = 70 @@ -1008,6 +1130,8 @@ def train( num_warmup_steps=lr_warmup_steps_lora, num_training_steps=max_train_steps_tuning, ) + if not cached_latents: + preview_training_batch(train_dataloader, "tuning") perform_tuning( unet, @@ -1015,6 +1139,7 @@ def train( text_encoder, train_dataloader, max_train_steps_tuning, + index_no_updates=index_no_updates, cached_latents=cached_latents, scheduler=noise_scheduler, optimizer=lora_optimizers, @@ -1035,6 +1160,19 @@ def train( train_inpainting=train_inpainting, ) + print("############### Tuning Done ###############") + training_time = time.time() - script_start_time + print(f"Training time: {training_time/60:.1f} minutes") + args_dict["training_time_s"] = int(training_time) + + # Save the args_dict to the output directory as a json file: + with open(os.path.join(output_dir, "lora_training_args.json"), "w") as f: + json.dump(args_dict, f, default=lambda o: "", indent=2) + def main(): fire.Fire(train) + + +if __name__ == "__main__": + main() diff --git a/lora_diffusion/cli_pt_to_safetensors.py b/lora_diffusion/cli_pt_to_safetensors.py index 9a4be40..aefac92 100644 --- a/lora_diffusion/cli_pt_to_safetensors.py +++ b/lora_diffusion/cli_pt_to_safetensors.py @@ -62,9 +62,11 @@ def convert(*paths, outpath, overwrite=False, **settings): } prefix = f"{name}." - - arg_settings = { k[len(prefix) :]: v for k, v in settings.items() if k.startswith(prefix) } - model_settings = { **model_settings, **arg_settings } + + arg_settings = { + k[len(prefix) :]: v for k, v in settings.items() if k.startswith(prefix) + } + model_settings = {**model_settings, **arg_settings} print(f"Loading Lora for {name} from {path} with settings {model_settings}")
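# A minimal, self-contained sketch of the trainable-parameter bookkeeping done in cli_lora_pti.train()
# above: sum the elements of every requires_grad parameter over both the LoRA-injected unet and the
# text_encoder. The helper name count_trainable_params is illustrative only and is not part of this patch.
import torch

def count_trainable_params(*modules: torch.nn.Module) -> int:
    # Count every parameter element that will receive gradients during tuning.
    return sum(p.numel() for m in modules for p in m.parameters() if p.requires_grad)

# e.g. after injecting LoRA:
#   n_optimizable = count_trainable_params(unet, text_encoder)
#   print("PTI : n_optimizable params:", n_optimizable)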
diff --git a/lora_diffusion/dataset.py b/lora_diffusion/dataset.py index f1c28fd..01a22fb 100644 --- a/lora_diffusion/dataset.py +++ b/lora_diffusion/dataset.py @@ -1,7 +1,7 @@ import random from pathlib import Path from typing import Dict, List, Optional, Tuple, Union - +import numpy as np from PIL import Image from torch import zeros_like from torch.utils.data import Dataset @@ -39,7 +39,38 @@ "a photo of a small {}", ] -STYLE_TEMPLATE = [ +PERSON_TEMPLATE = [ + "{}", + "{}", + "a picture of {}", + "a closeup of {}", + "a closeup photo of {}", + "a close-up picture of {}", + "a photo of {}", + "a photo of {}", + "the photo of {}", + "a cropped photo of {}", + "a funny photo of {}", + "a selfie of {}", + "a photo of the handsome {}", + "a photo of the beautiful {}", + "a selfie taken by the handsome {}", + "a selfie taken by {}", + "{} taking a selfie", + "{} is having fun, 4k photograph", + "{} wearing a plaid shirt standing next to another person", + "smiling {} in a hoodie and sweater", + "a photo of the cool {}", + "a close-up photo of {}", + "a bright photo of {}", + "a cropped photo of {}", + "a brilliant HD photo of {}", + "a beautiful picture of {}", + "a photo showing {}", + "a great photo of {}", +] + +STYLE_TEMPLATE_ORIG = [ "a painting in the style of {}", "a rendering in the style of {}", "a cropped painting in the style of {}", @@ -61,10 +92,28 @@ "a large painting in the style of {}", ] +STYLE_TEMPLATE = [ + "a painting in the style of {}", + "a rendering in the style of {}", + "an artwork in the style of {}", + "a magnificent painting in the style of {}", + "a picture in the style of {}", + "a photograph, {} style", + "{} style painting", + "a {}-styled artwork", + "a nice painting in the style of {}", + "a gorgeous example of {} style", + "image in the style of {}", + "{}, painting", + "{} artwork", +] + + NULL_TEMPLATE = ["{}"] TEMPLATE_MAP = { "object": OBJECT_TEMPLATE, + "person": PERSON_TEMPLATE, "style": STYLE_TEMPLATE, "null": NULL_TEMPLATE, } @@ -116,6 +165,35 @@ def _generate_random_mask(image): return mask, masked_image +def expand_rectangle(mask, f): + rows, cols = np.where(mask == 255) + top_row, bottom_row = np.min(rows), np.max(rows) + left_col, right_col = np.min(cols), np.max(cols) + + rect_height, rect_width = bottom_row - top_row + 1, right_col - left_col + 1 + new_height, new_width = np.round(rect_height * f), np.round(rect_width * f) + + center_row, center_col = top_row + rect_height // 2, left_col + rect_width // 2 + top_row, bottom_row = np.round(center_row - new_height / 2), np.round( + center_row + new_height / 2 + ) + left_col, right_col = np.round(center_col - new_width / 2), np.round( + center_col + new_width / 2 + ) + + top_row, bottom_row = int(np.clip(top_row, 0, mask.shape[0] - 1)), int( + np.clip(bottom_row, 0, mask.shape[0] - 1) + ) + left_col, right_col = int(np.clip(left_col, 0, mask.shape[1] - 1)), int( + np.clip(right_col, 0, mask.shape[1] - 1) + ) + + expanded_mask = np.ones_like(mask) + expanded_mask[top_row : bottom_row + 1, left_col : right_col + 1] = 255 + + return expanded_mask + + class PivotalTuningDatasetCapation(Dataset): """ A dataset to prepare
the instance and class images with the prompts for fine-tuning the model. @@ -141,6 +219,8 @@ def __init__( self.tokenizer = tokenizer self.resize = resize self.train_inpainting = train_inpainting + self.h_flip_prob = 0.5 + self.final_flip_prob = 0.33 if use_template == "person" else 0.5 instance_data_root = Path(instance_data_root) if not instance_data_root.exists(): @@ -156,6 +236,10 @@ def __init__( # Prepare the instance images if use_mask_captioned_data: src_imgs = glob.glob(str(instance_data_root) + "/*src.jpg") + src_imgs = sorted( + src_imgs, key=lambda x: int(str(Path(x).stem).split(".")[0]) + ) + for f in src_imgs: idx = int(str(Path(f).stem).split(".")[0]) mask_path = f"{instance_data_root}/{idx}.mask.png" @@ -218,6 +302,18 @@ def __init__( ] ) for idx, mask in enumerate(masks): + avg_pixel_value = np.array(mask.getdata()).mean() + if avg_pixel_value == 1.0: + print(f"No mask detected for {idx}..") + else: + if 1: + # convert to numpy array: + mask = np.array(mask) + # Make the rectangular mask region bigger: + mask = expand_rectangle(mask, 1.25) + # convert back to PIL image: + mask = Image.fromarray(mask).convert("L") + mask.save(f"{instance_data_root}/{idx}.mask.png") break @@ -237,12 +333,13 @@ def __init__( self.h_flip = h_flip self.image_transforms = transforms.Compose( [ + transforms.RandomAffine(degrees=0, translate=(0, 0), scale=(1.0, 1.2)), transforms.Resize( size, interpolation=transforms.InterpolationMode.BILINEAR ) if resize else transforms.Lambda(lambda x: x), - transforms.ColorJitter(0.1, 0.1) + transforms.ColorJitter(0.1, 0.1, 0.02, 0.02) if color_jitter else transforms.Lambda(lambda x: x), transforms.CenterCrop(size), @@ -253,6 +350,15 @@ def __init__( self.blur_amount = blur_amount + print("Captions:") + print(self.captions) + + def tune_h_flip_prob(self, training_progress): + if self.h_flip: + # Tune the h_flip probability to be 0.5 training_progress is 0 and end_prob when training_progress is 1 + self.h_flip_prob = 0.5 + (self.final_flip_prob - 0.5) * training_progress + print(f"h_flip_prob: {self.h_flip_prob:.3f}") + def __len__(self): return self._length @@ -283,18 +389,14 @@ def __getitem__(self, index): for token, value in self.token_map.items(): text = text.replace(token, value) - print(text) + if random.random() < 0.1: + print(text) if self.use_mask: - example["mask"] = ( - self.image_transforms( - Image.open(self.mask_path[index % self.num_instance_images]) - ) - * 0.5 - + 1.0 - ) + img_mask = Image.open(self.mask_path[index % self.num_instance_images]) + example["mask"] = self.image_transforms(img_mask) * 0.5 + 1.0 - if self.h_flip and random.random() > 0.5: + if self.h_flip and random.random() < self.h_flip_prob: hflip = transforms.RandomHorizontalFlip(p=1) example["instance_images"] = hflip(example["instance_images"]) diff --git a/lora_diffusion/lora.py b/lora_diffusion/lora.py index 8753f15..aae1d8b 100644 --- a/lora_diffusion/lora.py +++ b/lora_diffusion/lora.py @@ -1,7 +1,12 @@ import json import math from itertools import groupby -from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union +import sys +if sys.version_info >= (3,9): + from typing import Type +else : + from typing_extensions import Type +from typing import Callable, Dict, List, Optional, Set, Tuple, Union import numpy as np import PIL @@ -914,7 +919,7 @@ def apply_learned_embed_in_clip( trained_tokens = list(learned_embeds.keys()) for token in trained_tokens: - print(token) + print("Adding new token: ", token) embeds = learned_embeds[token] # cast to dtype of 
text_encoder diff --git a/lora_diffusion/preprocess_files.py b/lora_diffusion/preprocess_files.py index bedb89f..315765a 100644 --- a/lora_diffusion/preprocess_files.py +++ b/lora_diffusion/preprocess_files.py @@ -2,7 +2,12 @@ # Have BLIP auto caption # Have CLIPSeg auto mask concept -from typing import List, Literal, Union, Optional, Tuple +import sys +if sys.version_info >= (3,8): + from typing import Literal +else : + from typing_extensions import Literal +from typing import List, Union, Optional, Tuple import os from PIL import Image, ImageFilter import torch @@ -244,7 +249,7 @@ def _center_of_mass(mask: Image.Image): def load_and_save_masks_and_captions( files: Union[str, List[str]], output_dir: str, - caption_text: Optional[str] = None, + caption_text: Optional[Union[List[str], str]] = None, target_prompts: Optional[Union[List[str], str]] = None, target_size: int = 512, crop_based_on_salience: bool = True, @@ -263,8 +268,10 @@ def load_and_save_masks_and_captions( # check if it is a directory if os.path.isdir(files): # get all the .png .jpg in the directory - files = glob.glob(os.path.join(files, "*.png")) + glob.glob( - os.path.join(files, "*.jpg") + files = ( + glob.glob(os.path.join(files, "*.png")) + + glob.glob(os.path.join(files, "*.jpg")) + + glob.glob(os.path.join(files, "*.jpeg")) ) if len(files) == 0: @@ -278,8 +285,10 @@ def load_and_save_masks_and_captions( images = [Image.open(file) for file in files] # captions - print(f"Generating {len(images)} captions...") - captions = blip_captioning_dataset(images, text=caption_text) + captions = caption_text + if not isinstance(caption_text, list): + print(f"Generating {len(images)} captions...") + captions = blip_captioning_dataset(images, text=caption_text) if target_prompts is None: target_prompts = captions @@ -325,3 +334,7 @@ def main(): fire.Fire(load_and_save_masks_and_captions) + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index 89eebcd..f05192c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ safetensors opencv-python torchvision mediapipe +typing_extensions; python_version < '3.9' \ No newline at end of file diff --git a/setup.py b/setup.py index 6d286b3..2b5e609 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="lora_diffusion", py_modules=["lora_diffusion"], - version="0.1.7", + version="0.1.8", description="Low Rank Adaptation for Diffusion Models. Works with Stable Diffusion out-of-the-box.", author="Simo Ryu", packages=find_packages(), diff --git a/textual_inversion.png b/textual_inversion.png new file mode 100644 index 0000000..8c39982 Binary files /dev/null and b/textual_inversion.png differ
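# Usage sketch for the caption changes in preprocess_files.py above: caption_text now accepts either a
# guiding string (BLIP auto-captioning) or a pre-written list of captions, in which case BLIP is skipped.
# The paths and caption strings below are illustrative assumptions, not part of this patch; a supplied
# list is assumed to provide one caption per image found.
from lora_diffusion.preprocess_files import load_and_save_masks_and_captions

# Auto-caption every image in the folder with BLIP, then save masks/captions for training:
load_and_save_masks_and_captions("raw_imgs/", output_dir="training_data/")

# Or supply explicit captions and skip BLIP entirely:
load_and_save_masks_and_captions(
    "raw_imgs/",
    output_dir="training_data/",
    caption_text=[
        "a photo of a person smiling",
        "a close-up photo of a person",
    ],
)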