@@ -1,7 +1,5 @@
 import streamlit as st
 
-from clip.utils import get_images
-
 # this caches the output so that it is stored and the function is not called
 # again and again, which saves time. `allow_output_mutation=True` tells
 # Streamlit not to hash the output of this function, and we can get away with it
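Aside (not part of the diff): the comment above refers to Streamlit's legacy `st.cache` decorator. A minimal, self-contained sketch of the behaviour it relies on — the decorated loader runs once, later reruns reuse the cached object, and `allow_output_mutation=True` lets that object be returned without being hashed:

```python
import streamlit as st

# Toy cached loader: the body runs only on the first call; later reruns of the
# script get the same object back from the cache.
@st.cache(allow_output_mutation=True, show_spinner=False)
def load_state():
    return {"reruns_seen": 0}  # stand-in for an expensive model load

state = load_state()
state["reruns_seen"] += 1       # mutation is fine because the output is not hashed
st.write(state["reruns_seen"])  # keeps counting up: the same dict survives reruns
```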
@@ -10,59 +8,67 @@
 @st.cache(allow_output_mutation=True, show_spinner=False)
 def get_cross_modal_search_models():
     from clip.clip import CLIP
-    return {
-        'CLIP': CLIP()
-    }
+    return CLIP()
 
 # load all the models before the app starts
-with st.spinner('Downloading and Loading Model with Vocabulary...'):
-    MODELS = get_cross_modal_search_models()
+with st.spinner('Loading Model with Vocabulary ... (might take some time)'):
+    model = get_cross_modal_search_models()
+
+st.write(f'''
+# Image Searching App
 
-st.write('''
-# NL-Images
-CLIP is used to perform Cross Modal Search:
-- CLIP: CLIP (Contrastive Language-Image Pre-Training) is a neural network that
-consists of a image encoder and a text encoder. It predicts the similarity between
-the given images and textual descriptions.
+Find images using text and yes, there's an easter egg.
 ''')
 
-model_name = st.sidebar.selectbox(
-    'Please select your app',
-    ["CLIP"]
+app_mode = st.sidebar.selectbox(
+    'Please select a task',
+    ["Text Search", "Image Search", "Text to Text Similarity"]
 )
 
-if model_name != "CLIP":
-    st.write("Use `CLIP` model!")
-    model = MODELS['CLIP']
+st.write('''Upload more images to the cache if you want to add more!''')
+images = st.file_uploader("Images", accept_multiple_files=True, type=['png', 'jpg', 'jpeg'])
+
+if st.button("Upload") and len(images):
+    out = model.upload_images(images)
+    st.write(out)
+    st.write(f'''{model.n_images}''')
 
-if model_name == "CLIP":
-    st.write("### `CLIP` Model")
-    st.write("Please upload images and write text of your choice")
-    st.write("Note: Write each description in a new line")
-    model = MODELS['CLIP']
+# slider to select the number of images to display
+n_images = st.slider('Number of images to see', min_value=1, max_value=model.n_images)
 
-images = st.file_uploader("Images", accept_multiple_files=True, type=['png', 'jpg'])
+if app_mode == "Image Search":
+    st.write('''### Image Search''')
+    st.write(f"Upload any image for similarity search. Searching {n_images} images!")
+    image = st.file_uploader("Images", accept_multiple_files=False, type=['png', 'jpg', 'jpeg'])
+    if st.button("Process") and image:
+        out = model.visual_search(image, n_images)
+        for x in out:
+            st.image(x)
 
-if len(images) != 0:
-    images, image_grid = get_images(images)
-    st.image(image_grid)
+elif app_mode == "Text Search":
+    st.write('''### Text Search''')
+    text = st.text_input(f"Add the text to search. Searching {n_images} images!")
+    if st.button("Process") and text:
+        out = model.text_search(text, n_images)
+        for x in out:
+            st.image(x)
 
-default_ = "a person stuck in traffic\na apple on the table\na garden of sunflowers"
-text = st.text_area("Text", value=default_, key="Text")
-text = text.splitlines()
+elif app_mode == "Text to Text Similarity":
+    st.write('''### Text to Text Similarity
+
+This requires two inputs: the first is the memory against which the second
+input, the query, is checked.''')
 
-# `transpose_flag` tells against which input, should softmax be calculated
-# ie. if transpose_flag = False -> sum(text[i]) == 1 but sum(images[i]) != 1
-# ie. if transpose_flag = True -> sum(text[i]) != 1 but sum(images[i]) == 1
-transpose_flag = st.radio('Priority', ['Image', 'Text'])
-if len(images) == 1:
-    transpose_flag = True
-elif len(text) == 1:
-    transpose_flag = False
-else:
-    transpose_flag = True if transpose_flag == 'Image' else False
+    default_ = '''How can I sample from the EMNIST letters dataset?
+Simple, efficient way to create Dataset?
+How to use multiple models to perform inference on same data in parallel?
+Get target list from Dataset
+Sparse dataset and dataloader
+Element-Wise Max Between Two Tensors?'''
+    memory = st.text_area("Memory", value=default_)
+    query = st.text_input("Query", value="Can I run multiple models in parallel?")
+    matches = model.text_to_text_similarity(memory.split("\n"), query)
 
-if st.button("Predict"):
-    with st.spinner('Predicting...'):
-        output = model.eval(images, text, transpose_flag)
-        st.write(output)
+    if st.button("Process"):
+        st.write("**Query**: " + query)
+        st.write("\n".join([f"- {m}" for m in matches]))
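The removed `transpose_flag` comment above describes which axis the softmax is taken over. An illustrative PyTorch sketch of that distinction — the layout (rows as texts, columns as images) and the values are assumptions made for the example, not code from the repository:

```python
import torch
import torch.nn.functional as F

# Made-up similarity logits; assume rows index texts and columns index images.
logits = torch.tensor([[2.0, 0.5, 0.1],
                       [0.3, 1.5, 0.2]])

# transpose_flag = False: each text gets a distribution over the images,
# so every row sums to 1 (sum(text[i]) == 1) while columns generally do not.
per_text = F.softmax(logits, dim=1)

# transpose_flag = True: each image gets a distribution over the texts,
# so every column sums to 1 (sum(images[i]) == 1) while rows generally do not.
per_image = F.softmax(logits, dim=0)

print(per_text.sum(dim=1))   # tensor([1., 1.])
print(per_image.sum(dim=0))  # tensor([1., 1., 1.])
```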
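The new `Text to Text Similarity` mode calls `model.text_to_text_similarity(memory, query)`, whose implementation is not shown in this diff. A rough sketch of how such a method could work on top of OpenAI's CLIP text encoder — the function name, the `top_k` parameter, and the ranking logic are illustrative assumptions, not the repository's actual wrapper:

```python
import torch
import clip  # OpenAI's CLIP package: https://github.com/openai/CLIP

def text_to_text_similarity_sketch(memory, query, top_k=3, device="cpu"):
    """Rank the `memory` sentences by cosine similarity to `query`.
    Illustrative only; the app's CLIP wrapper may differ."""
    model, _ = clip.load("ViT-B/32", device=device)
    with torch.no_grad():
        mem_emb = model.encode_text(clip.tokenize(memory).to(device)).float()
        qry_emb = model.encode_text(clip.tokenize([query]).to(device)).float()
    # Normalise so the dot product equals cosine similarity.
    mem_emb = mem_emb / mem_emb.norm(dim=-1, keepdim=True)
    qry_emb = qry_emb / qry_emb.norm(dim=-1, keepdim=True)
    scores = (mem_emb @ qry_emb.T).squeeze(1)   # one score per memory line
    best = scores.topk(min(top_k, len(memory))).indices.tolist()
    return [memory[i] for i in best]
```

The image modes (`text_search`, `visual_search`) presumably follow the same pattern, with `encode_image` on one side of the comparison.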