aiGeneratedTextDetection/app.py at main · itzsoomit/aiGeneratedTextDetection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from collections import OrderedDict
import re
import nltk
from nltk.tokenize import sent_tokenize
import joblib
import matplotlib.pyplot as plt

#activate virtual environment first
# for perplexity score
model_name="gpt2"  #or any gpt variant
model=GPT2LMHeadModel.from_pretrained(model_name)
tokenizer=GPT2Tokenizer.from_pretrained(model_name)

def calculate_perplexity(line):
    tokens = tokenizer.encode(line, return_tensors='pt')
    perplexity=model.forward(tokens,labels=tokens)["loss"].exp().item()
    return perplexity

def getResult(perplexity):
    threshold1=60
    threshold2=80
    if perplexity<threshold1:
        st.markdown("The Text is generated by AI with [perplexity](https://en.wikipedia.org/wiki/Perplexity#:~:text=In%20information%20theory%2C%20perplexity%20is,be%20drawn%20from%20the%20distribution.) score of {:.2f}".format(perplexity))
        return
    elif perplexity<threshold2:
        "The Text is most probably contain parts which are generated by AI. (require more text for better Judgement)"
        return
    else:
        st.markdown("The Text is written by Human with [perplexity](https://en.wikipedia.org/wiki/Perplexity#:~:text=In%20information%20theory%2C%20perplexity%20is,be%20drawn%20from%20the%20distribution.) score of {:.2f}".format(perplexity))
        return

def predict(sentence):
    result=OrderedDict()
    lines = sent_tokenize(sentence)
    Perplexity_per_line=[]
    for line in lines:
        ppl=calculate_perplexity(line)
        Perplexity_per_line.append(ppl)
    result["perplexity"]=sum(Perplexity_per_line)/len(Perplexity_per_line)

    out=getResult(result["perplexity"])


#from trained model

ensemble_model=joblib.load(r'C:\Users\sumit\Desktop\projects\detectAItext\ensemble_model.joblib')
tfidf_vectorizer=joblib.load(r'C:\Users\sumit\Desktop\projects\detectAItext\tfidf_vectorizer.joblib')


def main():
    nltk.download('punkt')

    st.title("Detect AI generated Text")
    user_input =  st.text_area("Enter your text here:",height=300)
    if st.button("Check"):
        if user_input:
            predict(user_input)
            test_x=tfidf_vectorizer.transform([user_input])
            result=ensemble_model.predict_proba(test_x)[:,1]
            ai=result[0]*100
            human=100-ai
            labels = ['Human Generated', 'AI Generated',]
            sizes = [human,ai]

            fig, ax = plt.subplots()
            wedges, texts, autotexts = ax.pie(sizes, labels=labels, autopct='%1.2f%%', startangle=90,
                                            textprops=dict(color="w"))

            # Set the background color to be transparent
            fig.patch.set_facecolor('none')
            ax.set_facecolor('none')

            # Set the text color of the autopct labels to be white
            for autotext in autotexts:
                autotext.set_color('white')

            # Display the pie chart using st.pyplot()
            st.pyplot(fig)
            # st.write(f"Probability (AI generated)= {ai:.2f} %")
            # st.write(f"Probability (Human generated)= {human:.2f} %")


        else:
            st.warning("Please enter text before pressing the 'Check' button.")

if __name__ == "__main__":
    main()