synthetic_data_generator/main.py at master · jehumtine/synthetic_data_generator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from langchain import PromptTemplate, LLMChain, HuggingFaceHub
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import os
import PyPDF2
from transformers import AutoTokenizer
import requests
from pathlib import Path
from tqdm import tqdm
import re
import json


# Seed the Hugging Face Hub credential only when it is not already present in
# the environment, so a real token exported by the caller is never replaced by
# the empty placeholder.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", '')

# Tokenizer used further down to cut the extracted text into token windows.
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/gpt4all-falcon")
history = {'internal': [], 'visible': []}
command = ""
# Prompt sent to the model: the text chunk is substituted for {question} and is
# followed by the task instruction. Built from adjacent string literals so that
# no quote characters, line continuations or source indentation leak into the
# text the model actually receives.
template = (
    "{question}\n"
    "Task: You are an API that converts bodies of text into a single question "
    "and answer into a JSON format. Each JSON contains a single question with "
    "a single answer. Only respond with the JSON and no additional text.\n"
)
prompt = PromptTemplate(template=template, input_variables=["question"])
# Machine-specific locations of the local GPT4All weights and the PDF corpus.
model_path = '/home/jehu/.local/share/nomic.ai/GPT4All/ggml-model-gpt4all-falcon-q4_0.bin'
folder_path = '/home/jehu/Documents/law data/data'
# Stream generated tokens to stdout while the model is running.
callbacks = [StreamingStdOutCallbackHandler()]
llm = GPT4All(model=model_path, callbacks=callbacks, verbose=True)
llm_chain = LLMChain(prompt=prompt, llm=llm)


def read_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text extracted from each page, concatenated in page order with no
        separator between pages.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly and join once instead of growing a string
        # with "+=". The `or ""` is a defensive guard so that a page yielding
        # no text cannot break the join.
        return "".join(page.extract_text() or "" for page in reader.pages)

def read_documents(folder_path):
    """Concatenate the text of every PDF found directly inside *folder_path*.

    Every directory entry's path is printed; only entries whose name ends in
    ".pdf" (compared case-insensitively) are read.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The text of each PDF followed by a blank line ("\\n\\n"), in sorted
        filename order. An empty string when the folder holds no PDFs.
    """
    parts = []
    # os.listdir returns entries in arbitrary order; sorting keeps the combined
    # text (and therefore the downstream chunk boundaries) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        print("The File path is"+ file_path)
        if filename.lower().endswith(".pdf"):
            parts.append(read_pdf(file_path) + "\n\n")
    return "".join(parts)

def tokenize(text):
    """Encode *text* with the module-level ``tokenizer`` and return the result."""
    return tokenizer.encode(text)

def chunks(lst, n):
    """Yield consecutive slices of *lst*, each holding at most *n* items.

    The final slice is shorter when ``len(lst)`` is not a multiple of *n*.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)

def is_json(data):
    """Return True when *data* parses as a JSON document, otherwise False.

    Any valid JSON value counts (object, array, number, string, literal).

    Args:
        data: Candidate JSON text.

    Returns:
        True if ``json.loads(data)`` succeeds, False if it is rejected.
    """
    try:
        json.loads(data)
    except (ValueError, TypeError):
        # ValueError covers malformed JSON (JSONDecodeError is a subclass);
        # TypeError covers inputs that are not text at all, such as None.
        return False
    return True


def _extract_json(response):
    """Look for a JSON value in *response* and return ``(found, value)``."""
    # Candidates in priority order: the whole reply, the contents of
    # triple-backtick fences (optionally tagged "json"), then inline
    # single-backtick spans. Fences are tried before single backticks because
    # a lone-backtick pattern matches the empty text between the first two
    # backticks of a fence.
    candidates = [response]
    candidates.extend(
        re.findall(r'```(?:json)?\s*(.*?)```', response, re.S | re.I)
    )
    candidates.extend(re.findall(r'`([^`]+)`', response))
    for candidate in candidates:
        try:
            return True, json.loads(candidate)
        except ValueError:
            continue
    return False, None


def submit_to_llm(chunk, retries=3):
    """Ask the LLM chain to turn *chunk* into a question/answer JSON value.

    The reply is accepted when it is JSON as a whole, or when JSON can be
    found inside a back-tick quoted section of it. Otherwise the request is
    repeated.

    Args:
        chunk: Text to send; surrounding whitespace is stripped first.
        retries: Maximum number of attempts.

    Returns:
        The decoded JSON value, or None when no attempt produced parseable
        JSON.
    """
    for i in range(retries):
        print(i)
        print("th try to communicate to llm")
        try:
            response = llm_chain.run(chunk.strip())
        except requests.exceptions.RequestException as e:
            # Report the failure instead of swallowing it, then try again.
            print(f"Request error on attempt {i + 1}: {e}")
            continue
        found, parsed = _extract_json(response)
        if found:
            print(response)
            return parsed
        print("Request failed:")
        print(response)
        if i + 1 < retries:
            print(f"Attempt {i + 1} failed. Retrying...")
    print("Max retries exceeded. Skipping this chunk.")
    return None

# Pipeline: extract text from the PDFs, cut it into 256-token windows, ask the
# model for one question/answer JSON per window, and append each answer to
# responses.json as soon as it arrives so progress survives an interruption.
print("Extracting Texts From PDF........")
text = read_documents(folder_path)
tokens = tokenize(text)

token_chunks = list(chunks(tokens, 256))
print("Done Tokenizing........")
responses = []
output_path = 'responses.json'
# A separator is needed whenever the file already holds something, whether it
# was written earlier in this run or by a previous run (the file is opened in
# append mode). Keying this on the chunk counter instead would emit a leading
# comma when the first chunk fails and no comma between runs.
needs_separator = os.path.exists(output_path) and os.path.getsize(output_path) > 0
q = 0
for q, chunk in enumerate(token_chunks, start=1):
    print(q)
    response = submit_to_llm(tokenizer.decode(chunk))
    if response is None:
        print("Response is NON")
        continue
    # NOTE: the file holds comma-separated JSON values without enclosing
    # brackets; wrap its contents in "[" and "]" to load it as one JSON array.
    with open(output_path, 'a') as f:
        if needs_separator:
            f.write(",\n")
        json.dump(response, f)
    needs_separator = True

# Write responses to a JSON file
##with open('responses.json', 'w') as f:
##    json.dump(responses, f)