forked from Nikolair1/XHealth
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarize.py
More file actions
74 lines (63 loc) · 3.22 KB
/
summarize.py
File metadata and controls
74 lines (63 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from openai import OpenAI
import asyncio
import random
import json
# System prompt for the first completion: asks the model to bucket the raw
# tweets into four health categories (plus a discard bucket) and return one
# ~150-character summary per category as a single JSON object.
# Fix: corrected the typo "imrpove" -> "improve" in the category description.
prompt = """You are a tweet analyzer, extract all the information from these tweets that pertains to each of these categories:
*Disease Outbreaks and Public Health Emergencies* - New or breaking information about only epidemics and diseases around the world, avoid covid related updates unless they involve new variants
*Health Disparities and Equity* - Information regarding minority, marginalized, or lower-income individuals, avoid initiatives and awareness months/weeks
*Medical Research and Innovations* - advancements or studies to do with biotechnology
*Daily Health and Nutrition* - information regarding daily activities and nutrition and tips to improve daily living
*None of the above* - unnecessary information related to politics or awareness months or information that doesn't fall into the above categories
Find the 5 best tweets for each of the categories and then summarize them into a 150 character passage that uses correct grammar and is easy to follow as if there was no longer access to the tweets. Be as specific as possible while maintaining the paragraph format and end each summary with at most 2 relevant and positive hashtags. Use complete sentences and avoid semicolons and return them in the following format:
{"category1": "summary1", "category2": "summary2", "category3": "summary3", "category4": "summary4"}.
"""
# Project-local helper that posts the generated summaries to Twitter/X.
from utility.post_tweets import tweet
# Raw tweet dump to summarize, one tweet per line.
# NOTE(review): hard-coded to a single date's file — presumably updated by
# hand together with the `date` string at the bottom of the script; confirm.
file_path = "./data/april_21_tweets.txt"
async def create_summaries(date):
    """Summarize a file of health tweets into four category blurbs and post them.

    Reads the tweet dump at `file_path`, samples up to 200 random lines, asks
    the model for one short summary per category (retrying whenever any
    summary exceeds 275 characters, the tweet-length budget), generates one
    header per summary with a second completion, posts everything via
    `tweet`, and returns the list of four summaries.

    NOTE(review): declared async but contains no awaits — the OpenAI client
    calls below are synchronous and will block the event loop. Kept async so
    existing `asyncio.run(...)` callers are unaffected.

    Parameters:
        date: human-readable date string passed through to `tweet`.

    Returns:
        list[str]: the four category summaries, in the order
        [Daily Health, Disease Outbreaks, Health Disparities, Medical Research].

    Raises:
        json.JSONDecodeError / KeyError: if the model deviates from the
        requested JSON format (fail-fast, matching the original behavior).
    """
    client = OpenAI()
    # Close the file before the (slow) network calls start.
    with open(file_path, "r") as file:
        lines = file.readlines()
    # Random sample so repeated runs see a different slice of the dump.
    random.shuffle(lines)
    text = lines[:200]

    arr = []
    too_long = True
    while too_long:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user", "content": "".join(text)},
            ],
        )
        # `summaries` instead of `dict`: the original shadowed the builtin.
        summaries = json.loads(completion.choices[0].message.content)
        arr = [
            summaries["Daily Health and Nutrition"],
            summaries["Disease Outbreaks and Public Health Emergencies"],
            summaries["Health Disparities and Equity"],
            summaries["Medical Research and Innovations"],
        ]
        # Regenerate all four if any one would overflow a tweet.
        too_long = any(len(item) > 275 for item in arr)
        if too_long:
            print("Too long! Trying again. \n")

    # Second call: a bare JSON array with one short header per summary.
    completion2 = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": 'create a python array of 4 headers, indexed by integers, 1 for each of the items in this text, just output the array like text in the format ["header1", "header2"...]',
            },
            {"role": "user", "content": "".join(arr)},
        ],
        max_tokens=4096,
    )
    headers_raw = completion2.choices[0].message.content
    print("before JSON", headers_raw)
    headers = json.loads(headers_raw)
    # Argument order preserved from the original: header list first,
    # summary list second.
    tweet(headers, arr, date)
    return arr
# Date label displayed/posted alongside the summaries.
# NOTE(review): must be kept in sync with `file_path` by hand.
date = "April 21st, 2024"

# Guarded entry point so importing this module no longer fires the network
# calls as a side effect; running it as a script behaves exactly as before.
if __name__ == "__main__":
    asyncio.run(create_summaries(date))