-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: crawler.py
More file actions
112 lines (85 loc) · 3.48 KB
/
crawler.py
File metadata and controls
112 lines (85 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import time
from newspaper import build
import requests
from openai import OpenAI
import pandas as pd
import pickle
import numpy as np
import nltk

# Sentence tokenizer data required by newspaper's article.nlp() summarization.
nltk.download('punkt_tab')

# SECURITY: never commit API keys to source control. The previously hard-coded
# key must be revoked. OpenAI() reads the OPENAI_API_KEY environment variable
# by default.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def crawl(url, num=10):
    """Crawl up to *num* recent articles from a news site.

    Builds a newspaper source for *url*, downloads each article manually via
    requests (for explicit header/timeout control), then parses and runs
    newspaper's NLP pass so ``title`` and ``summary`` are populated.

    Args:
        url: Home page of the news site (e.g. "https://cnn.com").
        num: Maximum number of articles to collect (default 10, so callers
            may omit it).

    Returns:
        List of successfully processed newspaper Article objects.

    Side effects:
        Writes the collected titles to ``title.txt`` and prints progress.
    """
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    cnn_paper = build(url, memoize_articles=False)
    recent_articles = []
    for article in cnn_paper.articles:
        if len(recent_articles) >= num:
            break
        try:
            # Manually download using requests for better control
            response = requests.get(article.url, headers=HEADERS, timeout=10)
            if response.status_code != 200:
                continue
            # Load into newspaper3k
            article.download(input_html=response.text)
            article.parse()
            article.nlp()
            # Store result
            recent_articles.append(article)
            print(f"{len(recent_articles)} {article.title}")
            # Be polite to the server between requests.
            time.sleep(2)
        except Exception as e:
            # Best-effort crawl: log and skip articles that fail to fetch/parse.
            print(f"Error processing {article.url}: {e}")
            time.sleep(2)
    with open("title.txt", "w", encoding="utf-8") as file:
        for item in recent_articles:
            file.write(item.title + "\n")
    print("\n✅ Crawled", len(recent_articles), "recent articles from CNN.")
    return recent_articles
def cosine_similarity(a, b):
    """Return the cosine similarity (dot product over norms) of vectors a and b."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator
def gen_embed(articles: list):
    """Embed each article's title + summary with the OpenAI embeddings API.

    Args:
        articles: Newspaper Article objects with ``title`` and ``summary``
            populated (i.e. already passed through ``article.nlp()``).

    Returns:
        List of ``(embedding, article)`` tuples, where ``embedding`` is the
        float vector from the "text-embedding-3-small" model.
    """
    # NOTE: a pickle-based disk cache previously lived here (dead, commented
    # out); removed. Reintroduce a cache keyed by title if API cost matters.
    embeddings = []
    for article in articles:
        text = article.title + " " + article.summary
        embed = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        ).data[0].embedding
        embeddings.append((embed, article))
    return embeddings
def rank(embedding_cache, interest, top_n):
    """Rank embedded articles by similarity to each interest prompt.

    Args:
        embedding_cache: List of ``(embedding, article)`` tuples, as returned
            by :func:`gen_embed`.
        interest: Iterable of interest-prompt strings.
        top_n: How many top articles to keep per prompt. (Renamed from ``len``,
            which shadowed the builtin.)

    Returns:
        List of ``(prompt, [article, ...])`` tuples, articles sorted by
        descending cosine similarity, at most ``top_n`` per prompt.
    """
    select = []
    for prompt in interest:
        prompt_emb = client.embeddings.create(
            input=prompt,
            model="text-embedding-3-small"
        ).data[0].embedding
        sims = [
            (article, cosine_similarity(prompt_emb, emb))
            for emb, article in embedding_cache
        ]
        sims.sort(key=lambda pair: pair[1], reverse=True)
        # Slice instead of indexing range(top_n): does not crash when fewer
        # articles than top_n are available.
        select.append((prompt, [article for article, _ in sims[:top_n]]))
    return select
if __name__ == "__main__":
    # Pipeline: crawl -> embed -> rank. The original called crawl() without
    # the required article count and passed raw articles plus a bare string
    # straight to rank(), which expects (embeddings, list-of-prompts, count).
    prompt = "AI Engineer who is building a startup"
    articles = crawl("https://cnn.com", 10)
    embeddings = gen_embed(articles)
    results = rank(embeddings, [prompt], 5)
    for interest_prompt, top_articles in results:
        print(f"\nTop articles for: {interest_prompt}")
        for art in top_articles:
            print(" -", art.title)