-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: crawler.py
More file actions
112 lines (85 loc) · 3.48 KB
/
crawler.py
File metadata and controls
112 lines (85 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import time
from newspaper import build
import requests
from openai import OpenAI
import pandas as pd
import pickle
import numpy as np
import nltk

# Sentence tokenizer data required by newspaper's article.nlp() summarization.
nltk.download('punkt_tab')

# SECURITY: never commit API keys to source control. The previously hard-coded
# key must be revoked. OpenAI() reads the OPENAI_API_KEY environment variable
# by default.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def crawl(url, num=10):
    """Crawl up to *num* recent articles from a news site.

    Builds a newspaper source for *url*, downloads each article manually via
    requests (for explicit header/timeout control), then parses and runs
    newspaper's NLP pass so ``title`` and ``summary`` are populated.

    Args:
        url: Home page of the news site (e.g. "https://cnn.com").
        num: Maximum number of articles to collect (default 10, so callers
            may omit it).

    Returns:
        List of successfully processed newspaper Article objects.

    Side effects:
        Writes the collected titles to ``title.txt`` and prints progress.
    """
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    cnn_paper = build(url, memoize_articles=False)
    recent_articles = []
    for article in cnn_paper.articles:
        if len(recent_articles) >= num:
            break
        try:
            # Manually download using requests for better control
            response = requests.get(article.url, headers=HEADERS, timeout=10)
            if response.status_code != 200:
                continue
            # Load into newspaper3k
            article.download(input_html=response.text)
            article.parse()
            article.nlp()
            # Store result
            recent_articles.append(article)
            print(f"{len(recent_articles)} {article.title}")
            # Be polite to the server between requests.
            time.sleep(2)
        except Exception as e:
            # Best-effort crawl: log and skip articles that fail to fetch/parse.
            print(f"Error processing {article.url}: {e}")
            time.sleep(2)
    with open("title.txt", "w", encoding="utf-8") as file:
        for item in recent_articles:
            file.write(item.title + "\n")
    print("\n✅ Crawled", len(recent_articles), "recent articles from CNN.")
    return recent_articles
def cosine_similarity(a, b):
    """Return the cosine similarity (dot product over norms) of vectors a and b."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator
def gen_embed(articles: list):
    """Embed each article's title + summary with the OpenAI embeddings API.

    Args:
        articles: Newspaper Article objects with ``title`` and ``summary``
            populated (i.e. already passed through ``article.nlp()``).

    Returns:
        List of ``(embedding, article)`` tuples, where ``embedding`` is the
        float vector from the "text-embedding-3-small" model.
    """
    # NOTE: a pickle-based disk cache previously lived here (dead, commented
    # out); removed. Reintroduce a cache keyed by title if API cost matters.
    embeddings = []
    for article in articles:
        text = article.title + " " + article.summary
        embed = client.embeddings.create(
            input=text,
            model="text-embedding-3-small",
        ).data[0].embedding
        embeddings.append((embed, article))
    return embeddings
def rank(embedding_cache, interest, top_n):
    """Rank embedded articles by similarity to each interest prompt.

    Args:
        embedding_cache: List of ``(embedding, article)`` tuples, as returned
            by :func:`gen_embed`.
        interest: Iterable of interest-prompt strings.
        top_n: How many top articles to keep per prompt. (Renamed from ``len``,
            which shadowed the builtin.)

    Returns:
        List of ``(prompt, [article, ...])`` tuples, articles sorted by
        descending cosine similarity, at most ``top_n`` per prompt.
    """
    select = []
    for prompt in interest:
        prompt_emb = client.embeddings.create(
            input=prompt,
            model="text-embedding-3-small"
        ).data[0].embedding
        sims = [
            (article, cosine_similarity(prompt_emb, emb))
            for emb, article in embedding_cache
        ]
        sims.sort(key=lambda pair: pair[1], reverse=True)
        # Slice instead of indexing range(top_n): does not crash when fewer
        # articles than top_n are available.
        select.append((prompt, [article for article, _ in sims[:top_n]]))
    return select
if __name__ == "__main__":
    # Pipeline: crawl -> embed -> rank. The original called crawl() without
    # the required article count and passed raw articles plus a bare string
    # straight to rank(), which expects (embeddings, list-of-prompts, count).
    prompt = "AI Engineer who is building a startup"
    articles = crawl("https://cnn.com", 10)
    embeddings = gen_embed(articles)
    results = rank(embeddings, [prompt], 5)
    for interest_prompt, top_articles in results:
        print(f"\nTop articles for: {interest_prompt}")
        for art in top_articles:
            print(" -", art.title)