-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathharvest_links.php
More file actions
127 lines (99 loc) · 2.96 KB
/
harvest_links.php
File metadata and controls
127 lines (99 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
<?
require_once("db.php");
require_once("utf8.php");
require_once("string.php");
require_once("date.php");
$suffixes = array("/index.html", "/index.htm", "/index.shtml", "/index.shtm", "/index.php", "/index.php3", "/index.php4", "/index.py", "/index.asp", "/default.asp", "/main.aspx", "/index.jsp");
function normalize($url)
{
global $suffixes;
# tar bort "http://"
$url = substr($url, 7);
if(string_startswith($url, "www."))
$url = substr($url, 4);
foreach($suffixes as $suffix)
if(string_endswith(strtolower($url), $suffix))
return remove_suffix($url, $suffix);
return $url;
}
function remove_suffix($word, $suffix)
{
$end = strlen($word) - strlen($suffix);
return substr($word, 0, $end);
}
function extract_unique_links($text)
{
$matches = array();
preg_match_all("/(<|<)a[^>]+href=('http:\/\/([^>]+)'|\"http:\/\/([^>]+)\")/Usi", $text, &$matches);
$urls = array();
foreach($matches[2] as $url)
{
$url = trim(trim($url, "'\"/"));
# Blogger har en egen länk i alla flöden
if($url == "http://help.blogger.com/bin/answer.py?answer=697") continue;
array_push($urls, normalize($url));
}
return array_unique($urls);
}
function extract_title($text)
{
# tittar endast på strängen fram till första inlägget
$pos = strpos($text, "<item");
if($pos === false)
$pos = strpos($text, "<entry");
if($pos !== false)
$text = substr($text, 0, $pos);
$matches = array();
if(preg_match("/<title( [^>]+)?>(<!\[CDATA\[)??(.*)(\]\]>)??<\/title>/Usi", $text, &$matches) == 1)
return $matches[3];
return "";
}
function harvest($time_begin, $time_end, $filename)
{
ini_set("user_agent", "BlogwalkBot/1.0 (+http://www.blogwalk.se/about/bot)");
# ta reda på vilka flöden som har uppdaterats den gångna veckan
$result = query(
"SELECT DISTINCT blog.`index`,name,feedurl
FROM post,blog
WHERE
post.blog = blog.`index`
AND time >= $time_begin AND time < $time_end");
# hämta alla flöden och plocka ut unika länkar ur varje
$links = array();
while($row = fetch_array($result))
{
$xmldata = @file_get_contents($row["feedurl"]);
if($xmldata === false)
{
echo "! inget svar från $row[name]\n";
continue;
}
# när vi ändå läser flöden ... hämtar och lagrar bloggtiteln
$title = extract_title($xmldata);
if($title != "")
{
prepare_string($title);
query("UPDATE blog SET feedtitle='$title' WHERE `index`=$row[index]");
}
foreach(extract_unique_links($xmldata) as $url)
$links[$url] += 1;
}
# hämta titel på de mest populära sidorna
arsort($links);
$toplinks = array();
foreach($links as $url => $hits)
{
if($hits <= 2) break;
$title = $url;
$htmldata = @file_get_contents("http://$url");
if($htmldata !== false)
{
$matches = array();
if(preg_match("/<title>(.*)<\/title>/Uis", $htmldata, &$matches) == 1)
$title = trim($matches[1]);
}
array_push($toplinks, array("title" => $title, "url" => $url, "hits" => $hits));
}
save_data($toplinks, "/home/davense/aggregator/links/$filename");
}
?>