forked from WardCunningham/search
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.rb
More file actions
170 lines (156 loc) · 4.6 KB
/
scrape.rb
File metadata and controls
170 lines (156 loc) · 4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
require 'json'
require 'fileutils'
# Convert a page title to its federated-wiki slug: whitespace becomes
# hyphens, every other non-alphanumeric character is dropped, and the
# result is lowercased.
def asSlug title
  hyphenated = title.gsub(/\s/, '-')
  # tr-style class: keep letters, digits and hyphen, delete the rest.
  hyphenated.delete('^A-Za-z0-9-').downcase
end
# Human-readable age of a wiki date (milliseconds since the epoch),
# e.g. "3 days ago".  Both "now" and the given date are floored to
# whole days before subtracting.
def ago date
  seconds_per_day = 60 * 60 * 24
  today = Time.now.to_i / seconds_per_day
  that_day = date.to_i / 1000 / seconds_per_day
  "#{today - that_day} days ago"
end
# Iterate over every locally-known wiki site (one directory per site
# under sites/), fetch each site's sitemap over HTTP, and yield the
# pair (site, parsed_sitemap) for every site that answers with valid
# JSON.  Failures are reported on stdout and the site is skipped.
def sites
  # Index ► Ruby:scrape ► read Sites:dir
  Dir.glob 'sites/*' do |each|
    path, site = each.split '/' # e.g. "sites/ward.fed.wiki" => "ward.fed.wiki"
    begin
      # Friendly names for the curl exit codes we see most often.
      codes = {
        6 => "unknown host",
        28 => "request timeout"
      }
      # Index ► Ruby:scrape ► get Wiki:sitemap
      # curl: -s silent, -m 12 twelve-second overall timeout, -L follow redirects.
      text = `curl -s -m 12 -L http://#{site}/system/sitemap.json`
      raise "curl #{codes[$?.exitstatus]||$?}" if $?!=0
      raise "empty response" if text.length == 0
      if text[0] == '<'
        # Modifier-if evaluates the match first, so $~ already holds the
        # <title> capture when the raise runs.
        raise $~.captures[0] if text.match(/<title>(.+?)<\/title>/)
        raise "looks like html"
      end
      # A sitemap is a JSON array; anything else is rejected up front.
      raise "not expected json" if text[0] != '['
      sitemap = JSON.parse text
      yield site, sitemap
    rescue => e
      # One-line error summary: truncated to ~50 chars, whitespace collapsed.
      puts "#{site}, sitemap: #{e.to_s[0..50].gsub(/\s+/,' ')}"
    end
  end
end
# Write one aspect of a page (words, links, sites, items or plugins)
# to sites/<site>/pages/<slug>/<aspect>.txt, one entry per line.
# The block supplies the list of entries; when it returns an empty
# list nothing is written and no directory is created.
def scrape site, slug, aspect
  result = yield
  return if result.empty?
  dirname = "sites/#{site}/pages/#{slug}"
  # mkdir_p is a no-op for an existing directory, so no exists? guard
  # is needed (the original wrapped this in a redundant unless).
  FileUtils.mkdir_p dirname
  # Index ► Ruby:scrape ► write Pages:words.txt ► write Pages:sites.txt
  File.open "#{dirname}/#{aspect}.txt", 'w' do |file|
    result.each { |entry| file.puts entry }
  end
  puts "\t\t\t#{aspect} #{result.length}"
end
# Fetch one page's full JSON from the site and regenerate its index
# aspects — words, links, sites, items, plugins — each written as a
# sorted, de-duplicated list via scrape().  Any error is reported on
# stdout and the page is skipped.
def update site, pageinfo
  puts "\t#{pageinfo['title'].gsub(/\n/,'-')}, #{ago pageinfo['date']}"
  begin
    slug = pageinfo['slug']
    page = JSON.parse `curl -s -L http://#{site}/#{slug}.json`
    story = page['story'] || []
    journal = page['journal'] || []
    puts "\t\tstory #{story.length}, journal #{journal.length}"
    # Distinct lowercase words appearing in the story text.
    scrape site, slug, 'words' do
      words = []
      story.each do |item|
        next unless text = item['text']
        # Drop long base64-ish runs (e.g. data URIs) before indexing.
        # Note gsub! mutates the item's text in place.
        text.gsub! /[a-zA-Z0-9\+\/]{50,}/,''
        # Strip markup from html items; (.|\n) lets tags span lines.
        text.gsub! /<(.|\n)*?>/, ' ' if item['type'] == 'html'
        # Replace external [url label] links with just the label.
        text.gsub! /\[((http|https|ftp):.*?) (.*?)\]/, '\3'
        text.scan /[A-Za-z]+/ do |word|
          word = word.downcase
          words.push word unless words.include? word
        end
      end
      words.sort
    end
    # Slugs this page links to: explicit item slugs plus [[wiki links]].
    scrape site, slug, 'links' do
      words = []
      story.each do |item|
        words.push item['slug'] if item['slug']
        next unless text = item['text']
        text.scan /\[\[([^\]]+)\]\]/ do |word|
          word = asSlug word[0]
          words.push word unless words.include? word
        end
      end
      words.sort
    end
    # Other sites referenced: item 'site' fields, roster host lists,
    # and journal entry origins.
    scrape site, slug, 'sites' do
      words = []
      story.each do |item|
        if word = item['site']
          words.push word.downcase unless words.include? word.downcase
        end
        if item['type'] == 'roster'
          # One hostname (optionally :port) per line of roster text.
          item['text'].scan /^(([a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)+)(:\d+)?)$/ do |word|
            word = word[0].downcase
            words.push word unless words.include? word
          end
          # item['text'].scan /^ROSTER ([A-Za-z0-9.-:]+\/[a-z0-9-]+)$/ do |word|
          # word = word[0].downcase
          # words.push word unless words.include? word
          # end
          # item['text'].scan /^REFERENCES ([A-Za-z0-9.-:]+\/[a-z0-9-]+)$/ do |word|
          # word = word[0].downcase
          # words.push word unless words.include? word
          # end
        end
      end
      journal.each do |item|
        if word = item['site']
          words.push word.downcase unless words.include? word.downcase
        end
      end
      words.sort
    end
    # Item identities: the alias when present, otherwise the item id.
    scrape site, slug, 'items' do
      words = []
      story.each do |item|
        id = item['alias'] || item['id']
        words.push id unless words.include? id
      end
      words.sort
    end
    # Distinct plugin types used by the story.
    scrape site, slug, 'plugins' do
      words = []
      story.each do |item|
        type = item['type']
        words.push type unless words.include? type
      end
      words.sort
    end
  rescue => e
    puts "\t#{pageinfo['slug']}, update: #{e}"
  end
end
# Report when this site was last scraped and record the present visit.
# Returns the previous scrape time in milliseconds since the epoch
# (matching wiki's JSON date convention), or 0 on a first visit.
def scraped site
  mark = "sites/#{site}/scraped"
  since = File.exist?(mark) ? File.mtime(mark).to_i * 1000 : 0
  FileUtils.touch mark
  since
end
# Main: visit every known site, rescrape any page changed since the
# last run, and collect a per-site summary.  The local is named
# `summary` (not `scraped`) so it no longer shadows the scraped()
# helper above — the original reuse of the name only worked because a
# call with an argument still resolves to the method.
summary = []
sites do |site, sitemap|
  since = scraped site
  newest = 0
  puts "#{site}, #{sitemap.length} pages"
  sitemap.each do |pageinfo|
    date = pageinfo['date'] || 1
    newest = date if date > newest
    # next if pageinfo['slug'] != 'gtexplainerexamples'
    # update site, pageinfo
    update site, pageinfo if date > since
  end
  summary << {site:site, pages:sitemap.length, date:newest}
end
# Debug ► Ruby:scrape ► writes Public:scraped.json
File.open('public/scraped.json', 'w') do |file|
  file.puts summary.to_json
end