forked from WardCunningham/search
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.rb
More file actions
170 lines (156 loc) · 4.6 KB
/
scrape.rb
File metadata and controls
170 lines (156 loc) · 4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
require 'json'
require 'fileutils'
# Convert a page title to its federated-wiki slug: whitespace becomes
# hyphens, every other non-alphanumeric character is dropped, and the
# result is lowercased.
def asSlug title
  hyphenated = title.gsub(/\s/, '-')
  # tr-style class: keep letters, digits and hyphen, delete the rest.
  hyphenated.delete('^A-Za-z0-9-').downcase
end
# Human-readable age of a wiki date (milliseconds since the epoch),
# e.g. "3 days ago".  Both "now" and the given date are floored to
# whole days before subtracting.
def ago date
  seconds_per_day = 60 * 60 * 24
  today = Time.now.to_i / seconds_per_day
  that_day = date.to_i / 1000 / seconds_per_day
  "#{today - that_day} days ago"
end
# Iterate over every locally-known wiki site (one directory per site
# under sites/), fetch each site's sitemap over HTTP, and yield the
# pair (site, parsed_sitemap) for every site that answers with valid
# JSON.  Failures are reported on stdout and the site is skipped.
def sites
  # Index ► Ruby:scrape ► read Sites:dir
  Dir.glob 'sites/*' do |each|
    path, site = each.split '/' # e.g. "sites/ward.fed.wiki" => "ward.fed.wiki"
    begin
      # Friendly names for the curl exit codes we see most often.
      codes = {
        6 => "unknown host",
        28 => "request timeout"
      }
      # Index ► Ruby:scrape ► get Wiki:sitemap
      # curl: -s silent, -m 12 twelve-second overall timeout, -L follow redirects.
      text = `curl -s -m 12 -L http://#{site}/system/sitemap.json`
      raise "curl #{codes[$?.exitstatus]||$?}" if $?!=0
      raise "empty response" if text.length == 0
      if text[0] == '<'
        # Modifier-if evaluates the match first, so $~ already holds the
        # <title> capture when the raise runs.
        raise $~.captures[0] if text.match(/<title>(.+?)<\/title>/)
        raise "looks like html"
      end
      # A sitemap is a JSON array; anything else is rejected up front.
      raise "not expected json" if text[0] != '['
      sitemap = JSON.parse text
      yield site, sitemap
    rescue => e
      # One-line error summary: truncated to ~50 chars, whitespace collapsed.
      puts "#{site}, sitemap: #{e.to_s[0..50].gsub(/\s+/,' ')}"
    end
  end
end
# Write one aspect of a page (words, links, sites, items or plugins)
# to sites/<site>/pages/<slug>/<aspect>.txt, one entry per line.
# The block supplies the list of entries; when it returns an empty
# list nothing is written and no directory is created.
def scrape site, slug, aspect
  result = yield
  return if result.empty?
  dirname = "sites/#{site}/pages/#{slug}"
  # mkdir_p is a no-op for an existing directory, so no exists? guard
  # is needed (the original wrapped this in a redundant unless).
  FileUtils.mkdir_p dirname
  # Index ► Ruby:scrape ► write Pages:words.txt ► write Pages:sites.txt
  File.open "#{dirname}/#{aspect}.txt", 'w' do |file|
    result.each { |entry| file.puts entry }
  end
  puts "\t\t\t#{aspect} #{result.length}"
end
# Fetch one page's full JSON from the site and regenerate its index
# aspects — words, links, sites, items, plugins — each written as a
# sorted, de-duplicated list via scrape().  Any error is reported on
# stdout and the page is skipped.
def update site, pageinfo
  puts "\t#{pageinfo['title'].gsub(/\n/,'-')}, #{ago pageinfo['date']}"
  begin
    slug = pageinfo['slug']
    page = JSON.parse `curl -s -L http://#{site}/#{slug}.json`
    story = page['story'] || []
    journal = page['journal'] || []
    puts "\t\tstory #{story.length}, journal #{journal.length}"
    # Distinct lowercase words appearing in the story text.
    scrape site, slug, 'words' do
      words = []
      story.each do |item|
        next unless text = item['text']
        # Drop long base64-ish runs (e.g. data URIs) before indexing.
        # Note gsub! mutates the item's text in place.
        text.gsub! /[a-zA-Z0-9\+\/]{50,}/,''
        # Strip markup from html items; (.|\n) lets tags span lines.
        text.gsub! /<(.|\n)*?>/, ' ' if item['type'] == 'html'
        # Replace external [url label] links with just the label.
        text.gsub! /\[((http|https|ftp):.*?) (.*?)\]/, '\3'
        text.scan /[A-Za-z]+/ do |word|
          word = word.downcase
          words.push word unless words.include? word
        end
      end
      words.sort
    end
    # Slugs this page links to: explicit item slugs plus [[wiki links]].
    scrape site, slug, 'links' do
      words = []
      story.each do |item|
        words.push item['slug'] if item['slug']
        next unless text = item['text']
        text.scan /\[\[([^\]]+)\]\]/ do |word|
          word = asSlug word[0]
          words.push word unless words.include? word
        end
      end
      words.sort
    end
    # Other sites referenced: item 'site' fields, roster host lists,
    # and journal entry origins.
    scrape site, slug, 'sites' do
      words = []
      story.each do |item|
        if word = item['site']
          words.push word.downcase unless words.include? word.downcase
        end
        if item['type'] == 'roster'
          # One hostname (optionally :port) per line of roster text.
          item['text'].scan /^(([a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)+)(:\d+)?)$/ do |word|
            word = word[0].downcase
            words.push word unless words.include? word
          end
          # item['text'].scan /^ROSTER ([A-Za-z0-9.-:]+\/[a-z0-9-]+)$/ do |word|
          # word = word[0].downcase
          # words.push word unless words.include? word
          # end
          # item['text'].scan /^REFERENCES ([A-Za-z0-9.-:]+\/[a-z0-9-]+)$/ do |word|
          # word = word[0].downcase
          # words.push word unless words.include? word
          # end
        end
      end
      journal.each do |item|
        if word = item['site']
          words.push word.downcase unless words.include? word.downcase
        end
      end
      words.sort
    end
    # Item identities: the alias when present, otherwise the item id.
    scrape site, slug, 'items' do
      words = []
      story.each do |item|
        id = item['alias'] || item['id']
        words.push id unless words.include? id
      end
      words.sort
    end
    # Distinct plugin types used by the story.
    scrape site, slug, 'plugins' do
      words = []
      story.each do |item|
        type = item['type']
        words.push type unless words.include? type
      end
      words.sort
    end
  rescue => e
    puts "\t#{pageinfo['slug']}, update: #{e}"
  end
end
# Report when this site was last scraped and record the present visit.
# Returns the previous scrape time in milliseconds since the epoch
# (matching wiki's JSON date convention), or 0 on a first visit.
def scraped site
  mark = "sites/#{site}/scraped"
  since = File.exist?(mark) ? File.mtime(mark).to_i * 1000 : 0
  FileUtils.touch mark
  since
end
# Main: visit every known site, rescrape any page changed since the
# last run, and collect a per-site summary.  The local is named
# `summary` (not `scraped`) so it no longer shadows the scraped()
# helper above — the original reuse of the name only worked because a
# call with an argument still resolves to the method.
summary = []
sites do |site, sitemap|
  since = scraped site
  newest = 0
  puts "#{site}, #{sitemap.length} pages"
  sitemap.each do |pageinfo|
    date = pageinfo['date'] || 1
    newest = date if date > newest
    # next if pageinfo['slug'] != 'gtexplainerexamples'
    # update site, pageinfo
    update site, pageinfo if date > since
  end
  summary << {site:site, pages:sitemap.length, date:newest}
end
# Debug ► Ruby:scrape ► writes Public:scraped.json
File.open('public/scraped.json', 'w') do |file|
  file.puts summary.to_json
end