|
| 1 | +require 'fileutils' |
| 2 | +require 'yaml' |
| 3 | + |
1 | 4 | gem 'spidr', '~> 0.3'
|
2 | 5 | require 'spidr'
|
3 |
| -require 'fileutils' |
4 | 6 |
|
5 |
| -IMPORT_DIR = '_import' |
| 7 | +gem 'kramdown', '~> 0.13' |
| 8 | +require 'kramdown' |
| 9 | + |
| 10 | +OUTPUT_DIR = '_import' |
| 11 | +LAYOUTS = { |
| 12 | + :default => 'page', |
| 13 | + :post => 'news_post' |
| 14 | +} |
6 | 15 |
|
7 | 16 | desc 'Spiders ruby-lang.org and imports HTML content'
|
8 | 17 | task :import do
|
|
11 | 20 | agent.ignore_links_like /\.cgi[\/]?$/
|
12 | 21 | agent.ignore_links_like /\/[a-z_]+\/old-man\//
|
13 | 22 |
|
14 |
| - agent.every_failed_url do |url| |
15 |
| - puts "Not Found #{url}!" |
16 |
| - end |
17 |
| - |
18 | 23 | agent.every_ok_page do |page|
|
19 |
| - local_path = File.join(IMPORT_DIR,page.url.path[1..-1]) |
| 24 | + path = page.url.path[1..-1] |
20 | 25 |
|
21 |
| - if local_path[-1..-1] == '/' |
22 |
| - local_path += 'index.html' |
23 |
| - elsif File.extname(local_path) == '' |
24 |
| - local_path += '/index.html' |
| 26 | + layout = LAYOUTS[:default] |
| 27 | + |
| 28 | + if path =~ %r{^[a-z_-]+/news/\d{4}/\d{1,2}/\d{1,2}/[^/]+/$} |
| 29 | + # map news posts into news/_posts/ |
| 30 | + dirs = path.split('/') |
| 31 | + local_path = File.join(OUTPUT_DIR,dirs[0,2],'_posts',dirs[2..-1].join('-')) + '.md' |
| 32 | + |
| 33 | + layout = LAYOUTS[:post] |
| 34 | + else |
| 35 | + # normal page |
| 36 | + local_path = File.join(OUTPUT_DIR,path) |
| 37 | + |
| 38 | + case File.extname(local_path) |
| 39 | + when '.html' |
| 40 | + local_path.gsub!(/\.html$/,'.md') |
| 41 | + when '' |
| 42 | + local_path += '/' unless local_path.end_with?('/') |
| 43 | + local_path += 'index.md' |
| 44 | + end |
25 | 45 | end
|
26 | 46 |
|
27 | 47 | # ensure the parent directory exists
|
28 | 48 | FileUtils.mkdir_p(File.dirname(local_path))
|
29 | 49 |
|
| 50 | + # don't overwrite existing files |
30 | 51 | unless File.exist?(local_path)
|
31 | 52 | puts "Saving #{page.url} -> #{local_path} ..."
|
32 | 53 |
|
33 |
| - File.open(local_path,'wb') do |file| |
| 54 | + File.open(local_path,'w') do |file| |
34 | 55 | if page.html?
|
35 |
| - layout = 'default' |
36 |
| - title = page.title |
37 |
| - page_div = page.doc.at('#page') |
| 56 | + header = { |
| 57 | + 'layout' => layout, |
| 58 | + 'title' => page.title.strip, |
| 59 | + 'lang' => path.split('/',2).first |
| 60 | + } |
| 61 | + |
| 62 | + # add the YAML header |
| 63 | + YAML.dump(header,file) |
38 | 64 |
|
39 |
| - if (header = page_div.at('#head')) |
40 |
| - if (header.inner_text.strip == title) |
41 |
| - layout = 'page' |
| 65 | + # YAML header separator |
| 66 | + file.puts '---' |
| 67 | + |
| 68 | + if (content_div = page.at('#content')) |
| 69 | + # remove all comments |
| 70 | + content_div.traverse do |node| |
| 71 | + node.remove if node.comment? |
42 | 72 | end
|
43 |
| - end |
44 | 73 |
|
45 |
| - file.puts( |
46 |
| - '---', |
47 |
| - "layout: #{layout}", |
48 |
| - "title: #{title}", |
49 |
| - '---' |
50 |
| - ) |
51 |
| - |
52 |
| - if (layout == 'default' && page_div) |
53 |
| - file.puts(page_div.to_html) |
54 |
| - elsif (content_div = page_div.at('#content')) |
55 |
| - file.puts(content_div.inner_html) |
| 74 | + # remove all page anchors |
| 75 | + content_div.search('//a[@id]').remove |
| 76 | + |
| 77 | + # replace all caps spans with their text |
| 78 | + content_div.search('span.caps').each do |span| |
| 79 | + span.replace(span.inner_text) |
| 80 | + end |
| 81 | + |
| 82 | + # map all code elements to their inner_text |
| 83 | + content_div.search('pre > code').each do |code| |
| 84 | + code.replace(code.children.map { |node| |
| 85 | + if node.name == 'br' |
| 86 | + $/ |
| 87 | + else |
| 88 | + node.inner_text |
| 89 | + end |
| 90 | + }.join) |
| 91 | + end |
| 92 | + |
| 93 | + # replace the #extended div with its children |
| 94 | + if (extended_div = content_div.at('#extended')) |
| 95 | + extended_div.replace(extended_div.inner_html) |
| 96 | + end |
| 97 | + |
| 98 | + # convert from HTML to Markdown |
| 99 | + content = Kramdown::Document.new( |
| 100 | + content_div.inner_html, |
| 101 | + :input => :html |
| 102 | + ).to_kramdown |
| 103 | + |
| 104 | + file.puts(content) |
56 | 105 | end
|
57 | 106 | else
|
58 | 107 | file.write(page.body)
|
|
0 commit comments