Skip to content

Commit c6137a2

Browse files
committed
Updated the import task to automatically convert to Markdown, and scrub the HTML.
1 parent 1e6cd05 commit c6137a2

File tree

1 file changed

+79
-30
lines changed

1 file changed

+79
-30
lines changed

_tasks/import.rb

Lines changed: 79 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
require 'fileutils'
2+
require 'yaml'
3+
14
gem 'spidr', '~> 0.3'
25
require 'spidr'
3-
require 'fileutils'
46

5-
IMPORT_DIR = '_import'
7+
gem 'kramdown', '~> 0.13'
8+
require 'kramdown'
9+
10+
OUTPUT_DIR = '_import'
11+
LAYOUTS = {
12+
:default => 'page',
13+
:post => 'news_post'
14+
}
615

716
desc 'Spiders ruby-lang.org and imports HTML content'
817
task :import do
@@ -11,48 +20,88 @@
1120
agent.ignore_links_like /\.cgi[\/]?$/
1221
agent.ignore_links_like /\/[a-z_]+\/old-man\//
1322

14-
agent.every_failed_url do |url|
15-
puts "Not Found #{url}!"
16-
end
17-
1823
agent.every_ok_page do |page|
19-
local_path = File.join(IMPORT_DIR,page.url.path[1..-1])
24+
path = page.url.path[1..-1]
2025

21-
if local_path[-1..-1] == '/'
22-
local_path += 'index.html'
23-
elsif File.extname(local_path) == ''
24-
local_path += '/index.html'
26+
layout = LAYOUTS[:default]
27+
28+
if path =~ %r{^[a-z_-]+/news/\d{4}/\d{1,2}/\d{1,2}/[^/]+/$}
29+
# map news posts into news/_posts/
30+
dirs = path.split('/')
31+
local_path = File.join(OUTPUT_DIR,dirs[0,2],'_posts',dirs[2..-1].join('-')) + '.md'
32+
33+
layout = LAYOUTS[:post]
34+
else
35+
# normal page
36+
local_path = File.join(OUTPUT_DIR,path)
37+
38+
case File.extname(local_path)
39+
when '.html'
40+
local_path.gsub!(/\.html$/,'.md')
41+
when ''
42+
local_path += '/' unless local_path.end_with?('/')
43+
local_path += 'index.md'
44+
end
2545
end
2646

2747
# ensure the parent directory exists
2848
FileUtils.mkdir_p(File.dirname(local_path))
2949

50+
# don't overwrite existing files
3051
unless File.exist?(local_path)
3152
puts "Saving #{page.url} -> #{local_path} ..."
3253

33-
File.open(local_path,'wb') do |file|
54+
File.open(local_path,'w') do |file|
3455
if page.html?
35-
layout = 'default'
36-
title = page.title
37-
page_div = page.doc.at('#page')
56+
header = {
57+
'layout' => layout,
58+
'title' => page.title.strip,
59+
'lang' => path.split('/',2).first
60+
}
61+
62+
# add the YAML header
63+
YAML.dump(header,file)
3864

39-
if (header = page_div.at('#head'))
40-
if (header.inner_text.strip == title)
41-
layout = 'page'
65+
# YAML header separator
66+
file.puts '---'
67+
68+
if (content_div = page.at('#content'))
69+
# remove all comments
70+
content_div.traverse do |node|
71+
node.remove if node.comment?
4272
end
43-
end
4473

45-
file.puts(
46-
'---',
47-
"layout: #{layout}",
48-
"title: #{title}",
49-
'---'
50-
)
51-
52-
if (layout == 'default' && page_div)
53-
file.puts(page_div.to_html)
54-
elsif (content_div = page_div.at('#content'))
55-
file.puts(content_div.inner_html)
74+
# remove all page anchors
75+
content_div.search('//a[@id]').remove
76+
77+
# replace all caps spans with their text
78+
content_div.search('span.caps').each do |span|
79+
span.replace(span.inner_text)
80+
end
81+
82+
# map all code elements to their inner_text
83+
content_div.search('pre > code').each do |code|
84+
code.replace(code.children.map { |node|
85+
if node.name == 'br'
86+
$/
87+
else
88+
node.inner_text
89+
end
90+
}.join)
91+
end
92+
93+
# replace the #extended div with its children
94+
if (extended_div = content_div.at('#extended'))
95+
extended_div.replace(extended_div.inner_html)
96+
end
97+
98+
# convert from HTML to Markdown
99+
content = Kramdown::Document.new(
100+
content_div.inner_html,
101+
:input => :html
102+
).to_kramdown
103+
104+
file.puts(content)
56105
end
57106
else
58107
file.write(page.body)

0 commit comments

Comments
 (0)