Updated the import task to automatically convert to Markdown, and scrub the HTML.

postmodern · postmodern · commit c6137a236114 · 2011-08-07T19:22:55.000-07:00
diff --git a/_tasks/import.rb b/_tasks/import.rb
@@ -1,8 +1,17 @@
+require 'fileutils'
+require 'yaml'
+
 gem 'spidr', '~> 0.3'
 require 'spidr'
-require 'fileutils'
 
-IMPORT_DIR = '_import'
+gem 'kramdown', '~> 0.13'
+require 'kramdown'
+
+OUTPUT_DIR = '_import'
+LAYOUTS = {
+  :default => 'page',
+  :post => 'news_post'
+}
 
 desc 'Spiders ruby-lang.org and imports HTML content'
 task :import do
@@ -11,48 +20,88 @@
     agent.ignore_links_like /\.cgi[\/]?$/
     agent.ignore_links_like /\/[a-z_]+\/old-man\//
 
-    agent.every_failed_url do |url|
-      puts "Not Found #{url}!"
-    end
-
     agent.every_ok_page do |page|
-      local_path = File.join(IMPORT_DIR,page.url.path[1..-1])
+      path = page.url.path[1..-1]
 
-      if local_path[-1..-1] == '/'
-        local_path += 'index.html'
-      elsif File.extname(local_path) == ''
-        local_path += '/index.html'
+      layout = LAYOUTS[:default]
+
+      if path =~ %r{^[a-z_-]+/news/\d{4}/\d{1,2}/\d{1,2}/[^/]+/$}
+        # map news posts into news/_posts/
+        dirs = path.split('/')
+        local_path = File.join(OUTPUT_DIR,dirs[0,2],'_posts',dirs[2..-1].join('-')) + '.md'
+
+        layout = LAYOUTS[:post]
+      else
+        # normal page
+        local_path = File.join(OUTPUT_DIR,path)
+
+        case File.extname(local_path)
+        when '.html'
+          local_path.gsub!(/\.html$/,'.md')
+        when ''
+          local_path += '/' unless local_path.end_with?('/')
+          local_path += 'index.md'
+        end
       end
 
       # ensure the parent directory exists
       FileUtils.mkdir_p(File.dirname(local_path))
 
+      # don't overwrite existing files
       unless File.exist?(local_path)
         puts "Saving #{page.url} -> #{local_path} ..."
 
-        File.open(local_path,'wb') do |file|
+        File.open(local_path,'w') do |file|
           if page.html?
-            layout = 'default'
-            title = page.title
-            page_div = page.doc.at('#page')
+            header = {
+              'layout' => layout,
+              'title' => page.title.strip,
+              'lang' => path.split('/',2).first
+            }
+
+            # add the YAML header
+            YAML.dump(header,file)
 
-            if (header = page_div.at('#head'))
-              if (header.inner_text.strip == title)
-                layout = 'page'
+            # YAML header separator
+            file.puts '---'
+
+            if (content_div = page.at('#content'))
+              # remove all comments
+              content_div.traverse do |node|
+                node.remove if node.comment?
               end
-            end
 
-            file.puts(
-              '---',
-              "layout: #{layout}",
-              "title: #{title}",
-              '---'
-            )
-
-            if (layout == 'default' && page_div)
-              file.puts(page_div.to_html)
-            elsif (content_div = page_div.at('#content'))
-              file.puts(content_div.inner_html)
+              # remove all page anchors
+              content_div.search('//a[@id]').remove
+
+              # replace all caps spans with their text
+              content_div.search('span.caps').each do |span|
+                span.replace(span.inner_text)
+              end
+
+              # map all code elements to their inner_text
+              content_div.search('pre > code').each do |code|
+                code.replace(code.children.map { |node|
+                  if node.name == 'br'
+                    $/
+                  else
+                    node.inner_text
+                  end
+                }.join)
+              end
+
+              # replace the #extended div with its children
+              if (extended_div = content_div.at('#extended'))
+                extended_div.replace(extended_div.inner_html)
+              end
+
+              # convert from HTML to Markdown
+              content = Kramdown::Document.new(
+                content_div.inner_html,
+                :input => :html
+              ).to_kramdown
+
+              file.puts(content)
             end
           else
             file.write(page.body)