@@ -8,7 +8,7 @@ def self.require_deps
88 rubygems
99 fileutils
1010 safe_yaml
11- hpricot
11+ nokogiri
1212 time
1313 open-uri
1414 open_uri_redirections
@@ -22,16 +22,16 @@ def self.specify_options(c)
2222 end
2323
2424 # Will modify post DOM tree
25- def self . download_images ( title , post_hpricot , assets_folder )
26- images = ( post_hpricot / "img" )
25+ def self . download_images ( title , post_doc , assets_folder )
26+ images = post_doc . css ( "img" )
2727 return if images . empty?
2828
29- Jekyll . logger . info "Downloading images for " , title
29+ Jekyll . logger . info "Downloading:" , " images for #{ title } "
3030 images . each do |i |
3131 uri = URI ::DEFAULT_PARSER . escape ( i [ "src" ] )
3232
3333 dst = File . join ( assets_folder , File . basename ( uri ) )
34- i [ "src" ] = File . join ( "{{ site.baseurl }}" , dst )
34+ i [ "src" ] = File . join ( "{{site.baseurl}}" , dst )
3535 Jekyll . logger . info uri
3636 if File . exist? ( dst )
3737 Jekyll . logger . info "Already in cache. Clean assets folder if you want a redownload."
@@ -54,15 +54,18 @@ def self.download_images(title, post_hpricot, assets_folder)
5454
5555 class Item
5656 def initialize ( node )
57+ raise "Node is nil" if node . nil?
58+
5759 @node = node
5860 end
5961
6062 def text_for ( path )
61- @node . at ( path ) . inner_text
63+ subnode = @node . at_xpath ( "./#{ path } " ) || @node . at ( path ) || @node . children . find { |child | child . name == path }
64+ subnode . text
6265 end
6366
6467 def title
65- @title ||= text_for ( : title) . strip
68+ @title ||= text_for ( " title" ) . strip
6669 end
6770
6871 def permalink_title
@@ -76,12 +79,10 @@ def permalink_title
7679 end
7780
7881 def permalink
79- # Hpricot thinks "link" is a self closing tag so it puts the text of the link after the tag
80- # but sometimes it works right! I think it's the xml declaration
8182 @permalink ||= begin
8283 uri = text_for ( "link" )
83- uri = @node . at ( "link" ) . following [ 0 ] if uri . empty?
84- URI ( uri . to_s ) . path
84+ uri = @node . at ( "link" ) . next_sibling . text if uri . empty?
85+ URI ( uri . to_s . strip ) . path
8586 end
8687 end
8788
@@ -127,12 +128,8 @@ def published?
127128
128129 def excerpt
129130 @excerpt ||= begin
130- text = Hpricot ( text_for ( "excerpt:encoded" ) ) . inner_text
131- if text . empty?
132- nil
133- else
134- text
135- end
131+ text = Nokogiri ::HTML ( text_for ( "excerpt:encoded" ) ) . text
132+ text . empty? ? nil : text
136133 end
137134 end
138135 end
@@ -144,29 +141,32 @@ def self.process(options)
144141 FileUtils . mkdir_p ( assets_folder )
145142
146143 import_count = Hash . new ( 0 )
147- doc = Hpricot ::XML ( File . read ( source ) )
144+ doc = Nokogiri ::XML ( File . read ( source ) )
148145 # Fetch authors data from header
149146 authors = Hash [
150- ( doc / :channel / "wp:author" ) . map do |author |
151- [ author . at ( "wp:author_login" ) . inner_text . strip , {
152- "login" => author . at ( "wp:author_login" ) . inner_text . strip ,
153- "email" => author . at ( "wp:author_email" ) . inner_text ,
154- "display_name" => author . at ( "wp:author_display_name" ) . inner_text ,
155- "first_name" => author . at ( "wp:author_first_name" ) . inner_text ,
156- "last_name" => author . at ( "wp:author_last_name" ) . inner_text ,
157- } , ]
147+ doc . xpath ( "//channel/wp:author" ) . map do |author |
148+ [
149+ author . xpath ( "./wp:author_login" ) . text . strip ,
150+ {
151+ "login" => author . xpath ( "./wp:author_login" ) . text . strip ,
152+ "email" => author . xpath ( "./wp:author_email" ) . text ,
153+ "display_name" => author . xpath ( "./wp:author_display_name" ) . text ,
154+ "first_name" => author . xpath ( "./wp:author_first_name" ) . text ,
155+ "last_name" => author . xpath ( "./wp:author_last_name" ) . text ,
156+ } ,
157+ ]
158158 end
159159 ] rescue { }
160160
161- ( doc / : channel / : item) . each do |node |
161+ doc . css ( " channel > item" ) . each do |node |
162162 item = Item . new ( node )
163- categories = node . search ( 'category[@ domain="category"]' ) . map ( &:inner_text ) . reject { |c | c == "Uncategorized" } . uniq
164- tags = node . search ( 'category[@ domain="post_tag"]' ) . map ( &:inner_text ) . uniq
163+ categories = node . css ( 'category[domain="category"]' ) . map ( &:text ) . reject { |c | c == "Uncategorized" } . uniq
164+ tags = node . css ( 'category[domain="post_tag"]' ) . map ( &:text ) . uniq
165165
166166 metas = { }
167- node . search ( " wp:postmeta") . each do |meta |
168- key = meta . at ( " wp:meta_key") . inner_text
169- value = meta . at ( " wp:meta_value") . inner_text
167+ node . xpath ( "./ wp:postmeta") . each do |meta |
168+ key = meta . at_xpath ( "./ wp:meta_key") . text
169+ value = meta . at_xpath ( "./ wp:meta_value") . text
170170 metas [ key ] = value
171171 end
172172
@@ -189,7 +189,7 @@ def self.process(options)
189189 }
190190
191191 begin
192- content = Hpricot ( item . text_for ( "content:encoded" ) )
192+ content = Nokogiri :: HTML ( item . text_for ( "content:encoded" ) )
193193 header [ "excerpt" ] = item . excerpt if item . excerpt
194194
195195 if fetch
@@ -221,7 +221,7 @@ def self.process(options)
221221 end
222222
223223 import_count . each do |key , value |
224- Jekyll . logger . info "Imported #{ value } #{ key } s "
224+ Jekyll . logger . info "Imported" , " #{ value } #{ Util . pluralize ( key , value ) } "
225225 end
226226 end
227227
0 commit comments