Skip to content

Commit 94b4044

Browse files
committed
Reindex R documentation, include 2 manuals
Now each page is indexed by its title (by default) and by each index term declared for it in the index. 2 manuals are included: the data import/export manual as its own category (as it is rather short), and each top-level section of the R introduction manual as a separate category (as it is quite a bit longer). Add some manual cleanup. Some pages still seem to be missing: - either they belong to non-default packages, i.e. it is normal that they are missing - or they correspond to index words without their own package (!)
1 parent c3b9337 commit 94b4044

File tree

3 files changed

+89
-19
lines changed

3 files changed

+89
-19
lines changed

lib/docs/filters/r/clean_html.rb

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,45 @@ class R
33
class CleanHtmlFilter < Filter
44
def call
55
slug_parts = slug.split('/')
6-
if slug_parts[0] == 'library'
6+
7+
if root_page?
8+
css('a[href$="/00index"]').each do |pkg|
9+
pkg['href'] = "/r-#{pkg['href'].split('/')[1]}/"
10+
end
11+
12+
elsif slug_parts[0] == 'library'
713
title = at_css('h2')
814
title.inner_html = "<code>#{slug_parts[3]}</code> #{title.content}"
915

1016
summary = at_css('table[summary]')
1117
summary.remove if summary
1218

1319
elsif slug_parts[-2] == 'manual'
20+
css('table.menu, div.header, hr, h2.contents-heading, div.contents, table.index-cp, table.index-vr, table[summary]').remove
21+
22+
css('h2').each do |node|
23+
node.remove if node.content.end_with? ' index'
24+
end
25+
1426
css('span[id] + h1, span[id] + h2, span[id] + h3, span[id] + h4, span[id] + h5, span[id] + h6').each do |node|
15-
id = node.previous['id']
16-
node.previous.remove
17-
node['id'] = id.sub(/-1$/, '') if id
27+
# We need the first of the series of span with ids
28+
span = node.previous_element
29+
while span.previous
30+
prev = span.previous_element
31+
break unless prev.name == 'span' and prev['id']
32+
span.remove
33+
span = prev
34+
end
35+
36+
node['id'] = span['id']
37+
span.remove
38+
39+
css('div.example').each do |node|
40+
node.replace(node.children)
41+
end
1842
end
19-
css('table.menu, div.header, hr').remove
43+
44+
css('h1 + h1').remove
2045

2146
css('.footnote h5').each do |node|
2247
anchor = node.at_css('a[id]')

lib/docs/filters/r/entries.rb

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,16 @@ module Docs
22
class R
33
class EntriesFilter < Docs::EntriesFilter
44

5-
@@include_manual = false
6-
@@include_misc = false
5+
PKG_INDEX_ENTRIES = Hash.new []
76

87
def initialize(*)
98
super
9+
10+
if slug_parts[-1] == '00Index'
11+
css('tr a').each do |link|
12+
PKG_INDEX_ENTRIES[link['href']] += [link.text]
13+
end
14+
end
1015
end
1116

1217
def slug_parts
@@ -18,36 +23,53 @@ def is_package?
1823
end
1924

2025
def is_manual?
21-
slug_parts[-2] == 'manual'
26+
slug_parts[1] == 'manual'
2227
end
2328

2429
def get_name
25-
return slug_parts[3] + ' − ' + at_css('h2').content if is_package?
30+
return at_css('h2').content if is_package?
2631
title = at_css('h1.settitle')
2732
title ? title.content : at_css('h1, h2').content
2833
end
2934

3035
def get_type
3136
return slug_parts[1] if is_package?
3237
return at_css('h1.settitle').content if is_manual?
33-
'Miscellaneous'
3438
end
3539

3640
def include_default_entry?
37-
if is_manual? or slug_parts[-1] == '00Index' or slug_parts[-1] == 'index'
38-
return false
39-
end
40-
is_package? or self.include_misc
41+
is_package? and not slug_parts[-1] == '00Index'
42+
end
43+
44+
def manual_section(node)
45+
title = node.content.sub /^((Appendix )?[A-Z]|[0-9]+)(\.[0-9]+)* /, ''
46+
title unless ['References', 'Preface', 'Acknowledgements'].include?(title) or title.end_with?(' index')
4147
end
4248

4349
def additional_entries
44-
return [] unless is_manual? and self.include_manual
50+
if is_package? and slug_parts[-1] != '00Index'
51+
page = slug_parts[-1]
52+
return [page] + PKG_INDEX_ENTRIES.fetch(page, [])
53+
end
54+
55+
return [] unless is_manual?
4556

4657
entries = []
47-
css('div.contents > ul > li').each do |node|
48-
node.css('a').each do |link|
49-
link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
50-
entries << [link_name, link['href'].split('#')[1], name]
58+
unless slug_parts[-1].downcase == 'r-intro'
59+
# Single top-level category
60+
css('div.contents > ul a').each do |link|
61+
link_name = manual_section(link)
62+
entries << [link_name, link['href'].split('#')[1], name] unless link_name.nil?
63+
end
64+
else
65+
# Split 1st level of manual into different categories
66+
css('div.contents > ul > li').each do |node|
67+
type = manual_section(node.at_css('a'))
68+
next if type.nil?
69+
node.css('> ul a').each do |link|
70+
link_name = link.content.sub /^[0-9A-Z]+(\.[0-9]+)* /, ''
71+
entries << [link_name, link['href'].split('#')[1], type]
72+
end
5173
end
5274
end
5375
return entries

lib/docs/scrapers/r.rb

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,33 @@ class R < FileScraper
2121
HTML
2222

2323
# Never want those
24+
options[:skip_patterns] = [
25+
/\/DESCRIPTION$/,
26+
/\/NEWS(\.[^\/]*)?$/,
27+
/\/demo$/,
28+
/\.pdf$/
29+
]
30+
31+
## We want to fix links like so − but only when the targets don’t exist,
32+
## since such links point to packages or keywords that have no file of their own
33+
## but are documented on another page, which we record properly.
34+
#
35+
#options[:fix_urls] = ->(url) do
36+
# url.sub!(%r'/library/([^/]+)/doc/index.html$') { |m| "/r-#{$1.parameterize.downcase}/" }
37+
# url.sub!(%r'/library/([^/]+)/html/([^/]+).html$') { |m| "/library/#{$1.parameterize.downcase}/html/#{$2.parameterize.downcase}" }
38+
#end
39+
2440
options[:skip] = %w(
2541
doc/html/packages-head-utf8.html
2642
doc/html/SearchOn.html
2743
doc/html/Search.html
44+
doc/html/UserManuals.html
45+
doc/html/faq.html
46+
doc/manual/R-FAQ.html
47+
doc/manual/R-admin.html
48+
doc/manual/R-exts.html
49+
doc/manual/R-ints.html
50+
doc/manual/R-lang.html
2851
)
2952

3053
end

0 commit comments

Comments
 (0)