Skip to content

Commit 4676f84

Browse files
authored
Merge pull request #2512 from spamguy/github-scraper
Fix GitHub scraper
2 parents 7579a06 + 040082f commit 4676f84

File tree

9 files changed

+60
-27
lines changed

9 files changed

+60
-27
lines changed

lib/docs/filters/github/clean_html.rb

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,11 @@ module Docs
22
class Github
33
class CleanHtmlFilter < Filter
44
def call
5+
# Remove h1 wrapper to render it correctly.
6+
css('.markdown-heading h1').each do |node|
7+
node.parent.replace(node)
8+
end
9+
510
css('.anchor').each do |node|
611
node.parent['id'] = node['href'].remove('#')
712
node.remove

lib/docs/filters/nginx_lua_module/entries.rb

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,11 @@ class EntriesFilter < Docs::EntriesFilter
44
def additional_entries
55
entries = []
66

7-
css('#directives + ul > li > a').each do |node|
7+
css('h2:contains("Directives") + ul > li > a').each do |node|
88
entries << [node.content, node['href'].remove('#'), 'Directives']
99
end
1010

11-
css('#nginx-api-for-lua + ul > li > a').each do |node|
11+
css('h2:contains("Nginx API for Lua") + ul > li > a').each do |node|
1212
next if node.content == 'Introduction'
1313
entries << [node.content, node['href'].remove('#'), 'Nginx API for Lua']
1414
end

lib/docs/filters/sanctuary_def/entries.rb

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,10 @@ def additional_entries
3939
when "h3"
4040
type = node.text
4141
when "h4"
42+
# Parent <div>'s ID set in github/clean_html.
43+
id = node.parent.attributes["id"].value
4244
name = node.text.split(' :: ')[0]
43-
id = node.attributes["id"].value
45+
4446
entries << [name, id, type]
4547
end
4648
end

lib/docs/filters/sanctuary_type_classes/clean_html.rb

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,6 @@ def call
88
node.name = 'h3'
99
}
1010

11-
# correct and unify link ids
12-
css('h3').each { |node|
13-
node.attributes["id"].value = node.text.split(' :: ')[0]
14-
}
15-
1611
doc
1712
end
1813
end

lib/docs/filters/sanctuary_type_classes/entries.rb

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,9 +38,9 @@ def additional_entries
3838
case node.name
3939
when "h2"
4040
type = node.text
41-
if node.attributes["id"].value == "type-class-hierarchy"
41+
if node.parent.attributes["id"]&.value == "type-class-hierarchy"
4242
name = node.text
43-
id = node.attributes["id"].value
43+
id = node.parent.attributes["id"].value
4444
entries << [name, id, type]
4545
end
4646
when "h4"

lib/docs/scrapers/github.rb

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,13 @@ def process_response?(response)
1616
end
1717

1818
def parse(response)
19-
parsed = JSON.parse(response.response_body)
19+
embedded_json = response
20+
.response_body
21+
.match(/react-app\.embeddedData">(.+?)<\/script>/)
22+
&.captures
23+
&.first
24+
parsed = JSON.parse(embedded_json)
25+
2026
[parsed['payload']['blob']['richText'], parsed['title']]
2127
end
2228
end

lib/docs/scrapers/koa.rb

Lines changed: 32 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,6 @@
22

33
module Docs
44
class Koa < Github
5-
self.base_url = 'https://github.com/koajs/koa/tree/master/docs'
6-
self.release = '2.15.0'
7-
8-
self.root_path = 'api/index.md'
95
self.initial_paths = %w[
106
error-handling
117
faq
@@ -26,20 +22,47 @@ class Koa < Github
2622

2723
html_filters.push 'koa/clean_html', 'koa/entries'
2824

29-
options[:skip] = %w[middleware.gif]
25+
options[:skip_patterns] = [/\.gif/]
3026
options[:trailing_slash] = false
3127
options[:container] = '.markdown-body'
3228

33-
options[:fix_urls] = ->(url) do
34-
url.sub! 'https://koajs.com/#error-handling', Koa.base_url + '/error-handling.md'
35-
url
36-
end
29+
3730

3831
options[:attribution] = <<-HTML
3932
&copy; 2020 Koa contributors<br>
4033
Licensed under the MIT License.
4134
HTML
4235

36+
version do
37+
self.base_url = 'https://github.com/koajs/koa/blob/v3.0.0/docs'
38+
self.root_path = 'api/index.md'
39+
self.release = '3.0.0'
40+
options[:fix_urls] = ->(url) do
41+
url.sub! 'https://koajs.com/#error-handling', self.base_url + '/error-handling.md'
42+
url
43+
end
44+
end
45+
46+
version '2' do
47+
self.base_url = 'https://github.com/koajs/koa/blob/v2.16.1/docs'
48+
self.root_path = 'api/index.md'
49+
self.release = '2.16.1'
50+
options[:fix_urls] = ->(url) do
51+
url.sub! 'https://koajs.com/#error-handling', self.base_url + '/error-handling.md'
52+
url
53+
end
54+
end
55+
56+
version '1' do
57+
self.base_url = 'https://github.com/koajs/koa/blob/1.7.1/docs'
58+
self.root_path = 'api/index.md'
59+
self.release = '1.7.1'
60+
options[:fix_urls] = ->(url) do
61+
url.sub! 'https://koajs.com/#error-handling', self.base_url + '/error-handling.md'
62+
url
63+
end
64+
end
65+
4366
def get_latest_version(opts)
4467
get_npm_version('koa', opts)
4568
end

lib/docs/scrapers/nginx_lua_module.rb

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,22 +2,24 @@ module Docs
22
class NginxLuaModule < Github
33
self.name = 'nginx / Lua Module'
44
self.slug = 'nginx_lua_module'
5-
self.release = '0.10.13'
6-
self.base_url = "https://github.com/openresty/lua-nginx-module/tree/v#{self.release}/"
5+
self.release = '0.10.28'
6+
self.base_url = "https://github.com/openresty/lua-nginx-module/blob/v#{self.release}/"
7+
self.root_path = 'README.markdown'
78
self.links = {
89
code: 'https://github.com/openresty/lua-nginx-module'
910
}
1011

1112
html_filters.push 'nginx_lua_module/clean_html', 'nginx_lua_module/entries', 'title'
1213

1314
options[:root_title] = 'ngx_http_lua_module'
14-
options[:container] = '#readme > article'
15-
15+
options[:container] = '.markdown-body'
16+
options[:max_image_size] = 256_000
1617
options[:attribution] = <<-HTML
1718
&copy; 2009&ndash;2017 Xiaozhe Wang (chaoslawful)<br>
18-
&copy; 2009&ndash;2018 Yichun "agentzh" Zhang (章亦春), OpenResty Inc.<br>
19+
&copy; 2009&ndash;2019 Yichun "agentzh" Zhang (章亦春), OpenResty Inc.<br>
1920
Licensed under the BSD License.
2021
HTML
22+
options[:skip_patterns] = [/\.png/]
2123

2224
def get_latest_version(opts)
2325
tags = get_github_tags('openresty', 'lua-nginx-module', opts)

lib/docs/scrapers/q.rb

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
module Docs
2-
class Q < Github
2+
class Q < UrlScraper
33
self.name = 'Q'
44
self.release = '1.5.1'
55
self.base_url = 'https://github.com/kriskowal/q/wiki/'
@@ -16,7 +16,7 @@ class Q < Github
1616
options[:skip_links] = true
1717

1818
options[:attribution] = <<-HTML
19-
&copy; 2009&ndash;2017 Kristopher Michael Kowal<br>
19+
&copy; 2009&ndash;2018 Kristopher Michael Kowal<br>
2020
Licensed under the MIT License.
2121
HTML
2222

0 commit comments

Comments
 (0)