Skip to content

Commit cda737c

Browse files
committed
basic scraping working
1 parent 94803a4 commit cda737c

File tree

5 files changed

+136
-1
lines changed

5 files changed

+136
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ public/fonts
66
public/docs/**/*
77
docs/**/*
88
!docs/*.md
9+
vendor

lib/docs/filters/trio/clean_html.rb

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
module Docs
  class Trio
    # HTML filter for scraped Trio pages. It narrows the document to the
    # div[role="main"] container, unwraps section wrappers, removes the
    # ".headerlink" anchors and converts every <dt> into an <h3> heading.
    class CleanHtmlFilter < Filter
      # Runs the clean-up and returns the resulting document node.
      def call
        @doc = at_css('div[role="main"]')

        # Unwrap purely structural containers: keep their children, drop the wrapper.
        css('.section, [itemprop=articleBody]').each do |node|
          node.replace node.children
        end

        css('.headerlink').remove

        css('dt').each do |node|
          heading = doc.document.create_element 'h3'
          # Keep the anchor: the entries filter records the id of each
          # ".descname" parent element (presumably these <dt> nodes) as the
          # entry fragment, so the id must survive the <dt> -> <h3> swap.
          heading['id'] = node['id'] if node['id']
          # NOTE(review): the last character of the text is dropped, exactly as
          # before. Presumably this strips a trailing permalink sign, but the
          # ".headerlink" anchors are already removed above -- confirm that this
          # does not cut off a real character (e.g. a closing parenthesis).
          heading.content = node.inner_text[0...-1]
          node.replace heading
        end

        doc
      end
    end
  end
end

lib/docs/filters/trio/entries.rb

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
module Docs
  class Trio
    # Entries filter for scraped Trio pages: derives the page name/type from
    # the <h1> heading and one additional entry per ".descname" node.
    class EntriesFilter < Docs::EntriesFilter
      # Page name: the <h1> text without its last character.
      def get_name
        page_title
      end

      # Page type: same value as the page name.
      def get_type
        page_title
      end

      # Builds [name, id] pairs, one per ".descname" node. The name is the text
      # of the node's preceding sibling (when there is one) followed by the
      # node's own text; the id is read from the node's parent element.
      def additional_entries
        css('.descname').each_with_object([]) do |node, entries|
          qualifier = node.previous
          # A ".descname" node without a preceding sibling used to raise
          # NoMethodError on nil; fall back to the bare name in that case.
          name = qualifier ? qualifier.text + node.text : node.text
          entries << [name, node.parent['id']]
        end
      end

      private

      # <h1> text minus its final character.
      # NOTE(review): presumably the dropped character is the permalink sign
      # appended to headings -- confirm against the scraped markup.
      def page_title
        at_css('h1').text[0...-1]
      end
    end
  end
end

lib/docs/scrapers/trio.rb

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
module Docs
  # Scraper definition for the Trio documentation.
  class Trio < UrlScraper
    self.type = 'simple'
    self.release = '0.11'
    # NOTE(review): the URL points at "latest" while the release is fixed to
    # 0.11 -- confirm that the scraped content matches the declared release.
    self.base_url = 'https://trio.readthedocs.io/en/latest/'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://trio.readthedocs.io/',
      code: 'https://github.com/python-trio/trio'
    }

    # Entries are collected first, then the markup is cleaned up.
    html_filters.push('trio/entries', 'trio/clean_html')

    # NOTE(review): the attribution text is empty -- fill in the project's
    # copyright/license notice once confirmed.
    options[:attribution] = <<-HTML
    HTML

    # Only URLs matching one of these patterns are kept.
    options[:only_patterns] = [
      /reference-core/,
      /reference-io/,
      /reference-testing/,
      /reference-hazmat/
    ]
  end
end

public/docs/docs.json

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,69 @@
1-
[]
1+
[
2+
{
3+
"name": "Chef",
4+
"slug": "chef~12",
5+
"type": "sphinx_simple",
6+
"links": {
7+
"home": "https://www.chef.io/",
8+
"code": "https://github.com/chef/chef"
9+
},
10+
"version": "12",
11+
"release": "12.13",
12+
"mtime": 1556264506,
13+
"db_size": 7170006
14+
},
15+
{
16+
"name": "CSS",
17+
"slug": "css",
18+
"type": "mdn",
19+
"mtime": 1543099045,
20+
"db_size": 12415944
21+
},
22+
{
23+
"name": "DOM",
24+
"slug": "dom",
25+
"type": "mdn",
26+
"mtime": 1543157862,
27+
"db_size": 33998524
28+
},
29+
{
30+
"name": "DOM Events",
31+
"slug": "dom_events",
32+
"type": "mdn",
33+
"mtime": 1543099589,
34+
"db_size": 1752500
35+
},
36+
{
37+
"name": "HTML",
38+
"slug": "html",
39+
"type": "mdn",
40+
"mtime": 1543097764,
41+
"db_size": 4141596
42+
},
43+
{
44+
"name": "HTTP",
45+
"slug": "http",
46+
"type": "mdn",
47+
"mtime": 1543099392,
48+
"db_size": 4731727
49+
},
50+
{
51+
"name": "JavaScript",
52+
"slug": "javascript",
53+
"type": "mdn",
54+
"mtime": 1543098529,
55+
"db_size": 6462141
56+
},
57+
{
58+
"name": "Trio",
59+
"slug": "trio",
60+
"type": "simple",
61+
"links": {
62+
"home": "https://trio.readthedocs.io/",
63+
"code": "https://github.com/python-trio/trio"
64+
},
65+
"release": "0.11",
66+
"mtime": 1556272773,
67+
"db_size": 736670
68+
}
69+
]

0 commit comments

Comments
 (0)