Skip to content

Commit eaec6ec

Browse files
committed
duckdb docs (v1.1) - scrape v1
1 parent 5e98957 commit eaec6ec

File tree

3 files changed

+155
-0
lines changed

3 files changed

+155
-0
lines changed

lib/docs/filters/duckdb/clean_html.rb

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
module Docs
  class Duckdb
    # Reduces a scraped DuckDB documentation page to its main content and
    # normalises the remaining markup: site chrome and scripts are dropped,
    # SQL code blocks are tagged and flattened to text, presentational
    # attributes are stripped and empty wrappers are removed.
    class CleanHtmlFilter < Filter
      # Navigation, header and sidebar elements that are not documentation.
      CHROME_SELECTORS = %w[
        .headerline
        .landingmenu
        .search_icon
        #sidebar
        .pagemeta
        .toc_menu
        .section-nav
      ].freeze

      # Elements that carry meaning without any text; a wrapper containing
      # one of these must not be treated as empty.
      MEDIA_SELECTOR = 'img, svg, video, audio, iframe, canvas, object'

      # Runs the clean-up and returns the resulting document.
      def call
        # Narrow to <main> only when it exists. Assigning the lookup result
        # unconditionally would replace the parsed document with nil on
        # pages that have no <main> element.
        main = at_css('main')
        @doc = main if main

        css(*CHROME_SELECTORS).remove
        # Scripts are removed before the empty-element sweep so that their
        # source text is not mistaken for visible content.
        css('script').remove

        css('pre').each { |node| clean_code_block(node) }
        css('div, span, p').each { |node| strip_presentational_attributes(node) }
        css('div, span').each { |node| node.remove if blank_element?(node) }

        doc
      end

      private

      # Tags a <pre> as SQL when its class mentions "sql" (this also covers
      # "language-sql") or it wraps a <code class="sql">, then replaces its
      # children with the stripped text content.
      def clean_code_block(node)
        node['data-language'] = 'sql' if node['class'].to_s.include?('sql') || node.at_css('code.sql')
        node.content = node.content.strip
      end

      # Drops inline styles and class names, keeping only classes that
      # mention "highlight".
      def strip_presentational_attributes(node)
        node.remove_attribute('style')
        node.remove_attribute('class') unless node['class'].to_s.include?('highlight')
      end

      # True when the element has no visible text and wraps no media.
      def blank_element?(node)
        node.content.strip.empty? && node.at_css(MEDIA_SELECTOR).nil?
      end
    end
  end
end

lib/docs/filters/duckdb/entries.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
module Docs
  class Duckdb
    # Derives index entries (name, anchor, type) for a DuckDB documentation
    # page from its headings and its sub-path.
    class EntriesFilter < Docs::EntriesFilter
      # Name used when the page has no usable <h1>.
      DEFAULT_NAME = 'DuckDB'

      # Type used when the first path segment is not listed below.
      DEFAULT_TYPE = 'Documentation'

      # Maps the first segment of the page sub-path to its entry type.
      SECTION_TYPES = {
        'sql'               => 'SQL Reference',
        'api'               => 'Client APIs',
        'guides'            => 'How-to Guides',
        'data'              => 'Data Import',
        'operations_manual' => 'Operations Manual',
        'dev'               => 'Development',
        'internals'         => 'Internals',
        'extensions'        => 'Extensions',
        'archive'           => 'Archive'
      }.freeze

      # Returns the stripped text of the first <h1>, or DEFAULT_NAME when the
      # heading is missing or contains only whitespace.
      def get_name
        heading = at_css('h1')
        title = heading ? heading.content.strip : ''
        title.empty? ? DEFAULT_NAME : title
      end

      # Returns the entry type for the page. Only a leading "<segment>/" of
      # the sub-path is considered, so "sql/x" maps to 'SQL Reference' while
      # a bare "sql" falls back to DEFAULT_TYPE.
      def get_type
        section = subpath.to_s[%r{\A([^/]+)/}, 1]
        SECTION_TYPES.fetch(section, DEFAULT_TYPE)
      end

      # Builds one [name, fragment id, type] triple per <h2>/<h3> that has an
      # id. Headings with a blank name or an empty id are skipped because
      # they cannot form a meaningful entry.
      def additional_entries
        # The type depends only on the page, so compute it once.
        section_type = get_type

        css('h2[id]', 'h3[id]').each_with_object([]) do |node, entries|
          # Collapse line breaks and tabs inside the heading to single spaces.
          name = node.content.strip.tr("\r\n\t", ' ').squeeze(' ')
          anchor = node['id'].to_s
          next if name.empty? || anchor.empty?

          entries << [name, anchor, section_type]
        end
      end
    end
  end
end

lib/docs/scrapers/duckdb.rb

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
module Docs
  # Scraper definition for the DuckDB documentation: metadata, the filters
  # applied to each page, URL skip rules, attribution and versions.
  class Duckdb < UrlScraper
    self.name = 'DuckDB'
    self.type = 'duckdb'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://duckdb.org/',
      code: 'https://github.com/duckdb/duckdb'
    }

    html_filters.push 'duckdb/entries', 'duckdb/clean_html'

    options[:container] = '.documentation'

    # NOTE(review): these patterns are unanchored, so they match the word
    # anywhere in a path (e.g. any page whose path contains "reference").
    # Confirm that this breadth is intended.
    options[:skip_patterns] = [
      /installation/,
      /archive/,
      /reference/,
    ]

    # NOTE(review): base_url below already ends in "docs/". If skip values
    # are compared against paths relative to base_url, the "docs/" prefix
    # keeps these from ever matching. "docs/api/" also looks at odds with
    # the entries filter, which assigns a type to "api/" pages. TODO confirm.
    options[:skip] = %w(
      docs/archive/
      docs/installation/
      docs/api/
    )

    options[:attribution] = <<-HTML
      &copy; Copyright 2018&ndash;2024 Stichting DuckDB Foundation<br>
      Licensed under the MIT License.
    HTML

    version '1.1' do
      self.release = '1.1.x'
      # NOTE(review): points at a local server; presumably a locally served
      # copy of the docs is expected when scraping. TODO confirm.
      self.base_url = 'http://localhost:8000/docs/'
    end

    # Older versions are kept disabled for now.

    # version '1.0' do
    #   self.release = '1.0.x'
    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    #
    #   html_filters.push 'duckdb/clean_html'
    # end

    # version '0.9' do
    #   self.release = '0.9.x'
    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    #
    #   html_filters.push 'duckdb/clean_html'
    # end

    # version '0.8' do
    #   self.release = '0.8.x'
    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    #
    #   html_filters.push 'duckdb/clean_html'
    # end

    # version '0.7' do
    #   self.release = '0.7.x'
    #   self.base_url = "https://duckdb.org/docs/archive/#{self.version}/"
    #
    #   html_filters.push 'duckdb/clean_html'
    # end

    # Returns the name of the first tag reported for duckdb/duckdb on GitHub,
    # without a leading "v".
    #
    # NOTE(review): assumes get_github_tags returns an array of hashes with a
    # 'name' key, newest first. Verify against the scraper framework docs.
    def get_latest_version(opts)
      tags = get_github_tags('duckdb', 'duckdb', opts)
      tags.first['name'].sub(/\Av/, '')
    end
  end
end

0 commit comments

Comments
 (0)