Skip to content

Commit d0582c6

Browse files
authored
Merge pull request #2371 from sgoley/docs/duckdb-localhost
duckdb docs (v1.1) - scrape v1
2 parents c07ccf5 + deedda3 commit d0582c6

File tree

8 files changed

+144
-0
lines changed

8 files changed

+144
-0
lines changed

assets/javascripts/news.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
[
2+
[
3+
"2024-11-23",
4+
"New documentation: <a href=\"/duckdb/\">DuckDB</a>"
5+
],
26
[
37
"2024-08-20",
48
"New documentation: <a href=\"/man/\">Linux man pages</a>"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# frozen_string_literal: true
2+
3+
module Docs
  class Duckdb
    # Attribution filter for DuckDB pages. The pages are fetched from a
    # localhost origin, so the page URL is rewritten to the public
    # duckdb.org origin before it is shown as the attribution link.
    class AttributionFilter < Docs::AttributionFilter
      # Builds the attribution anchor for the current page.
      #
      # Uses the non-destructive `sub` on purpose: `sub!` returns nil when
      # nothing was replaced, which would produce an anchor with an empty
      # href and label for any URL not starting with the localhost origin.
      #
      # @return [String] HTML anchor pointing at the public page URL
      def attribution_link
        url = current_url.to_s.sub('http://localhost:8000', 'https://duckdb.org')
        %(<a href="#{url}" class="_attribution-link">#{url}</a>)
      end
    end
  end
end

lib/docs/filters/duckdb/clean_html.rb

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
module Docs
  class Duckdb
    # Reduces a DuckDB documentation page to its main content and normalizes
    # the remaining markup (title heading, code blocks, stray attributes).
    class CleanHtmlFilter < Filter
      # @return the cleaned document node
      def call
        # Narrow the document to the main content wrapper. When the page has
        # no such wrapper, leave the current document untouched instead of
        # overwriting @doc with nil.
        content = at_css('#main_content_wrap', 'main')
        return doc if content.nil?

        @doc = content

        # Move the page title to the top and promote it to an <h1>.
        # Pages without a .title element are left as they are.
        if (title = at_css('.title'))
          heading = doc.prepend_child(title.remove)
          heading.name = 'h1'
        end

        # Remove navigation and header elements
        css('.headerline', '.headlinebar', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove

        # Remove script tags first, so wrappers that only contained scripts
        # are caught by the empty-element pass below.
        css('script').remove

        # Clean up code blocks: flatten each highlighted block into a <pre>
        # holding only its text, tagged with the language from its class.
        css('div.highlighter-rouge').each do |node|
          language = node['class'].to_s[/language-(\w+)/, 1]
          # Only tag the block when a language was actually found.
          node['data-language'] = language if language
          node.content = node.content.strip
          node.name = 'pre'
        end

        # Remove unnecessary attributes
        css('div, span, p').each do |node|
          node.remove_attribute('style')
          node.remove_attribute('class')
        end

        # Remove empty elements (no text content after stripping whitespace)
        css('div, span').each do |node|
          node.remove if node.content.strip.empty?
        end

        doc
      end
    end
  end
end

lib/docs/filters/duckdb/entries.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
module Docs
  class Duckdb
    # Derives index entries (name, type, per-heading entries) for DuckDB pages.
    class EntriesFilter < Docs::EntriesFilter
      # Maps the first path segment of a page to the type it is listed under.
      TYPE_BY_SECTION = {
        'sql' => 'SQL Reference',
        'api' => 'Client APIs',
        'guides' => 'How-to Guides',
        'data' => 'Data Import',
        'operations_manual' => 'Operations Manual',
        'dev' => 'Development',
        'internals' => 'Internals',
        'extensions' => 'Extensions',
        'archive' => 'Archive'
      }.freeze

      # Type used for pages outside every known section.
      DEFAULT_TYPE = 'Documentation'

      # @return [String] the page name, taken from the <h1> or .title element
      def get_name
        at_css('h1', '.title').content.strip
      end

      # @return [String] the type for this page, chosen by the leading
      #   "<section>/" segment of the subpath
      def get_type
        section = subpath[%r{\A([^/]+)/}, 1]
        TYPE_BY_SECTION.fetch(section, DEFAULT_TYPE)
      end

      # Builds one entry per h2/h3 heading that carries an id.
      #
      # @return [Array<Array>] triples of [name, fragment id, type]
      def additional_entries
        # The type depends only on the page, so compute it once.
        type = get_type

        css('h2[id]', 'h3[id]').each_with_object([]) do |node, entries|
          # Collapse line breaks, tabs and repeated spaces into single spaces.
          name = node.content.tr("\r\n\t", ' ').squeeze(' ').strip
          fragment = node['id']
          # A heading with no text or a blank id cannot form a usable entry.
          next if name.empty? || fragment.empty?

          entries << [name, fragment, type]
        end
      end
    end
  end
end

lib/docs/scrapers/duckdb.rb

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
module Docs
  # Scraper definition for the DuckDB documentation.
  class Duckdb < UrlScraper
    self.name = 'DuckDB'
    self.type = 'duckdb'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://duckdb.org/',
      code: 'https://github.com/duckdb/duckdb'
    }

    # The docs are scraped from a locally served offline copy:
    # https://duckdb.org/docs/guides/offline-copy.html
    # curl -O https://duckdb.org/duckdb-docs.zip; bsdtar xf duckdb-docs.zip; cd duckdb-docs; python -m http.server
    self.release = '1.1.3'
    self.base_url = 'http://localhost:8000/docs/'

    html_filters.push 'duckdb/entries', 'duckdb/clean_html'
    # Swap the stock attribution filter for the DuckDB-specific one.
    text_filters.replace 'attribution', 'duckdb/attribution'

    options[:container] = '.documentation'

    options[:skip_patterns] = [
      /installation/,
      /archive/,
      /reference/
    ]

    # NOTE(review): these entries carry a "docs/" prefix although base_url
    # already ends in "docs/"; if skip paths are matched relative to base_url
    # they never match. Left unchanged - confirm the intent before editing.
    options[:skip] = %w(
      docs/archive/
      docs/installation/
      docs/api/
    )

    options[:attribution] = <<-HTML
      &copy; Copyright 2018&ndash;2024 Stichting DuckDB Foundation<br>
      Licensed under the MIT License.
    HTML

    # Returns the newest DuckDB version, taken from the first tag returned
    # for the duckdb/duckdb GitHub repository.
    #
    # A leading "v" is stripped so the result has the same shape as
    # `release` (e.g. "1.1.3"); a tag without the prefix is returned as is.
    # NOTE(review): assumes upstream tags look like "v1.1.3" - confirm.
    #
    # @param opts options passed through to get_github_tags
    # @return [String] version string without a "v" prefix
    def get_latest_version(opts)
      tag_name = get_github_tags('duckdb', 'duckdb', opts)[0]['name']
      tag_name.sub(/\Av/, '')
    end
  end
end

public/icons/docs/duckdb/16.png

902 Bytes
Loading
1.53 KB
Loading

public/icons/docs/duckdb/SOURCE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
https://github.com/duckdb/duckdb/tree/main/logo

0 commit comments

Comments
 (0)