community-extensions/extensions/html_query/description.yml at 2f12508beffacc69957cda7d3cf09ad367ae7250 · duckdb/community-extensions · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
extension:
  name: html_query
  description: Query HTML using CSS selectors, extract JSON from LD+JSON, JavaScript variables, and Next.js RSC
  version: '2026020501'
  language: Rust
  build: cargo
  license: MIT
  excluded_platforms: "wasm_mvp;wasm_eh;wasm_threads;linux_amd64_musl"
  requires_toolchains: "rust;python3"
  maintainers:
    - onnimonni

repo:
  github: midwork-finds-jobs/duckdb_html_query
  ref: ad8083fbd6b67128e4103a3ed1e66afc828e6815

docs:
  hello_world: |
    -- Extract text from first matching element
    SELECT html_query('<html><title>Hello World</title></html>', 'title', '@text');
    ┌─────────────┐
    │ html_query  │
    │   varchar   │
    ├─────────────┤
    │ Hello World │
    └─────────────┘

    -- Extract all matching elements as VARCHAR[]
    SELECT html_query_all('<div><p>First</p><p>Second</p></div>', 'p', '@text');
    ┌────────────────┐
    │ html_query_all │
    │   varchar[]    │
    ├────────────────┤
    │ [First,Second] │
    └────────────────┘

    -- Extract JSON from LD+JSON scripts (returns array)
    SELECT html_extract_json(
      '<script type="application/ld+json">{"name":"Product"}</script>',
      'script[type="application/ld+json"]'
    );
    ┌──────────────────────────┐
    │    html_extract_json     │
    │         varchar          │
    ├──────────────────────────┤
    │ [{"name":"Product"}]     │
    └──────────────────────────┘

  extended_description: |
    html_query is a DuckDB extension for querying HTML using CSS selectors.

    ## Functions

    | Function | Returns | Description |
    |----------|---------|-------------|
    | `html_query(html, selector?, extract?)` | VARCHAR | First matching element |
    | `html_query_all(html, selector?, extract?)` | VARCHAR[] | All matching elements as list |
    | `html_extract_json(html, selector, var_pattern?)` | JSON array | JSON from script tags |

    All functions accept both VARCHAR and BLOB input types.

    ### Extract Parameter

    The `extract` parameter specifies what to extract from matched elements:

    | Value | Description |
    |-------|-------------|
    | (omitted) | Full HTML of element |
    | `@text` or `text` | Inner text content |
    | `@href`, `href` | href attribute |
    | `@src`, `src` | src attribute |
    | `data-test-id` | Any attribute name |
    | `['@href', '@text']` | Multiple attributes as JSON object |

    ### `html_query(html, selector, extract)`

    Extract first HTML element matching CSS selector.

    **Examples:**
    ```sql
    SELECT html_query(html, 'title', '@text') FROM pages;
    SELECT html_query(html, 'a.nav-link', '@href') FROM pages;
    SELECT html_query(html, 'div.content') FROM pages;  -- returns HTML
    ```

    ### `html_query_all(html, selector, extract)`

    Extract all HTML elements matching CSS selector as VARCHAR[].

    **Examples:**
    ```sql
    SELECT html_query_all(html, 'a', '@href') FROM pages;
    -- Returns: [/home, /about, /contact]

    SELECT list_extract(html_query_all(html, 'a', '@href'), 2) FROM pages;
    -- Access second element

    SELECT html_query_all(html, 'a', ['@href', '@text']) FROM pages;
    -- Returns: [{"href":"/home","text":"Home"}, ...]
    ```

    ### `html_extract_json(html, selector, var_pattern)`

    Extract JSON from HTML scripts. Supports LD+JSON, JavaScript variables, and Next.js RSC.
    Always returns a JSON array.

    **Examples:**
    ```sql
    -- Extract LD+JSON (decodes HTML entities)
    SELECT html_extract_json(html, 'script[type="application/ld+json"]')->0 FROM pages;

    -- Extract JS variable (decodes hex escapes like \x22)
    SELECT html_extract_json(html, 'script', 'var config')->0 FROM pages;

    -- Extract JSON from Next.js RSC payloads by key name
    SELECT html_extract_json(html, 'script', '@nextjs_rsc:productDisplay')->0 FROM pages;
    ```

    ## CSS Selectors

    - Tag: `div`, `p`, `a`
    - Class: `.classname`
    - ID: `#idname`
    - Attribute: `[href]`, `[type="application/ld+json"]`
    - Pseudo: `:first-child`, `:last-child`, `:nth-child(n)`
    - Combinators: `div > p`, `div p`