Skip to content

Commit 7e4e4dc

Browse files
committed
chore: move timescale docs import into repo
Signed-off-by: Matthew Peveler <mpeveler@timescale.com>
1 parent 7339ee3 commit 7e4e4dc

File tree

10 files changed

+2540
-6
lines changed

10 files changed

+2540
-6
lines changed

.env.sample

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ PGHOST=postgres
33
PGDATABASE=tsdb
44
PGPORT=5432
55
PGUSER=readonly_mcp_user
6-
PGPASSWORD=todo
6+
PGPASSWORD=todo
7+
DB_SCHEMA=docs

import/pyproject.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
[project]
2+
name = "docs-importer"
3+
version = "0.1.0"
4+
description = "Add your description here"
5+
readme = "README.md"
6+
requires-python = ">=3.13"
7+
dependencies = [
8+
"bs4>=0.0.2",
9+
"langchain-text-splitters>=0.3.9",
10+
"markdownify>=1.1.0",
11+
"openai>=1.97.1",
12+
"psycopg[binary,pool]>=3.2.9",
13+
"python-dotenv[cli]>=1.1.1",
14+
"scrapy>=2.13.3",
15+
]

import/timescale_docs.py

Lines changed: 1066 additions & 0 deletions
Large diffs are not rendered by default.

import/timescale_docs_config.toml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Configuration for domain-specific element removal
2+
# Add CSS selectors to ignore for each domain
3+
4+
[domain_selectors]
5+
docs_tigerdata_com = [
6+
"script",
7+
"style",
8+
"nav",
9+
"footer",
10+
"#plan-availability",
11+
".sr-only",
12+
".code-block-copy-button"
13+
]
14+
# Add more domains as needed
15+
16+
# Default selectors applied to all domains
17+
[default_selectors]
18+
selectors = [
19+
"script",
20+
"style",
21+
"nav",
22+
"footer"
23+
]

import/uv.lock

Lines changed: 1075 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import 'dotenv/config';
2+
import { Client } from 'pg';
3+
4+
// Target schema for every table in this migration. The value is interpolated
// directly into the SQL text below (identifiers cannot be bound as query
// parameters), so it must be a trusted, valid schema name.
const schema = process.env.DB_SCHEMA;
if (!schema) {
  throw new Error('DB_SCHEMA is not defined');
}

// Human-readable summary of what `up()` creates.
export const description =
  'Create docs tables (postgres, timescale_pages, timescale_chunks)';
10+
11+
/**
 * Applies the migration: enables the `vector` extension and creates the
 * `postgres`, `timescale_pages` and `timescale_chunks` tables inside the
 * configured schema, all within a single transaction.
 *
 * Every statement uses `IF NOT EXISTS`, so re-running the migration against a
 * database that already has these objects is a no-op rather than an error.
 *
 * @returns {Promise<void>} Resolves once the transaction has been committed.
 * @throws Rethrows the original connection/query error; if the failure occurs
 *   inside the transaction, a rollback is attempted first.
 */
export async function up() {
  // No explicit config: pg falls back to its defaults (PG* environment variables).
  const client = new Client();

  try {
    await client.connect();
    await client.query('BEGIN');

    try {
      await client.query(/* sql */ `
        CREATE EXTENSION IF NOT EXISTS vector;

        CREATE TABLE IF NOT EXISTS ${schema}.postgres(
          id int8 NOT NULL PRIMARY KEY generated by default as identity
        , version int2 NOT NULL
        , header text NOT NULL
        , header_path text[] NOT NULL
        , header_depth int4 NOT NULL
        , content text NOT NULL
        , token_count int8 NOT NULL
        , embedding vector(1536)
        );

        CREATE TABLE IF NOT EXISTS ${schema}.timescale_pages (
          id int4 PRIMARY KEY generated by default as identity
        , url TEXT UNIQUE NOT NULL
        , domain TEXT NOT NULL
        , filename TEXT NOT NULL
        , content_length INTEGER
        , scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        , chunking_method TEXT DEFAULT 'header'
        , chunks_count INTEGER DEFAULT 0
        );

        CREATE TABLE IF NOT EXISTS ${schema}.timescale_chunks (
          id int4 PRIMARY KEY generated by default as identity
        , page_id INTEGER REFERENCES ${schema}.timescale_pages(id) ON DELETE CASCADE
        , chunk_index INTEGER NOT NULL
        , sub_chunk_index INTEGER NOT NULL DEFAULT 0
        , content TEXT NOT NULL
        , metadata JSONB
        , embedding vector(1536)
        , created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
      `);
      await client.query('COMMIT');
    } catch (err) {
      // Only roll back once a transaction is actually open, and never let a
      // failing ROLLBACK replace the error that caused it.
      try {
        await client.query('ROLLBACK');
      } catch {
        // Intentionally ignored: the original error below is the useful one.
      }
      throw err;
    }
  } finally {
    // Always release the connection, whether the migration succeeded or not.
    await client.end();
  }
}
61+
62+
/**
 * Reverts the migration by dropping the tables that `up()` creates.
 *
 * Tables are dropped child-first (`timescale_chunks` references
 * `timescale_pages`). The `vector` extension is deliberately left installed
 * because other objects in the database may depend on it.
 *
 * @returns {Promise<void>} Resolves once the tables have been dropped.
 * @throws Propagates any connection or query error from the database client.
 */
export async function down() {
  // No explicit config: pg falls back to its defaults (PG* environment variables).
  const client = new Client();

  try {
    await client.connect();
    await client.query(/* sql */ `
      DROP TABLE IF EXISTS ${schema}.timescale_chunks;
      DROP TABLE IF EXISTS ${schema}.timescale_pages;
      DROP TABLE IF EXISTS ${schema}.postgres;
    `);
  } finally {
    // Always release the connection, whether the drop succeeded or not.
    await client.end();
  }
}

package-lock.json

Lines changed: 161 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)