Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions src/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1197,3 +1197,55 @@ describe("mcp http daemon", () => {
try { require("fs").unlinkSync(pidPath()); } catch {}
});
});

// End-to-end CLI coverage for the embed file-size limit:
// status output, embed-time skipping, the --no-size-limit flag, and help text.
describe("CLI Embed File Size Limit", () => {
  test("status shows skipped count when files exceed size limit", async () => {
    const testEnv = await createIsolatedTestEnv("sizelimit");
    await runQmd(["collection", "add", "."], { ...testEnv });
    // A 1-byte limit guarantees every indexed file is over the threshold.
    // NOTE(review): the `env` key here replaces any `env` entry spread from
    // testEnv — confirm createIsolatedTestEnv does not rely on its own env vars.
    const result = await runQmd(["status"], {
      ...testEnv,
      env: { QMD_MAX_EMBED_FILE_BYTES: "1" },
    });
    expect(result.exitCode).toBe(0);
    expect(result.stdout).toContain("Skipped");
    expect(result.stdout).toContain("exceed");
  });

  test("status shows no skipped line when files are under default limit", async () => {
    const testEnv = await createIsolatedTestEnv("sizelimit-default");
    await runQmd(["collection", "add", "."], { ...testEnv });
    const result = await runQmd(["status"], { ...testEnv });
    expect(result.exitCode).toBe(0);
    // Default limit is 5MB; nothing in the fixture should exceed it.
    expect(result.stdout).not.toContain("Skipped");
  });

  test("embed skips files exceeding size limit", async () => {
    const testEnv = await createIsolatedTestEnv("embed-skip");
    await runQmd(["collection", "add", "."], { ...testEnv });
    const result = await runQmd(["embed"], {
      ...testEnv,
      env: { QMD_MAX_EMBED_FILE_BYTES: "1" },
    });
    // Per-file skips go to stderr; the summary and final status go to stdout.
    expect(result.exitCode).toBe(0);
    expect(result.stderr).toContain("Skipping");
    expect(result.stdout).toContain("skipped");
    expect(result.stdout).toContain("No non-empty documents to embed");
  });

  test("embed --no-size-limit does not skip files", async () => {
    const testEnv = await createIsolatedTestEnv("embed-nolimit");
    await runQmd(["collection", "add", "."], { ...testEnv });
    const result = await runQmd(["embed", "--no-size-limit"], {
      ...testEnv,
      env: { QMD_MAX_EMBED_FILE_BYTES: "1" },
    });
    // With --no-size-limit, no files should be skipped (even with tiny env limit)
    expect(result.stderr).not.toContain("Skipping");
  });

  test("help text mentions --no-size-limit", async () => {
    const help = await runQmd(["--help"]);
    expect(help.exitCode).toBe(0);
    expect(help.stdout).toContain("--no-size-limit");
  });
});
76 changes: 76 additions & 0 deletions src/embed-config.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* embed-config.test.ts - Tests for embed configuration helpers
*
* Run with: bun test embed-config.test.ts
*/

import { describe, test, expect, beforeEach, afterEach } from "bun:test";
import { getMaxEmbedFileBytes } from "./qmd.js";
import { DEFAULT_MAX_EMBED_FILE_BYTES } from "./store.js";

/**
 * Unit tests for getMaxEmbedFileBytes(): QMD_MAX_EMBED_FILE_BYTES must parse
 * to a finite, strictly positive number; anything else falls back to
 * DEFAULT_MAX_EMBED_FILE_BYTES (5MB).
 */
describe("getMaxEmbedFileBytes", () => {
  // Snapshot of the real env var so tests cannot leak state into each other.
  let savedValue: string | undefined;

  beforeEach(() => {
    savedValue = process.env.QMD_MAX_EMBED_FILE_BYTES;
    delete process.env.QMD_MAX_EMBED_FILE_BYTES;
  });

  afterEach(() => {
    // Restore (or fully remove) the variable exactly as it was before the test.
    if (savedValue === undefined) {
      delete process.env.QMD_MAX_EMBED_FILE_BYTES;
    } else {
      process.env.QMD_MAX_EMBED_FILE_BYTES = savedValue;
    }
  });

  test("returns default when env var is unset", () => {
    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
    expect(getMaxEmbedFileBytes()).toBe(5 * 1024 * 1024);
  });

  test("respects valid numeric env var", () => {
    process.env.QMD_MAX_EMBED_FILE_BYTES = "1048576"; // 1MB
    expect(getMaxEmbedFileBytes()).toBe(1048576);
  });

  test("respects large values", () => {
    process.env.QMD_MAX_EMBED_FILE_BYTES = "10485760"; // 10MB
    expect(getMaxEmbedFileBytes()).toBe(10485760);
  });

  test("floors fractional values to integer", () => {
    process.env.QMD_MAX_EMBED_FILE_BYTES = "1500.7";
    expect(getMaxEmbedFileBytes()).toBe(1500);
  });

  // Every malformed value below must fall back to the default.
  const invalidInputs: Array<[label: string, value: string]> = [
    ["non-numeric string", "abc"],
    ["empty string", ""],
    ["zero", "0"],
    ["negative value", "-100"],
    ["Infinity", "Infinity"],
    ["NaN", "NaN"],
  ];
  for (const [label, value] of invalidInputs) {
    test(`falls back to default for ${label}`, () => {
      process.env.QMD_MAX_EMBED_FILE_BYTES = value;
      expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
    });
  }
});
55 changes: 47 additions & 8 deletions src/qmd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import {
isDocid,
matchFilesByGlob,
getHashesNeedingEmbedding,
getEmbedBreakdown,
getHashesForEmbedding,
clearAllEmbeddings,
insertEmbedding,
Expand Down Expand Up @@ -62,6 +63,7 @@ import {
DEFAULT_RERANK_MODEL,
DEFAULT_GLOB,
DEFAULT_MULTI_GET_MAX_BYTES,
DEFAULT_MAX_EMBED_FILE_BYTES,
createStore,
getDefaultDbPath,
} from "./store.js";
Expand Down Expand Up @@ -269,7 +271,8 @@ function showStatus(): void {
// Overall stats
const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
const needsEmbedding = getHashesNeedingEmbedding(db);
const maxEmbedSize = getMaxEmbedFileBytes();
const { needsEmbedding, tooLarge } = getEmbedBreakdown(db, maxEmbedSize);

// Most recent update across all collections
const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
Expand Down Expand Up @@ -301,6 +304,9 @@ function showStatus(): void {
if (needsEmbedding > 0) {
console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
}
if (tooLarge > 0) {
console.log(` ${c.dim}Skipped: ${tooLarge} exceed ${formatBytes(maxEmbedSize)} size limit${c.reset}`);
}
if (mostRecent.latest) {
const lastUpdate = new Date(mostRecent.latest);
console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
Expand Down Expand Up @@ -1482,7 +1488,20 @@ function renderProgressBar(percent: number, width: number = 30): string {
return bar;
}

async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
/**
 * Resolve the per-file embed size limit, in bytes.
 *
 * Reads QMD_MAX_EMBED_FILE_BYTES from the environment. The value must parse
 * to a finite, strictly positive number; fractional values are floored.
 * Unset or empty values fall back to DEFAULT_MAX_EMBED_FILE_BYTES silently;
 * otherwise-invalid values fall back with a warning on stderr.
 */
export function getMaxEmbedFileBytes(): number {
  const raw = process.env.QMD_MAX_EMBED_FILE_BYTES;
  // Unset or empty string: use the default without warning.
  if (!raw) return DEFAULT_MAX_EMBED_FILE_BYTES;

  const value = Number(raw);
  if (Number.isFinite(value) && value > 0) {
    return Math.floor(value);
  }

  // Non-numeric, NaN, Infinity, zero, or negative: warn and fall back.
  process.stderr.write(
    `${c.yellow}Warning: Invalid QMD_MAX_EMBED_FILE_BYTES="${raw}", using default ${formatBytes(DEFAULT_MAX_EMBED_FILE_BYTES)}${c.reset}\n`
  );
  return DEFAULT_MAX_EMBED_FILE_BYTES;
}

async function vectorIndex({ model = DEFAULT_EMBED_MODEL, force = false, noSizeLimit = false }: { model?: string; force?: boolean; noSizeLimit?: boolean } = {}): Promise<void> {
const db = getDb();
const now = new Date().toISOString();

Expand All @@ -1507,12 +1526,23 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
let multiChunkDocs = 0;

// Chunk all documents using actual token counts
const maxEmbedSize = noSizeLimit ? Infinity : getMaxEmbedFileBytes();
let skippedFiles = 0;

process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`);
for (const item of hashesToEmbed) {
const encoder = new TextEncoder();
const bodyBytes = encoder.encode(item.body).length;
const bodyBytes = Buffer.byteLength(item.body, 'utf8');
if (bodyBytes === 0) continue; // Skip empty

// Content size limit check
if (bodyBytes > maxEmbedSize) {
process.stderr.write(
`${c.yellow}Skipping ${item.path} (${formatBytes(bodyBytes)} exceeds ${formatBytes(maxEmbedSize)} limit)${c.reset}\n`
);
skippedFiles++;
continue;
}

const title = extractTitle(item.body, item.path);
const displayName = item.path;
const chunks = await chunkDocumentByTokens(item.body); // Uses actual tokenizer
Expand All @@ -1527,12 +1557,16 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
seq,
pos: chunks[seq]!.pos,
tokens: chunks[seq]!.tokens,
bytes: encoder.encode(chunks[seq]!.text).length,
bytes: Buffer.byteLength(chunks[seq]!.text, 'utf8'),
displayName,
});
}
}

if (skippedFiles > 0) {
console.log(`${c.yellow}${skippedFiles} file(s) skipped (exceeded ${formatBytes(maxEmbedSize)} file size limit). Use --no-size-limit to include all files.${c.reset}`);
}

if (allChunks.length === 0) {
console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
closeDb();
Expand All @@ -1541,7 +1575,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =

const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
const totalChunks = allChunks.length;
const totalDocs = hashesToEmbed.length;
const totalDocs = hashesToEmbed.length - skippedFiles;

console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
if (multiChunkDocs > 0) {
Expand Down Expand Up @@ -2046,6 +2080,7 @@ function parseCLI() {
mask: { type: "string" }, // glob pattern
// Embed options
force: { type: "boolean", short: "f" },
"no-size-limit": { type: "boolean" },
// Update options
pull: { type: "boolean" }, // git pull before update
refresh: { type: "boolean" },
Expand Down Expand Up @@ -2116,7 +2151,7 @@ function showHelp(): void {
console.log(" qmd multi-get <pattern> [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list");
console.log(" qmd status - Show index status and collections");
console.log(" qmd update [--pull] - Re-index all collections (--pull: git pull first)");
console.log(" qmd embed [-f] - Create vector embeddings (800 tokens/chunk, 15% overlap)");
console.log(" qmd embed [-f] [--no-size-limit] - Create vector embeddings (800 tokens/chunk, 15% overlap)");
console.log(" qmd cleanup - Remove cache and orphaned data, vacuum DB");
console.log(" qmd search <query> - Full-text search (BM25)");
console.log(" qmd vsearch <query> - Vector similarity search");
Expand Down Expand Up @@ -2147,6 +2182,10 @@ function showHelp(): void {
console.log(" --max-bytes <num> - Skip files larger than N bytes (default: 10240)");
console.log(" --json/--csv/--md/--xml/--files - Output format (same as search)");
console.log("");
console.log("Embed options:");
console.log(" -f, --force - Force re-index all embeddings");
console.log(" --no-size-limit - Embed all files regardless of size (default limit: 5MB)");
console.log("");
console.log("Models (auto-downloaded from HuggingFace):");
console.log(" Embedding: embeddinggemma-300M-Q8_0");
console.log(" Reranking: qwen3-reranker-0.6b-q8_0");
Expand Down Expand Up @@ -2333,7 +2372,7 @@ if (import.meta.main) {
break;

case "embed":
await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force);
await vectorIndex({ force: !!cli.values.force, noSizeLimit: !!cli.values["no-size-limit"] });
break;

case "pull": {
Expand Down
57 changes: 57 additions & 0 deletions src/store.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import {
isDocid,
STRONG_SIGNAL_MIN_SCORE,
STRONG_SIGNAL_MIN_GAP,
getEmbedBreakdown,
type Store,
type DocumentResult,
type SearchResult,
Expand Down Expand Up @@ -2625,3 +2626,59 @@ describe("isDocid", () => {
expect(isDocid("abc123.md")).toBe(false);
});
});

// Tests for getEmbedBreakdown(): pending documents are split into those small
// enough to embed vs. those over the byte limit; already-embedded and inactive
// documents are excluded from both counts.
describe("getEmbedBreakdown", () => {
  let store: Store;

  beforeEach(async () => {
    store = await createTestStore();
  });

  afterEach(async () => {
    await cleanupTestDb(store);
  });

  test("all docs need embedding when under size limit", async () => {
    await insertTestDocument(store.db, "col", { name: "a", body: "short" });
    await insertTestDocument(store.db, "col", { name: "b", body: "also short" });
    const breakdown = getEmbedBreakdown(store.db, 1_000_000);
    expect(breakdown.needsEmbedding).toBe(2);
    expect(breakdown.tooLarge).toBe(0);
  });

  test("all docs too large when over size limit", async () => {
    await insertTestDocument(store.db, "col", { name: "a", body: "some content here" });
    await insertTestDocument(store.db, "col", { name: "b", body: "more content here" });
    const breakdown = getEmbedBreakdown(store.db, 1); // 1 byte limit
    expect(breakdown.needsEmbedding).toBe(0);
    expect(breakdown.tooLarge).toBe(2);
  });

  test("mixed sizes split correctly", async () => {
    await insertTestDocument(store.db, "col", { name: "small", body: "hi" });
    await insertTestDocument(store.db, "col", { name: "large", body: "x".repeat(500) });
    const breakdown = getEmbedBreakdown(store.db, 100);
    expect(breakdown.needsEmbedding).toBe(1);
    expect(breakdown.tooLarge).toBe(1);
  });

  test("already embedded docs are excluded", async () => {
    const body = "embedded content";
    const hash = await hashContent(body);
    await insertTestDocument(store.db, "col", { name: "emb", body, hash });
    // Simulate an existing embedding row so the doc no longer counts as pending.
    store.db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'test', datetime('now'))`).run(hash);
    const breakdown = getEmbedBreakdown(store.db, 1_000_000);
    expect(breakdown.needsEmbedding).toBe(0);
    expect(breakdown.tooLarge).toBe(0);
  });

  test("inactive docs are excluded", async () => {
    await insertTestDocument(store.db, "col", { name: "inactive", body: "content", active: 0 });
    const breakdown = getEmbedBreakdown(store.db, 1_000_000);
    expect(breakdown.needsEmbedding).toBe(0);
    expect(breakdown.tooLarge).toBe(0);
  });
});
14 changes: 14 additions & 0 deletions src/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
export const DEFAULT_GLOB = "**/*.md";
export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
export const DEFAULT_MAX_EMBED_FILE_BYTES = 5 * 1024 * 1024; // 5MB

// Chunking: 800 tokens per chunk with 15% overlap
export const CHUNK_SIZE_TOKENS = 800;
Expand Down Expand Up @@ -913,6 +914,19 @@ export function getHashesNeedingEmbedding(db: Database): number {
return result.count;
}

/**
 * Count active documents still awaiting embedding, split by whether their
 * content fits under the per-file size limit.
 *
 * A document "needs embedding" when it is active and has no row in
 * content_vectors (checked via the seq = 0 chunk). Documents whose content
 * exceeds `maxBytes` are reported separately as `tooLarge` so callers
 * (e.g. `qmd status`) can surface how many files the embedder will skip.
 *
 * @param db       Open database handle.
 * @param maxBytes Size limit in bytes, compared against the UTF-8 byte
 *                 length of the stored content.
 * @returns Counts of pending-and-embeddable vs. pending-but-oversized docs.
 */
export function getEmbedBreakdown(db: Database, maxBytes: number): { needsEmbedding: number; tooLarge: number } {
  // CAST(c.doc AS BLOB) makes SQLite's LENGTH() return the UTF-8 byte count.
  // Plain LENGTH() on TEXT counts *characters*, which would disagree with the
  // Buffer.byteLength(..., 'utf8') check used at embed time whenever content
  // contains multi-byte characters — status and embed would then report
  // different skip counts.
  const result = db.prepare(`
    SELECT
      COUNT(DISTINCT CASE WHEN LENGTH(CAST(c.doc AS BLOB)) <= ? THEN d.hash END) as needs_embedding,
      COUNT(DISTINCT CASE WHEN LENGTH(CAST(c.doc AS BLOB)) > ? THEN d.hash END) as too_large
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
  `).get(maxBytes, maxBytes) as { needs_embedding: number; too_large: number };
  return { needsEmbedding: result.needs_embedding, tooLarge: result.too_large };
}

export type IndexHealthInfo = {
needsEmbedding: number;
totalDocs: number;
Expand Down