Skip to content

Commit 4234169

Browse files
authored
Doc Memory updates (#1422)
* DocMemory: HTML import switched to use Markdown as an intermediate format
* Improved chunking to handle large Markdown files with nested content
* Automatically split:
  * Large lists into sub-lists
  * Large tables into sub-tables
* Updated test commands to download from HTTP
1 parent 6bdb8c2 commit 4234169

File tree

6 files changed

+383
-102
lines changed

6 files changed

+383
-102
lines changed

ts/examples/chat/src/memory/knowproDoc.ts

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@ import {
1717
ProgressBar,
1818
StopWatch,
1919
} from "interactive-app";
20-
import { ensureDir, getFileName } from "typeagent";
20+
import { changeFileExt, ensureDir, getFileName, readAllText } from "typeagent";
2121
import {
2222
createIndexingEventHandler,
2323
setKnowledgeExtractorV2,
2424
sourcePathToMemoryIndexPath,
2525
} from "./knowproCommon.js";
2626
import { argSourceFile } from "../common.js";
27+
import { getFileNameFromUrl, toUrl } from "examples-lib";
28+
import { getHtml } from "aiclient";
29+
import * as tp from "textpro";
30+
import { pathToFileURL } from "url";
31+
import chalk from "chalk";
2732

2833
export type KnowproDocContext = {
2934
printer: KnowProPrinter;
@@ -47,6 +52,7 @@ export async function createKnowproDocMemoryCommands(
4752

4853
commands.kpDocImport = docImport;
4954
commands.kpDocLoad = docLoad;
55+
commands.kpDocHtmlToMd = htmlToMd;
5056

5157
function docImportDef(): CommandMetadata {
5258
return {
@@ -126,6 +132,58 @@ export async function createKnowproDocMemoryCommands(
126132
writeDocInfo(context.docMemory);
127133
}
128134

135+
/**
 * Metadata for the kpDocHtmlToMd command.
 *
 * Declares one argument (`path`: a file path or url) and two options:
 * `rootTag` (defaults to "body") and `destPath` (where to save the output).
 */
function htmlToMdDef(): CommandMetadata {
    const metadata: CommandMetadata = {
        description: "Convert Html to MD and save to a file",
        args: {
            path: arg("File path or url"),
        },
        options: {
            rootTag: arg("Root tag to start converting from", "body"),
            destPath: arg("Destination path to save file"),
        },
    };
    return metadata;
}
147+
commands.kpDocHtmlToMd.metadata = htmlToMdDef();
148+
/**
 * Command handler: convert an HTML document to Markdown, print the Markdown,
 * and save it to a file.
 *
 * The source is either an http(s) url, which is downloaded with getHtml, or a
 * local file path, which is read with readAllText. Conversion starts at the
 * `rootTag` option.
 *
 * The output is written to the `destPath` option when given. Otherwise it is
 * written next to the source with a ".md" extension; for a downloaded
 * document, "the source" is the url's file name placed under context.basePath.
 * If neither can be determined, the Markdown is only printed.
 *
 * @param args raw command-line arguments, parsed against htmlToMdDef()
 */
async function htmlToMd(args: string[]) {
    const namedArgs = parseNamedArguments(args, htmlToMdDef());
    // htmlToMdDef() declares the source argument as `path`. Reading only
    // `namedArgs.filePath` (as before) always produced undefined, so the
    // command returned without doing anything. `filePath` is kept as a
    // fallback in case the metadata is ever renamed.
    let filePath = namedArgs.path ?? namedArgs.filePath;
    if (!filePath) {
        return;
    }

    let html = "";
    const srcUrl = toUrl(filePath);
    // Only http(s) urls are downloaded. A Windows absolute path such as
    // "C:\docs\page.html" also parses as a URL (with protocol "c:"), so a
    // plain "is it a URL" test would send local files down the network branch.
    const isRemote =
        srcUrl !== undefined &&
        (srcUrl.protocol === "http:" || srcUrl.protocol === "https:");
    if (srcUrl !== undefined && isRemote) {
        const htmlResult = await getHtml(srcUrl.href);
        if (!htmlResult.success) {
            context.printer.writeError(htmlResult.message);
            return;
        }
        html = htmlResult.data;
        // Derive a local file name from the url so a default destination can
        // be computed below; urls without a file name have no default.
        const fileName = getFileNameFromUrl(srcUrl);
        filePath = fileName
            ? path.join(context.basePath, fileName)
            : undefined;
    } else {
        html = await readAllText(filePath);
    }

    const markdown = tp.htmlToMarkdown(html, namedArgs.rootTag);
    context.printer.writeLine(markdown);

    // An explicit destination wins; otherwise fall back to "<source>.md".
    let destPath = namedArgs.destPath;
    if (!destPath && filePath) {
        destPath = changeFileExt(filePath, ".md");
    }
    if (destPath) {
        fs.writeFileSync(destPath, markdown);
        context.printer.writeInColor(
            chalk.blueBright,
            `Url: ${pathToFileURL(destPath)}`,
        );
    }
}
186+
129187
async function buildDocIndex(namedArgs: NamedArgs): Promise<void> {
130188
if (!context.docMemory) {
131189
return;

ts/examples/chat/src/memory/knowproTest.ts

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import {
1212
ProgressBar,
1313
} from "interactive-app";
1414
import { KnowproContext, searchDef } from "./knowproMemory.js";
15-
import { argDestFile, argSourceFile } from "../common.js";
15+
import { argChunkSize, argDestFile, argSourceFile } from "../common.js";
1616
import * as kp from "knowpro";
1717
import * as kpTest from "knowpro-test";
1818
import * as cm from "conversation-memory";
@@ -25,7 +25,6 @@ import {
2525
} from "typeagent";
2626
import chalk from "chalk";
2727
import { openai } from "aiclient";
28-
import * as fs from "fs";
2928
import {
3029
createIndexingEventHandler,
3130
sourcePathToMemoryIndexPath,
@@ -48,14 +47,14 @@ export async function createKnowproTestCommands(
4847
commands.kpTestVerifyAnswerBatch = verifyAnswerBatch;
4948
commands.kpTestHtml = testHtml;
5049
commands.kpTestHtmlText = testHtmlText;
51-
commands.kpTestHtmlMd = testHtmlMd;
50+
commands.kpTestMdParse = testMdParse;
5251
commands.kpTestHtmlParts = testHtmlParts;
5352
commands.kpTestChoices = testMultipleChoice;
5453
commands.kpTestSearch = testSearchScope;
5554

5655
async function testHtml(args: string[]) {
5756
const html = await readAllText(args[0]);
58-
const simpleHtml = tp.simplifyHtml(html);
57+
const simpleHtml = tp.htmlSimplify(html);
5958
context.printer.writeLine(simpleHtml);
6059
}
6160

@@ -65,45 +64,61 @@ export async function createKnowproTestCommands(
6564
context.printer.writeLine(text);
6665
}
6766

68-
function testHtmlMdDef(): CommandMetadata {
67+
function testMdParseDef(): CommandMetadata {
6968
return {
7069
description: "Html to MD",
7170
args: {
7271
filePath: arg("File path"),
7372
},
7473
options: {
75-
rootTag: arg("Root tag", "body"),
76-
knowledge: argBool("Extract knowledge", true),
74+
chunkSize: argChunkSize(4096),
75+
knowledge: argBool("Show knowledge", false),
7776
},
7877
};
7978
}
80-
commands.kpTestHtmlMd.metadata = testHtmlMdDef();
81-
async function testHtmlMd(args: string[]) {
82-
const namedArgs = parseNamedArguments(args, testHtmlMdDef());
83-
const filePath = namedArgs.filePath;
79+
commands.kpTestMdParse.metadata = testMdParseDef();
80+
async function testMdParse(args: string[]) {
81+
const namedArgs = parseNamedArguments(args, testMdParseDef());
82+
let filePath = namedArgs.filePath;
8483
if (!filePath) {
8584
return;
8685
}
87-
let html = await readAllText(filePath);
88-
let markdown = tp.htmlToMarkdown(html, namedArgs.rootTag);
89-
context.printer.writeLine(markdown);
90-
91-
const destPath = changeFileExt(filePath, ".md");
92-
fs.writeFileSync(destPath, markdown);
93-
94-
if (!namedArgs.knowledge) {
95-
return;
96-
}
97-
98-
let mdDom = tp.tokenizeMarkdown(markdown);
86+
let markdown = await readAllText(filePath);
87+
let mdDom = tp.markdownTokenize(markdown);
9988
//context.printer.writeJsonInColor(chalk.gray, mdDom);
89+
const chunkSize = namedArgs.chunkSize;
90+
const chunkSizeBuffer = chunkSize + chunkSize * 0.25;
10091
const [textBlocks, knowledgeBlocks] =
101-
tp.textAndKnowledgeBlocksFromMarkdown(mdDom);
92+
tp.markdownToTextAndKnowledgeBlocks(mdDom, chunkSize);
10293
assert(textBlocks.length === knowledgeBlocks.length);
94+
let largeChunkCount = 0;
10395
for (let i = 0; i < textBlocks.length; ++i) {
104-
context.printer.writeLine("=====");
105-
context.printer.writeLine(textBlocks[i]);
106-
context.printer.writeJsonInColor(chalk.gray, knowledgeBlocks[i]);
96+
const textBlock = textBlocks[i];
97+
if (textBlock.length > chunkSizeBuffer) {
98+
largeChunkCount++;
99+
context.printer.writeLineInColor(
100+
chalk.redBright,
101+
`[${textBlock.length}]`,
102+
);
103+
} else {
104+
context.printer.writeLineInColor(
105+
chalk.green,
106+
`[${textBlock.length}]`,
107+
);
108+
}
109+
context.printer.writeLine(textBlock);
110+
111+
if (namedArgs.knowledge) {
112+
context.printer.writeJsonInColor(
113+
chalk.gray,
114+
knowledgeBlocks[i],
115+
);
116+
}
117+
}
118+
if (largeChunkCount > 0) {
119+
context.printer.writeError(
120+
`${largeChunkCount} chunks > [${chunkSize}, ${chunkSizeBuffer}]`,
121+
);
107122
}
108123
}
109124

ts/examples/examplesLib/src/file.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,25 @@ export function getAbsolutePath(relativePath: string): string {
2727
export function getTextOrFile(text: string): string {
2828
return isFilePath(text) ? fs.readFileSync(text, "utf-8") : text;
2929
}
30+
31+
/**
 * Try to parse a string as a URL.
 *
 * @param str text to parse
 * @returns the parsed URL, or undefined if the URL constructor rejects `str`
 */
export function toUrl(str: string): URL | undefined {
    let parsed: URL | undefined;
    try {
        parsed = new URL(str);
    } catch {
        // Not a URL: signalled by returning undefined rather than by throwing.
        parsed = undefined;
    }
    return parsed;
}
37+
38+
/**
 * Check whether a string can be parsed as a URL.
 *
 * @param str text to check
 * @returns true when toUrl(str) yields a URL, false otherwise
 */
export function isUrl(str: string): boolean {
    const parsed = toUrl(str);
    return parsed !== undefined;
}
41+
42+
/**
 * Get the file name (the last path segment) of a URL.
 *
 * Only the URL's pathname is examined, so query strings and fragments are not
 * part of the result. The segment is returned as it appears in the pathname;
 * percent-escapes are not decoded.
 *
 * @param url a URL object, or a string to parse as one
 * @returns the last path segment, or undefined when the path has none
 *          (for example "https://host" or a path that ends with "/")
 * @throws TypeError if `url` is a string that the URL constructor rejects
 */
export function getFileNameFromUrl(url: string | URL): string | undefined {
    const parsedUrl = typeof url === "string" ? new URL(url) : url;
    // split() always returns at least one element, so the previous
    // "no parts" checks could never fail and an absent file name came back
    // as "". Report it as undefined, as the return type advertises.
    const lastSegment = parsedUrl.pathname.split("/").pop();
    return lastSegment ? lastSegment : undefined;
}

ts/packages/memory/conversation/src/docImport.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ export async function importTextFile(
5151
case ".htm":
5252
parts = docPartsFromHtml(
5353
docText,
54-
true,
54+
false,
5555
maxCharsPerChunk,
5656
sourceUrl,
5757
);
@@ -175,10 +175,10 @@ export function docPartsFromHtml(
175175
*/
176176
export function docPartsFromMarkdown(
177177
markdown: string,
178-
maxCharsPerChunk?: number,
178+
maxCharsPerChunk: number,
179179
sourceUrl?: string,
180180
): DocPart[] {
181-
const [textBlocks, knowledgeBlocks] = tp.textAndKnowledgeBlocksFromMarkdown(
181+
const [textBlocks, knowledgeBlocks] = tp.markdownToTextAndKnowledgeBlocks(
182182
markdown,
183183
maxCharsPerChunk,
184184
);

ts/packages/textPro/src/html.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export function htmlToText(html: string): string {
2424
* @param html html text
2525
* @returns simplified html
2626
*/
27-
export function simplifyHtml(html: string): string {
27+
export function htmlSimplify(html: string): string {
2828
const editor = new HtmlEditor(html);
2929
editor.simplify();
3030
return editor.getHtml();

0 commit comments

Comments
 (0)