diff --git a/README.md b/README.md
index 0ac087e..9dd304d 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,7 @@ parse(html: string, options?: Options): ParseResult;
 - `html`: HTML string to parse.
 - `options (optional)`
   - `tokenAdapter`: The adapter option for changing tokens information.
+  - `rawContentTags` (string[]): Specifies tag names whose child contents should be treated as raw text, meaning the parser will not interpret characters like `<` and `>` as HTML syntax inside these tags.
 
 **Returns**
 
diff --git a/src/constants/tokenizer-context-types.ts b/src/constants/tokenizer-context-types.ts
index 23f0ca7..2031e9b 100644
--- a/src/constants/tokenizer-context-types.ts
+++ b/src/constants/tokenizer-context-types.ts
@@ -18,4 +18,5 @@ export enum TokenizerContextTypes {
   CommentOpen = "CommentOpen",
   CommentContent = "CommentContent",
   CommentClose = "CommentClose",
+  CustomTagRawContent = "CustomTagRawContent",
 }
diff --git a/src/parser/parse.ts b/src/parser/parse.ts
index 35b9491..e487cbe 100644
--- a/src/parser/parse.ts
+++ b/src/parser/parse.ts
@@ -7,7 +7,7 @@ import { Options } from "../types/parse";
 
 export function parse(html: string, options?: Options): ParseResult {
   const tokenAdapter = (options && options.tokenAdapter) || defaultTokenAdapter;
-  const { tokens } = tokenize(html, tokenAdapter, options?.templateInfos);
+  const { tokens } = tokenize(html, tokenAdapter, options);
   const { ast } = constructTree(tokens);
   return {
     ast: clearParent(ast),
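With the whole options object now forwarded to the tokenizer, callers can combine `templateInfos` and the new `rawContentTags` in a single `Options` value passed to the documented `parse` entry point. A minimal usage sketch — the `es-html-parser` import name is assumed from the package this repository publishes, and the sample input is illustrative:

```ts
import { parse } from "es-html-parser";

// "<int>" is never tokenized as an open tag: everything between
// <markdown> and </markdown> is accumulated as one raw Text node.
const { ast } = parse("<markdown>std::vector<int></markdown>", {
  rawContentTags: ["markdown"],
});
```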
diff --git a/src/tokenizer/__tests__/__input__/custom-tag-raw-content.html b/src/tokenizer/__tests__/__input__/custom-tag-raw-content.html
new file mode 100644
index 0000000..2addf7f
--- /dev/null
+++ b/src/tokenizer/__tests__/__input__/custom-tag-raw-content.html
@@ -0,0 +1,17 @@
+<markdown>
+# Hello, world!
+
+```cpp{4-6,9}
+#include <iostream>
+
+class Example {
+  Example() {
+    std::cout << "Hello, world!" << std::endl;
+  }
+
+  Example(std::string name) {
+    std::cout << "Hello, " << name << std::endl;
+  }
+};
+```
+</markdown>
\ No newline at end of file
diff --git a/src/tokenizer/__tests__/__output__/custom-tag-raw-content.ts b/src/tokenizer/__tests__/__output__/custom-tag-raw-content.ts
new file mode 100644
index 0000000..6264a5b
--- /dev/null
+++ b/src/tokenizer/__tests__/__output__/custom-tag-raw-content.ts
@@ -0,0 +1,161 @@
+import { TokenTypes } from "../../../constants";
+import { AnyToken } from "../../../types";
+
+const OUTPUT: AnyToken[] = [
+  {
+    type: TokenTypes.OpenTagStart,
+    value: "<markdown",
+    range: [0, 9],
+    loc: {
+      start: {
+        column: 0,
+        line: 1,
+      },
+      end: {
+        line: 1,
+        column: 9,
+      },
+    },
+  },
+  {
+    type: TokenTypes.OpenTagEnd,
+    value: ">",
+    range: [18, 19],
+    loc: {
+      start: {
+        column: 18,
+        line: 1,
+      },
+      end: {
+        line: 1,
+        column: 19,
+      },
+    },
+  },
+  {
+    type: TokenTypes.Text,
+    value: `
+# Hello, world!
+
+\`\`\`cpp{4-6,9}
+#include <iostream>
+
+class Example {
+  Example() {
+    std::cout << "Hello, world!" << std::endl;
+  }
+
+  Example(std::string name) {
+    std::cout << "Hello, " << name << std::endl;
+  }
+};
+\`\`\`
+`,
+    range: [19, 260],
+    loc: {
+      start: {
+        column: 19,
+        line: 1,
+      },
+      end: {
+        line: 17,
+        column: 0,
+      },
+    },
+    parts: [],
+  },
+  {
+    type: TokenTypes.CloseTag,
+    value: "</markdown>",
+    range: [260, 271],
+    loc: {
+      start: {
+        column: 0,
+        line: 17,
+      },
+      end: {
+        line: 17,
+        column: 11,
+      },
+    },
+  },
+];
+
+export default OUTPUT;
diff --git a/src/tokenizer/__tests__/tokenize.spec.ts b/src/tokenizer/__tests__/tokenize.spec.ts
index 853a8bb..91e02a2 100644
--- a/src/tokenizer/__tests__/tokenize.spec.ts
+++ b/src/tokenizer/__tests__/tokenize.spec.ts
@@ -1,6 +1,6 @@
 import * as fs from "fs";
 import * as path from "path";
-import { tokenize } from "../tokenize";
+import { tokenize, TokenizeOptions } from "../tokenize";
 import OPENING_CLOSING_TEXT from "./__output__/opening-closing-text";
 import NESTED_TAGS from "./__output__/nested-tags";
 import COMMENTS from "./__output__/comments";
@@ -30,7 +30,7 @@ import TEMPLATE_COMMENT from "./__output__/templates-comment";
 import TEMPLATE_SCRIPT_CONTENT from "./__output__/templates-script-content";
 import TEMPLATE_STYLE_CONTENT from "./__output__/templates-style-content";
 import TEMPLATE_CONTENT_END from "./__output__/templates-content-end";
-
+import CUSTOM_TAG_RAW_CONTENT from "./__output__/custom-tag-raw-content";
 import { defaultTokenAdapter } from "../../token-adapter";
 import { Range, TemplateInfo } from "../../types";
 
@@ -98,78 +98,107 @@ describe("tokenize", () => {
       "templates-attributes-key.html",
       TEMPLATE_ATTRIBUTES_KEY,
       null,
-      [[5, 11]] as Range[],
+      {
+        templateInfos: [[5, 11]] as Range[],
+      },
     ],
     [
       "Template Attributes Key (wrapper)",
       "templates-attributes-key.html",
       TEMPLATE_ATTRIBUTES_KEY_WRAPPER,
       null,
-      [
-        {
-          open: [5, 7],
-          close: [10, 11],
-        },
-      ] as TemplateInfo[],
+      {
+        templateInfos: [
+          {
+            open: [5, 7],
+            close: [10, 11],
+          },
+        ] as TemplateInfo[],
+      },
     ],
     [
       "Template Attributes Value Bare",
       "templates-attributes-value-bare.html",
       TEMPLATE_ATTRIBUTES_VALUE_BARE,
       null,
-      [[8, 13]] as Range[],
+      {
+        templateInfos: [[8, 13]] as Range[],
+      },
     ],
     [
       "Template Attributes Value Wrapped",
      "templates-attributes-value-wrapped.html",
       TEMPLATE_ATTRIBUTES_VALUE_WRAPPED,
       null,
-      [[9, 14]] as Range[],
+      {
+        templateInfos: [[9, 14]] as Range[],
+      },
     ],
     [
       "Template Attributes Value Wrapped 2",
       "templates-attributes-value-wrapped-2.html",
       TEMPLATE_ATTRIBUTES_VALUE_WRAPPED_2,
       null,
-      [
-        [16, 22],
-        [23, 31],
-      ] as Range[],
+      {
+        templateInfos: [
+          [16, 22],
+          [23, 31],
+        ] as Range[],
+      },
     ],
     [
       "Templates Data",
       "templates-data.html",
       TEMPLATE_DATA,
       null,
-      [[5, 16]] as Range[],
+      {
+        templateInfos: [[5, 16]] as Range[],
+      },
     ],
     [
       "Templates Comment",
       "templates-comment.html",
       TEMPLATE_COMMENT,
       null,
-      [[4, 14]] as Range[],
+      {
+        templateInfos: [[4, 14]] as Range[],
+      },
     ],
     [
       "Templates Script Content",
       "templates-script-content.html",
       TEMPLATE_SCRIPT_CONTENT,
       null,
-      [[8, 18]] as Range[],
+      {
+        templateInfos: [[8, 18]] as Range[],
+      },
     ],
     [
       "Templates Style Content",
       "templates-style-content.html",
       TEMPLATE_STYLE_CONTENT,
       null,
-      [[7, 17]] as Range[],
+      {
+        templateInfos: [[7, 17]] as Range[],
+      },
     ],
     [
       "Templates Content End",
       "templates-content-end.html",
       TEMPLATE_CONTENT_END,
       null,
-      [[0, 10]] as Range[],
+      {
+        templateInfos: [[0, 10]] as Range[],
+      },
+    ],
+    [
+      "Custom Tag Raw Content",
+      "custom-tag-raw-content.html",
+      CUSTOM_TAG_RAW_CONTENT,
+      null,
+      {
+        rawContentTags: ["markdown"],
+      },
     ],
   ])(
     "%s",
@@ -178,18 +207,14 @@ describe("tokenize", () => {
       input,
       output,
       process: null | ((html: string) => string) = null,
-      ranges: null | TemplateInfo[]
+      options: TokenizeOptions | null = null
     ) => {
       const inputPath = path.join(__dirname, "__input__", input);
       let html = fs.readFileSync(inputPath, "utf-8");
       if (process) {
         html = process(html);
       }
-      const { tokens } = tokenize(
-        html,
-        defaultTokenAdapter,
-        ranges ?? undefined
-      );
+      const { tokens } = tokenize(html, defaultTokenAdapter, options || {});
       expect(tokens).toEqual(output);
     }
   );
diff --git a/src/tokenizer/handlers/custom-tag-raw-content.ts b/src/tokenizer/handlers/custom-tag-raw-content.ts
new file mode 100644
index 0000000..9ad3dea
--- /dev/null
+++ b/src/tokenizer/handlers/custom-tag-raw-content.ts
@@ -0,0 +1,63 @@
+import {
+  TokenizerContextTypes,
+  INCOMPLETE_CLOSING_TAG_PATTERN,
+  TokenTypes,
+} from "../../constants";
+import { calculateTokenPosition, createParts } from "../../utils";
+import { Range, TokenizerState } from "../../types";
+import { CharsBuffer } from "../chars-buffer";
+
+export function parse(chars: CharsBuffer, state: TokenizerState) {
+  if (
+    chars.value() === "<" ||
+    chars.value() === "</" ||
+    INCOMPLETE_CLOSING_TAG_PATTERN.test(chars.value())
+  ) {
+    state.sourceCode.next();
+    return;
+  }
+  const tagName =
+    state.contextParams[TokenizerContextTypes.CustomTagRawContent]?.tagName;
+  const regex = new RegExp(
+    "</" + tagName + "\\s*>"
+  );
+
+  if (regex.test(chars.value())) {
+    return parseClosingCustomTag(state);
+  }
+
+  state.accumulatedContent.concatBuffer(state.decisionBuffer);
+  state.decisionBuffer.clear();
+  state.sourceCode.next();
+}
+
+function parseClosingCustomTag(state: TokenizerState) {
+  if (state.accumulatedContent.value() !== "") {
+    const position = calculateTokenPosition(state, { keepBuffer: false });
+    state.tokens.push({
+      type: TokenTypes.Text,
+      value: state.accumulatedContent.value(),
+      range: position.range,
+      loc: position.loc,
+      parts: createParts(state, TokenTypes.Text),
+    });
+  }
+
+  const range: Range = [
+    state.sourceCode.index() - (state.decisionBuffer.length() - 1),
+    state.sourceCode.index() + 1,
+  ];
+
+  state.tokens.push({
+    type: TokenTypes.CloseTag,
+    value: state.decisionBuffer.value(),
+    range,
+    loc: state.sourceCode.getLocationOf(range),
+  });
+
+  state.accumulatedContent.clear();
+  state.decisionBuffer.clear();
+  state.contextParams[TokenizerContextTypes.CustomTagRawContent] = undefined;
+  state.currentContext = TokenizerContextTypes.Data;
+  state.sourceCode.next();
+}
diff --git a/src/tokenizer/handlers/index.ts b/src/tokenizer/handlers/index.ts
index cbd182b..5156b86 100644
--- a/src/tokenizer/handlers/index.ts
+++ b/src/tokenizer/handlers/index.ts
@@ -17,6 +17,7 @@ export * as openTagEnd from "./open-tag-end";
 export * as openTagStart from "./open-tag-start";
 export * as scriptTagContent from "./script-tag-content";
 export * as styleTagContent from "./style-tag-content";
+export * as customTagRawContent from "./custom-tag-raw-content";
 export const noop: TokenizeHandler = {
   parse: () => void 0,
 };
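The new handler follows the same shape as the existing `scriptTagContent`/`styleTagContent` handlers: while the tokenizer sits in the `CustomTagRawContent` context it keeps folding the decision buffer into `accumulatedContent`, and only a buffer matching `</tagName\s*>` ends the region, at which point one `Text` token and one `CloseTag` token are pushed. A hedged illustration of that closing check, with the tag name hard-coded for the example:

```ts
// "\\s*" in the source string becomes \s* in the pattern, so whitespace
// is allowed between the tag name and ">".
const closing = new RegExp("</" + "markdown" + "\\s*>");

closing.test("</markdown>");   // true
closing.test("</markdown  >"); // true
closing.test("</markdownx>");  // false: "x" is neither whitespace nor ">"
```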
contextsMap["default"]; + } + state.sourceCode.next(); state.contextParams[TokenizerContextTypes.OpenTagEnd] = undefined; diff --git a/src/tokenizer/tokenize.ts b/src/tokenizer/tokenize.ts index d7bc264..27c88e3 100644 --- a/src/tokenizer/tokenize.ts +++ b/src/tokenizer/tokenize.ts @@ -18,6 +18,7 @@ import { styleTagContent, attributeValue, attributeValueWrapped, + customTagRawContent, noop, } from "./handlers"; import { TokenizeHandler } from "../types"; @@ -42,6 +43,7 @@ const contextHandlers: Record = { [TokenizerContextTypes.DoctypeAttributeWrapped]: doctypeAttributeWrapped, [TokenizerContextTypes.DoctypeAttributeBare]: doctypeAttributeBare, [TokenizerContextTypes.CommentContent]: commentContent, + [TokenizerContextTypes.CustomTagRawContent]: customTagRawContent, [TokenizerContextTypes.CommentOpen]: noop, [TokenizerContextTypes.CommentClose]: noop, }; @@ -61,21 +63,30 @@ function tokenizeChars(state: TokenizerState) { } } +export type TokenizeOptions = { + templateInfos?: TemplateInfo[]; + rawContentTags?: string[]; +}; + export function tokenize( source = "", tokenAdapter: TokenAdapter, - templateInfos?: TemplateInfo[] + options: TokenizeOptions = { + templateInfos: undefined, + rawContentTags: undefined, + } ): { state: TokenizerState; tokens: AnyToken[] } { const tokens: AnyToken[] = []; const state: TokenizerState = { currentContext: TokenizerContextTypes.Data, contextParams: {}, - mode: templateInfos ? "template" : "default", - templateInfos: templateInfos || [], + mode: options.templateInfos ? "template" : "default", + templateInfos: options.templateInfos || [], decisionBuffer: new CharsBuffer(), accumulatedContent: new CharsBuffer(), tokenAdapter, - sourceCode: new SourceCode(source, templateInfos || []), + rawContentTags: options.rawContentTags, + sourceCode: new SourceCode(source, options.templateInfos || []), tokens: { push(token: AnyToken) { tokens.push({ diff --git a/src/tree-constructor/__tests__/construct-tree.spec.ts b/src/tree-constructor/__tests__/construct-tree.spec.ts index e9f8d0b..49b34cd 100644 --- a/src/tree-constructor/__tests__/construct-tree.spec.ts +++ b/src/tree-constructor/__tests__/construct-tree.spec.ts @@ -43,7 +43,6 @@ import TEMPLATE_COMMENT from "../../tokenizer/__tests__/__output__/templates-com import TEMPLATE_SCRIPT_CONTENT from "../../tokenizer/__tests__/__output__/templates-script-content"; import TEMPLATE_STYLE_CONTENT from "../../tokenizer/__tests__/__output__/templates-style-content"; import TEMPLATE_CONTENT_END from "../../tokenizer/__tests__/__output__/templates-content-end"; - import { clearParent } from "../../utils"; import { toMatchFile } from "jest-file-snapshot"; diff --git a/src/types/parse.ts b/src/types/parse.ts index d0925f1..8a71546 100644 --- a/src/types/parse.ts +++ b/src/types/parse.ts @@ -10,5 +10,6 @@ export type ParseResult = { export type Options = { tokenAdapter?: TokenAdapter; + rawContentTags?: string[]; templateInfos?: TemplateInfo[]; }; diff --git a/src/types/tokenizer-state.ts b/src/types/tokenizer-state.ts index 68f87ab..ded3b82 100644 --- a/src/types/tokenizer-state.ts +++ b/src/types/tokenizer-state.ts @@ -18,6 +18,9 @@ type ContextParams = { [TokenizerContextTypes.DoctypeAttributeWrapped]?: { wrapper: string; }; + [TokenizerContextTypes.CustomTagRawContent]?: { + tagName: string; + }; }; export type TokenizerState = { @@ -27,6 +30,7 @@ export type TokenizerState = { decisionBuffer: CharsBuffer; accumulatedContent: CharsBuffer; templateInfos: TemplateInfo[]; + rawContentTags?: string[]; tokenAdapter: TokenAdapter; 
diff --git a/src/types/tokenizer-state.ts b/src/types/tokenizer-state.ts
index 68f87ab..ded3b82 100644
--- a/src/types/tokenizer-state.ts
+++ b/src/types/tokenizer-state.ts
@@ -18,6 +18,9 @@ type ContextParams = {
   [TokenizerContextTypes.DoctypeAttributeWrapped]?: {
     wrapper: string;
   };
+  [TokenizerContextTypes.CustomTagRawContent]?: {
+    tagName: string;
+  };
 };
 
 export type TokenizerState = {
@@ -27,6 +30,7 @@ export type TokenizerState = {
   decisionBuffer: CharsBuffer;
   accumulatedContent: CharsBuffer;
   templateInfos: TemplateInfo[];
+  rawContentTags?: string[];
   tokenAdapter: TokenAdapter;
   sourceCode: SourceCode;
   tokens: {
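End to end, the option rides from `Options` (src/types/parse.ts) into `TokenizerState.rawContentTags`, which the `open-tag-end` handler consults to decide whether to enter the `CustomTagRawContent` context. Driving the tokenizer directly mirrors the updated spec; a sketch under the assumption that the repository-relative import paths below are resolvable from the caller's location:

```ts
import { tokenize, TokenizeOptions } from "./src/tokenizer/tokenize";
import { defaultTokenAdapter } from "./src/token-adapter";

const options: TokenizeOptions = { rawContentTags: ["markdown"] };
const { tokens } = tokenize(
  "<markdown># raw</markdown>",
  defaultTokenAdapter,
  options
);
// Expected shape, per the fixture above: OpenTagStart ("<markdown"),
// OpenTagEnd (">"), Text ("# raw"), CloseTag ("</markdown>").
```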