Skip to content

Commit bb549e2

Browse files
authored
Merge pull request #48 from jakeboone02/cjk
Implement `partialUnitMatching` for CJK and other spaceless languages
2 parents 70cee11 + c40ed3b commit bb549e2

File tree

9 files changed

+402
-82
lines changed

9 files changed

+402
-82
lines changed

CHANGELOG.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
- Extensive internationalization (i18n) support:
13+
- [#46] `groupHeaderPatterns`: customizable words/patterns for group headers
14+
- [#46] `rangeSeparators`: customizable range separator words/patterns (e.g., "bis", "oder", "à")
15+
- [#46] `descriptionStripPrefixes`: customizable prefix words/patterns to strip from descriptions
16+
- [#46] `trailingQuantityContext`: customizable context words indicating trailing quantities
17+
- [#48] `partialUnitMatching` option for CJK and other spaceless languages — when enabled and whitespace-based parsing finds no unit, the parser scans the description for known UOM substrings (the built-in units plus any registered via `additionalUOMs`)
18+
- [#48] Updated `numeric-quantity` to v3.2.0, which supports non-ASCII decimal numeral systems (Arabic-Indic, Devanagari, Bengali, Thai, Fullwidth, and 70+ other Unicode `\p{Nd}` digit blocks).
1219
- [#46] `parseIngredient` now accepts `Array<string>` as well as `string`. Each element of the array is treated as a single ingredient line.
13-
- [#46] Internationalization (i18n) support for parsing keywords
14-
- `groupHeaderPatterns`: customizable words/patterns for group headers
15-
- `rangeSeparators`: customizable range separator words/patterns (e.g., "bis", "oder", "à")
16-
- `descriptionStripPrefixes`: customizable prefix words/patterns to strip from descriptions
17-
- `trailingQuantityContext`: customizable context words indicating trailing quantities
1820
- [#46] `includeMeta` option to include source metadata (`sourceText` and `sourceIndex`) on each parsed ingredient
1921
- [#46] Deprecated legacy exports (`fors`, `forsRegEx`, `rangeSeparatorWords`, `rangeSeparatorRegEx`, `ofs`, `ofRegEx`, `froms`, `fromRegEx`) in favor of new configurable defaults and regex builders
2022

@@ -224,6 +226,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
224226
[#40]: https://github.com/jakeboone02/parse-ingredient/pull/40
225227
[#46]: https://github.com/jakeboone02/parse-ingredient/pull/46
226228
[#47]: https://github.com/jakeboone02/parse-ingredient/pull/47
229+
[#48]: https://github.com/jakeboone02/parse-ingredient/pull/48
227230

228231
<!-- Release comparison links -->
229232

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,47 @@ parseIngredient(
374374
// ]
375375
```
376376

377+
### `partialUnitMatching`
378+
379+
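When `true`, if normal whitespace-based parsing fails to identify a unit of measure, the parser scans the description for known UOM strings: the units built into the library as well as any registered via `additionalUOMs`. This is useful for CJK languages (Chinese, Japanese, Korean), where words are not separated by spaces. A match is only extracted when some description text remains after the unit is removed.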
380+
381+
```js
382+
parseIngredient('砂糖大さじ2', {
383+
partialUnitMatching: true,
384+
additionalUOMs: {
385+
'大さじ': { short: '大さじ', plural: '大さじ', alternates: [] },
386+
},
387+
});
388+
// [
389+
// {
390+
// quantity: 2,
391+
// quantity2: null,
392+
// unitOfMeasure: '大さじ',
393+
// unitOfMeasureID: '大さじ',
394+
// description: '砂糖',
395+
// isGroupHeader: false,
396+
// }
397+
// ]
398+
```
399+
400+
The scan also works with Latin UOMs already known to the library (e.g., `g`, `ml`) and mixed-language ingredient lists:
401+
402+
```js
403+
parseIngredient('砂糖大さじ2\nバター10g\n1 cup flour', {
404+
partialUnitMatching: true,
405+
additionalUOMs: {
406+
'大さじ': { short: '大さじ', plural: '大さじ', alternates: [] },
407+
},
408+
});
409+
// [
410+
// { quantity: 2, unitOfMeasure: '大さじ', description: '砂糖', ... },
411+
// { quantity: 10, unitOfMeasure: 'g', description: 'バター', ... },
412+
// { quantity: 1, unitOfMeasure: 'cup', description: 'flour', ... },
413+
// ]
414+
```
415+
416+
When multiple UOM strings could match, the longest match wins (e.g., `大さじ` is preferred over the shorter `さじ` when both are registered).
417+
377418
## Unit Conversion
378419

379420
### `convertUnit`

bun.lock

Lines changed: 92 additions & 62 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,20 @@
5454
"codesandbox-ci": "bash .codesandbox/ci.sh"
5555
},
5656
"devDependencies": {
57-
"@types/bun": "^1.3.7",
58-
"@types/node": "^25.1.0",
59-
"@types/web": "^0.0.323",
60-
"@typescript/native-preview": "^7.0.0-dev.20260129.1",
57+
"@types/bun": "^1.3.9",
58+
"@types/node": "^25.2.3",
59+
"@types/web": "^0.0.333",
60+
"@typescript/native-preview": "^7.0.0-dev.20260213.1",
6161
"np": "^11.0.2",
62-
"oxlint": "^1.42.0",
63-
"oxlint-tsgolint": "^0.11.3",
62+
"oxlint": "^1.47.0",
63+
"oxlint-tsgolint": "^0.12.1",
6464
"prettier": "3.8.1",
6565
"prettier-plugin-organize-imports": "4.3.0",
66-
"tsdown": "^0.20.1",
66+
"tsdown": "^0.20.3",
6767
"typedoc": "^0.28.16",
6868
"typescript": "^5.9.3"
6969
},
7070
"dependencies": {
71-
"numeric-quantity": "^3.0.0"
71+
"numeric-quantity": "^3.2.0"
7272
}
7373
}

src/constants.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ export const defaultOptions: Required<ParseIngredientOptions> = {
9393
descriptionStripPrefixes: defaultDescriptionStripPrefixes as unknown as (string | RegExp)[],
9494
trailingQuantityContext: defaultTrailingQuantityContext as unknown as string[],
9595
includeMeta: false,
96+
partialUnitMatching: false,
9697
} as const;
9798

9899
// --- Legacy Exports (for backward compatibility) ---

src/parseIngredient.ts

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ import {
1010
} from './constants';
1111
import { identifyUnit } from './convertUnit';
1212
import type { Ingredient, ParseIngredientOptions } from './types';
13+
import { buildUnitLookupMaps, collectUOMStrings, getDefaultUnitLookupMaps } from './unitLookup';
1314

1415
const newLineRegExp = /\r?\n/;
16+
const nextWordRegExp = /^([\p{L}\p{N}_]+(?:[.-]?[\p{L}\p{N}_]+)*[-.]?)(?:\s+|$)/iu;
1517

1618
/**
1719
* Parses a string or array of strings into an array of recipe ingredient objects
@@ -40,6 +42,14 @@ export const parseIngredient = (
4042
const trailingContextRegex = buildTrailingContextRegex(opts.trailingQuantityContext);
4143
const trailingQuantityRegex = buildTrailingQuantityRegex(opts.rangeSeparators);
4244

45+
const uomStrings = opts.partialUnitMatching
46+
? collectUOMStrings(
47+
Object.keys(opts.additionalUOMs).length > 0
48+
? buildUnitLookupMaps(opts.additionalUOMs)
49+
: getDefaultUnitLookupMaps()
50+
)
51+
: [];
52+
4353
const ingredientArray = (
4454
Array.isArray(ingredientText) ? ingredientText : ingredientText.split(newLineRegExp)
4555
)
@@ -193,7 +203,7 @@ export const parseIngredient = (
193203
let finalDesc = remainingDesc;
194204

195205
// Try multi-word unit combinations (greedy matching: prefer longer matches over shorter ones)
196-
const nextWords = remainingDesc.match(/^([\p{L}\p{N}_]+(?:[.-]?[\p{L}\p{N}_]+)*[-.]?)(?:\s+|$)/iu);
206+
const nextWords = remainingDesc.match(nextWordRegExp);
197207
if (nextWords) {
198208
const twoWordCombo = firstWord + ' ' + nextWords[1];
199209
const twoWordID = identifyUnit(twoWordCombo, options);
@@ -217,6 +227,32 @@ export const parseIngredient = (
217227
}
218228
}
219229

230+
// Fallback: scan description for known UOM substrings (for CJK/spaceless text)
231+
if (!oIng.unitOfMeasureID && opts.partialUnitMatching && oIng.description) {
232+
const descLower = oIng.description.toLowerCase();
233+
for (const uomStr of uomStrings) {
234+
const idx = descLower.indexOf(uomStr.toLowerCase());
235+
if (idx === -1) continue;
236+
237+
const matchedText = oIng.description.substring(idx, idx + uomStr.length);
238+
const uomID = identifyUnit(matchedText, options);
239+
if (!uomID) continue;
240+
241+
const before = oIng.description.substring(0, idx).trim();
242+
const after = oIng.description.substring(idx + uomStr.length).trim();
243+
const newDesc = [before, after].filter(Boolean).join(' ');
244+
245+
// Don't extract UOM if it would leave description empty
246+
// (consistent with "2 cup" keeping "cup" as description, not UOM)
247+
if (!newDesc) continue;
248+
249+
oIng.unitOfMeasureID = uomID;
250+
oIng.unitOfMeasure = opts.normalizeUOM ? uomID : matchedText;
251+
oIng.description = newDesc;
252+
break;
253+
}
254+
}
255+
220256
if (!opts.allowLeadingOf && oIng.description.match(stripPrefixRegex)) {
221257
oIng.description = oIng.description.replace(stripPrefixRegex, '');
222258
}

0 commit comments

Comments
 (0)