jakeboone02
diff --git a/‎CHANGELOG.md‎
Lines changed: 8 additions & 5 deletions b/‎CHANGELOG.md‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎README.md‎
Lines changed: 41 additions & 0 deletions b/‎README.md‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎bun.lock‎
Lines changed: 92 additions & 62 deletions b/‎bun.lock‎
Lines changed: 92 additions & 62 deletions
diff --git a/‎package.json‎
Lines changed: 8 additions & 8 deletions b/‎package.json‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/constants.ts‎
Lines changed: 1 addition & 0 deletions b/‎src/constants.ts‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/parseIngredient.ts‎
Lines changed: 37 additions & 1 deletion b/‎src/parseIngredient.ts‎
Lines changed: 37 additions & 1 deletion
@@ -9,12 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Extensive internationalization (i18n) support:
+  - [#46] `groupHeaderPatterns`: customizable words/patterns for group headers
+  - [#46] `rangeSeparators`: customizable range separator words/patterns (e.g., "bis", "oder", "à")
+  - [#46] `descriptionStripPrefixes`: customizable prefix words/patterns to strip from descriptions
+  - [#46] `trailingQuantityContext`: customizable context words indicating trailing quantities
+  - [#48] `partialUnitMatching` option for CJK and other spaceless languages — when enabled and normal parsing finds no unit, the parser scans the description for known UOM substrings (built-in units as well as those registered via `additionalUOMs`)
+  - [#48] Updated `numeric-quantity` to `^3.2.0`; since v3.1.0 it supports non-ASCII decimal numeral systems (Arabic-Indic, Devanagari, Bengali, Thai, Fullwidth, and 70+ other Unicode `\p{Nd}` digit blocks).
 - [#46] `parseIngredient` now accepts `Array<string>` as well as `string`. Each element of the array is treated as a single ingredient line.
-- [#46] Internationalization (i18n) support for parsing keywords
-  - `groupHeaderPatterns`: customizable words/patterns for group headers
-  - `rangeSeparators`: customizable range separator words/patterns (e.g., "bis", "oder", "à")
-  - `descriptionStripPrefixes`: customizable prefix words/patterns to strip from descriptions
-  - `trailingQuantityContext`: customizable context words indicating trailing quantities
 - [#46] `includeMeta` option to include source metadata (`sourceText` and `sourceIndex`) on each parsed ingredient
 - [#46] Deprecated legacy exports (`fors`, `forsRegEx`, `rangeSeparatorWords`, `rangeSeparatorRegEx`, `ofs`, `ofRegEx`, `froms`, `fromRegEx`) in favor of new configurable defaults and regex builders
 
@@ -224,6 +226,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 [#40]: https://github.com/jakeboone02/parse-ingredient/pull/40
 [#46]: https://github.com/jakeboone02/parse-ingredient/pull/46
 [#47]: https://github.com/jakeboone02/parse-ingredient/pull/47
+[#48]: https://github.com/jakeboone02/parse-ingredient/pull/48
 
 <!-- Release comparison links -->
 
 
@@ -374,6 +374,47 @@ parseIngredient(
 // ]
 ```
 
+### `partialUnitMatching`
+
+When `true`, if normal whitespace-based parsing fails to identify a unit of measure, the parser scans the description for known UOM strings — both the units built into the library and any registered via `additionalUOMs`. This is useful for CJK text (Japanese, Chinese, Korean), where quantities, units, and ingredient names are often written without spaces between them.
+
+```js
+parseIngredient('砂糖大さじ2', {
+  partialUnitMatching: true,
+  additionalUOMs: {
+    '大さじ': { short: '大さじ', plural: '大さじ', alternates: [] },
+  },
+});
+// [
+//   {
+//     quantity: 2,
+//     quantity2: null,
+//     unitOfMeasure: '大さじ',
+//     unitOfMeasureID: '大さじ',
+//     description: '砂糖',
+//     isGroupHeader: false,
+//   }
+// ]
+```
+
+The scan also works with Latin UOMs already known to the library (e.g., `g`, `ml`) and mixed-language ingredient lists:
+
+```js
+parseIngredient('砂糖大さじ2\nバター10g\n1 cup flour', {
+  partialUnitMatching: true,
+  additionalUOMs: {
+    '大さじ': { short: '大さじ', plural: '大さじ', alternates: [] },
+  },
+});
+// [
+//   { quantity: 2, unitOfMeasure: '大さじ', description: '砂糖', ... },
+//   { quantity: 10, unitOfMeasure: 'g', description: 'バター', ... },
+//   { quantity: 1, unitOfMeasure: 'cup', description: 'flour', ... },
+// ]
+```
+
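+When multiple UOM strings could match, the longest match wins (e.g., `大さじ` is preferred over `大`). A match is skipped if extracting it would leave the description empty, just as `2 cup` keeps `cup` as the description rather than treating it as the unit.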
+
 ## Unit Conversion
 
 ### `convertUnit`
 
@@ -54,20 +54,20 @@
     "codesandbox-ci": "bash .codesandbox/ci.sh"
   },
   "devDependencies": {
-    "@types/bun": "^1.3.7",
-    "@types/node": "^25.1.0",
-    "@types/web": "^0.0.323",
-    "@typescript/native-preview": "^7.0.0-dev.20260129.1",
+    "@types/bun": "^1.3.9",
+    "@types/node": "^25.2.3",
+    "@types/web": "^0.0.333",
+    "@typescript/native-preview": "^7.0.0-dev.20260213.1",
     "np": "^11.0.2",
-    "oxlint": "^1.42.0",
-    "oxlint-tsgolint": "^0.11.3",
+    "oxlint": "^1.47.0",
+    "oxlint-tsgolint": "^0.12.1",
     "prettier": "3.8.1",
     "prettier-plugin-organize-imports": "4.3.0",
-    "tsdown": "^0.20.1",
+    "tsdown": "^0.20.3",
     "typedoc": "^0.28.16",
     "typescript": "^5.9.3"
   },
   "dependencies": {
-    "numeric-quantity": "^3.0.0"
+    "numeric-quantity": "^3.2.0"
   }
 }
@@ -93,6 +93,7 @@ export const defaultOptions: Required<ParseIngredientOptions> = {
   descriptionStripPrefixes: defaultDescriptionStripPrefixes as unknown as (string | RegExp)[],
   trailingQuantityContext: defaultTrailingQuantityContext as unknown as string[],
   includeMeta: false,
+  partialUnitMatching: false,
 } as const;
 
 // --- Legacy Exports (for backward compatibility) ---
 
@@ -10,8 +10,10 @@ import {
 } from './constants';
 import { identifyUnit } from './convertUnit';
 import type { Ingredient, ParseIngredientOptions } from './types';
+import { buildUnitLookupMaps, collectUOMStrings, getDefaultUnitLookupMaps } from './unitLookup';
 
 const newLineRegExp = /\r?\n/;
+const nextWordRegExp = /^([\p{L}\p{N}_]+(?:[.-]?[\p{L}\p{N}_]+)*[-.]?)(?:\s+|$)/iu;
 
 /**
  * Parses a string or array of strings into an array of recipe ingredient objects
@@ -40,6 +42,14 @@ export const parseIngredient = (
   const trailingContextRegex = buildTrailingContextRegex(opts.trailingQuantityContext);
   const trailingQuantityRegex = buildTrailingQuantityRegex(opts.rangeSeparators);
 
+  const uomStrings = opts.partialUnitMatching
+    ? collectUOMStrings(
+        Object.keys(opts.additionalUOMs).length > 0
+          ? buildUnitLookupMaps(opts.additionalUOMs)
+          : getDefaultUnitLookupMaps()
+      )
+    : [];
+
   const ingredientArray = (
     Array.isArray(ingredientText) ? ingredientText : ingredientText.split(newLineRegExp)
   )
@@ -193,7 +203,7 @@ export const parseIngredient = (
         let finalDesc = remainingDesc;
 
         // Try multi-word unit combinations (greedy matching: prefer longer matches over shorter ones)
-        const nextWords = remainingDesc.match(/^([\p{L}\p{N}_]+(?:[.-]?[\p{L}\p{N}_]+)*[-.]?)(?:\s+|$)/iu);
+        const nextWords = remainingDesc.match(nextWordRegExp);
         if (nextWords) {
           const twoWordCombo = firstWord + ' ' + nextWords[1];
           const twoWordID = identifyUnit(twoWordCombo, options);
@@ -217,6 +227,32 @@ export const parseIngredient = (
       }
     }
 
+    // Fallback: scan description for known UOM substrings (for CJK/spaceless text)
+    if (!oIng.unitOfMeasureID && opts.partialUnitMatching && oIng.description) {
+      const descLower = oIng.description.toLowerCase();
+      for (const uomStr of uomStrings) {
+        const idx = descLower.indexOf(uomStr.toLowerCase());
+        if (idx === -1) continue;
+
+        const matchedText = oIng.description.substring(idx, idx + uomStr.length);
+        const uomID = identifyUnit(matchedText, options);
+        if (!uomID) continue;
+
+        const before = oIng.description.substring(0, idx).trim();
+        const after = oIng.description.substring(idx + uomStr.length).trim();
+        const newDesc = [before, after].filter(Boolean).join(' ');
+
+        // Don't extract UOM if it would leave description empty
+        // (consistent with "2 cup" keeping "cup" as description, not UOM)
+        if (!newDesc) continue;
+
+        oIng.unitOfMeasureID = uomID;
+        oIng.unitOfMeasure = opts.normalizeUOM ? uomID : matchedText;
+        oIng.description = newDesc;
+        break;
+      }
+    }
+
     if (!opts.allowLeadingOf && oIng.description.match(stripPrefixRegex)) {
       oIng.description = oIng.description.replace(stripPrefixRegex, '');
     }