@@ -45,7 +45,7 @@ export interface NEREntity {
4545 /** The detected text content */
4646 text : string
4747 /** Entity type classification */
48- type : 'person' | 'financial' | 'credit_card' | 'iban' | 'phone' | 'email' | 'ip' | 'url' | 'domain' | 'saudi_id' | 'ssn' | 'passport' | 'dob' | 'mac_address' | 'api_key' | 'license_plate' | 'medical_record' | 'drivers_license' | 'gps' | 'vin' | 'company_code'
48+ type : 'person' | 'financial' | 'credit_card' | 'iban' | 'phone' | 'email' | 'ip' | 'url' | 'domain' | 'saudi_id' | 'ssn' | 'passport' | 'dob' | 'mac_address' | 'api_key' | 'license_plate' | 'medical_record' | 'drivers_license' | 'gps' | 'vin' | 'company_code' | 'address'
4949 /** Start position in source text (0-indexed) */
5050 start : number
5151 /** End position in source text (exclusive) */
@@ -252,6 +252,11 @@ export async function extractEntities(text: string, userCustomNames?: string[]):
252252 detectCompanyCodes ( text , addEntity )
253253 logDebug ( 'Company codes detection complete' , { found : entities . length - before } )
254254
255+ // 23. Extract addresses (with explicit context only)
256+ before = countBefore ( )
257+ detectAddresses ( text , addEntity )
258+ logDebug ( 'Addresses detection complete' , { found : entities . length - before } )
259+
255260 // Deduplicate entities (same position)
256261 const seen = new Set < string > ( )
257262 const uniqueEntities = entities . filter ( ( e ) => {
@@ -1583,6 +1588,70 @@ function detectCompanyCodes(
15831588 }
15841589}
15851590
/**
 * Detects addresses that are introduced by an explicit context keyword
 * (conservative approach, to keep false positives low).
 *
 * Supported contexts (case-insensitive for the Latin-script labels):
 * - "Address" and its qualified forms: mailing, shipping, billing, home,
 *   work, street, physical, postal, residential
 * - "Ship to", "Deliver to", "Send to", "Mail to"
 * - "Location", "Residence", "Domicile"
 * - "P.O. Box" / "PO Box" followed by a number (no label required)
 * - Arabic labels "عنوان" / "العنوان"
 *
 * For labelled patterns the label must be followed by at least one colon or
 * whitespace character (line breaks included); the address is then the next
 * 10-100 characters on a single line. A trailing contact field
 * ("phone ...", "tel ...", "email ...", etc.) and trailing punctuation are
 * removed from the reported text.
 *
 * Candidates shorter than 10 characters, or with no digit and fewer than
 * three words, are discarded.
 *
 * @param text - Source text to scan.
 * @param addEntity - Callback invoked once per detected address; `start` and
 *   `end` are offsets into `text` such that `text.slice(start, end)` equals
 *   the reported entity text.
 * @internal
 */
function detectAddresses(
  text: string,
  addEntity: (entity: NEREntity) => void
): void {
  // Context patterns that indicate an address follows.
  // Invariant relied on below: capture group 1 is always the tail of the match.
  const addressContextPatterns: RegExp[] = [
    // Explicit address labels - capture the rest of the line (10-100 chars)
    /\b(?:address|mailing\s+address|shipping\s+address|billing\s+address|home\s+address|work\s+address|street\s+address|physical\s+address|postal\s+address|residential\s+address)[:\s]+([^\n\r]{10,100})/gi,
    // Ship to / Deliver to patterns
    /\b(?:ship\s+to|deliver\s+to|send\s+to|mail\s+to)[:\s]+([^\n\r]{10,100})/gi,
    // Location patterns
    /\b(?:location|residence|domicile)[:\s]+([^\n\r]{10,100})/gi,
    // P.O. Box patterns (very common in Saudi/GCC)
    /\b(P\.?O\.?\s*Box\s+\d+(?:[,\s]+[A-Za-z\s]+)?(?:[,\s]+\d{5})?)/gi,
    // Arabic address labels. `\b` only recognises ASCII word characters, so it
    // never matches before an Arabic letter that follows whitespace or starts
    // the text; require start-of-text or a non-Arabic character instead.
    /(?:^|[^\u0600-\u06FF])(?:عنوان|العنوان)[:\s]+([^\n\r]{10,100})/g,
  ]

  // A contact-detail label and everything after it belongs to the next field.
  // The trailing `\b` keeps words such as "Telegraph" or "Telecom" intact.
  const trailingContactField = /\s+(?:phone|telephone|tel|fax|email|contact|mobile)\b.*$/i
  // Whitespace and separator punctuation left dangling at the end
  const trailingSeparators = /[\s,;.]+$/

  for (const pattern of addressContextPatterns) {
    pattern.lastIndex = 0
    let match: RegExpExecArray | null
    while ((match = pattern.exec(text)) !== null) {
      // Captured address part (for P.O. Box the capture is the whole match)
      const captured = match[1] || match[0]

      // Offset of the first non-whitespace character inside the capture
      const leadingWhitespace = Math.max(captured.search(/\S/), 0)

      // Only the end is cleaned from here on, so the result stays a prefix of
      // the capture and its start offset is known exactly.
      // Contact fields are removed first so that punctuation exposed by their
      // removal (e.g. "..., Riyadh, Phone: ...") is stripped as well.
      const addressText = captured
        .slice(leadingWhitespace)
        .replace(trailingContactField, '')
        .replace(trailingSeparators, '')

      // Skip if too short (likely not a real address)
      if (addressText.length < 10) continue

      // Skip if it looks like just a name or single word
      if (!/\d/.test(addressText) && addressText.split(/\s+/).length < 3) continue

      // The capture is the tail of the match, so its offset is the length
      // difference; no substring search (and no silent fallback) is needed.
      const start = match.index + (match[0].length - captured.length) + leadingWhitespace

      addEntity({
        text: addressText,
        type: 'address',
        start: start,
        end: start + addressText.length,
        confidence: 90
      })
    }
  }
}
1654+
15861655/**
15871656 * Detects person names from text (returns only custom names).
15881657 *
0 commit comments