Skip to content

Commit c2f4fcd

Browse files
iYassrclaude
andcommitted
Add address detection with explicit context (Option A)
Conservative approach - only detects addresses with explicit labels: - "Address:", "Shipping Address:", "Billing Address:" - "Ship to:", "Deliver to:", "Send to:" - "Location:", "Residence:" - P.O. Box patterns - Arabic label (عنوان) Added to both standalone detector (24 types) and web app. Added 2 new tests for address detection. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent a352526 commit c2f4fcd

File tree

3 files changed

+131
-1
lines changed

3 files changed

+131
-1
lines changed

docs/app/index.html

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,44 @@ <h3>Sanitized</h3>
12701270
}
12711271
}
12721272

1273+
// 24. Addresses (with explicit context only - conservative approach).
// Only text introduced by an address label, or a literal P.O. Box, is
// reported; unlabeled free-text addresses are intentionally not detected.
const addressPatterns = [
  // Explicit address labels; capture group 1 is the rest of the line (10-100 chars)
  /\b(?:address|mailing\s+address|shipping\s+address|billing\s+address|home\s+address|work\s+address|street\s+address|physical\s+address|postal\s+address|residential\s+address)[:\s]+([^\n\r]{10,100})/gi,
  // Ship to / Deliver to patterns
  /\b(?:ship\s+to|deliver\s+to|send\s+to|mail\s+to)[:\s]+([^\n\r]{10,100})/gi,
  // Location patterns
  /\b(?:location|residence|domicile)[:\s]+([^\n\r]{10,100})/gi,
  // P.O. Box patterns; the optional city / 5-digit tail must stay on the same
  // line so that words on following lines are not swallowed
  /\b(P\.?O\.?\s*Box\s+\d+(?:[, \t]+[A-Za-z \t]+)?(?:[, \t]+\d{5})?)/gi,
  // Arabic address label. `\b` only understands ASCII word characters and
  // never matches in front of an Arabic letter that follows whitespace or the
  // start of the text, so the left boundary is spelled out explicitly.
  /(?:^|[^\w\u0600-\u06FF])(?:عنوان|العنوان)[:\s]+([^\n\r]{10,100})/g
];
// A contact field on the same line ("..., Phone: 555") ends the address.
// The trailing \b keeps words such as "Faxon" or "Telford" intact.
const addressTrailingContactField = /\s+(?:phone|telephone|tel|fax|email|contact|mobile)\b[\s\S]*$/i;
const addressTrailingPunctuation = /[\s,;.]+$/;

for (const pattern of addressPatterns) {
  pattern.lastIndex = 0;
  while ((match = pattern.exec(text)) !== null) {
    // Group 1 is the address part; for P.O. Box it is the whole match.
    const captured = match[1] || match[0];

    // Cut the contact field first, then the punctuation left in front of it.
    const addressText = captured
      .trim()
      .replace(addressTrailingContactField, '')
      .replace(addressTrailingPunctuation, '');

    // Skip if too short, or no digit and fewer than 3 words
    if (addressText.length < 10) continue;
    if (!/\d/.test(addressText) && addressText.split(/\s+/).length < 3) continue;

    // The capture group is always the tail of the match, so its offset is
    // known exactly; clean-up only removes trailing text, so the entity starts
    // at the first non-whitespace character of the capture.
    const start = match.index + (match[0].length - captured.length) + captured.search(/\S/);
    addEntity({
      text: addressText,
      type: 'address',
      start: start,
      end: start + addressText.length,
      confidence: 90
    });
  }
}
1310+
12731311
return entities.sort((a, b) => a.start - b.start);
12741312
}
12751313

electron/services/detector.ts

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ export interface NEREntity {
4545
/** The detected text content */
4646
text: string
4747
/** Entity type classification */
48-
type: 'person' | 'financial' | 'credit_card' | 'iban' | 'phone' | 'email' | 'ip' | 'url' | 'domain' | 'saudi_id' | 'ssn' | 'passport' | 'dob' | 'mac_address' | 'api_key' | 'license_plate' | 'medical_record' | 'drivers_license' | 'gps' | 'vin' | 'company_code'
48+
type: 'person' | 'financial' | 'credit_card' | 'iban' | 'phone' | 'email' | 'ip' | 'url' | 'domain' | 'saudi_id' | 'ssn' | 'passport' | 'dob' | 'mac_address' | 'api_key' | 'license_plate' | 'medical_record' | 'drivers_license' | 'gps' | 'vin' | 'company_code' | 'address'
4949
/** Start position in source text (0-indexed) */
5050
start: number
5151
/** End position in source text (exclusive) */
@@ -252,6 +252,11 @@ export async function extractEntities(text: string, userCustomNames?: string[]):
252252
detectCompanyCodes(text, addEntity)
253253
logDebug('Company codes detection complete', { found: entities.length - before })
254254

255+
// 23. Extract addresses (with explicit context only)
256+
before = countBefore()
257+
detectAddresses(text, addEntity)
258+
logDebug('Addresses detection complete', { found: entities.length - before })
259+
255260
// Deduplicate entities (same position)
256261
const seen = new Set<string>()
257262
const uniqueEntities = entities.filter((e) => {
@@ -1583,6 +1588,70 @@ function detectCompanyCodes(
15831588
}
15841589
}
15851590

1591+
/**
 * Detects addresses that are introduced by an explicit context keyword
 * (conservative approach). Unlabeled free-text addresses are intentionally not
 * detected, to minimize false positives.
 *
 * Supported contexts (label followed by ":" and/or whitespace; the English
 * labels are case-insensitive):
 * - "Address" and "Mailing / Shipping / Billing / Home / Work / Street /
 *   Physical / Postal / Residential address"
 * - "Ship to", "Deliver to", "Send to", "Mail to"
 * - "Location", "Residence", "Domicile"
 * - "P.O. Box <number>" / "PO Box <number>", optionally followed on the same
 *   line by a city name and a 5-digit code
 * - Arabic labels "عنوان" / "العنوان"
 *
 * For labelled addresses the rest of the line (10-100 characters) is taken as
 * the address. A trailing contact field ("Phone", "Tel", "Fax", "Email",
 * "Contact", "Mobile") and trailing punctuation are removed. Candidates shorter
 * than 10 characters, or with no digit and fewer than three words, are dropped.
 *
 * @param text - Source text to scan.
 * @param addEntity - Called once per detected address with type 'address',
 *   confidence 90 and start/end offsets such that
 *   `text.slice(start, end) === entity.text`.
 * @internal
 */
function detectAddresses(
  text: string,
  addEntity: (entity: NEREntity) => void
): void {
  // Context patterns that indicate an address follows. In every pattern,
  // capture group 1 is the address itself and is the tail of the full match.
  const addressContextPatterns: RegExp[] = [
    // Explicit address labels - capture until end of line
    /\b(?:address|mailing\s+address|shipping\s+address|billing\s+address|home\s+address|work\s+address|street\s+address|physical\s+address|postal\s+address|residential\s+address)[:\s]+([^\n\r]{10,100})/gi,
    // Ship to / Deliver to patterns
    /\b(?:ship\s+to|deliver\s+to|send\s+to|mail\s+to)[:\s]+([^\n\r]{10,100})/gi,
    // Location patterns
    /\b(?:location|residence|domicile)[:\s]+([^\n\r]{10,100})/gi,
    // P.O. Box patterns (very common in Saudi/GCC). The optional city / 5-digit
    // tail must stay on the same line so words on following lines are not swallowed.
    /\b(P\.?O\.?\s*Box\s+\d+(?:[, \t]+[A-Za-z \t]+)?(?:[, \t]+\d{5})?)/gi,
    // Arabic address label. `\b` only understands ASCII word characters and
    // never matches in front of an Arabic letter that follows whitespace or the
    // start of the text, so the left boundary is spelled out explicitly.
    /(?:^|[^\w\u0600-\u06FF])(?:عنوان|العنوان)[:\s]+([^\n\r]{10,100})/g,
  ]

  // A contact field on the same line ("..., Phone: 555") ends the address.
  // The trailing \b keeps words such as "Faxon" or "Telford" intact.
  const trailingContactField = /\s+(?:phone|telephone|tel|fax|email|contact|mobile)\b[\s\S]*$/i
  const trailingPunctuation = /[\s,;.]+$/

  for (const pattern of addressContextPatterns) {
    pattern.lastIndex = 0
    let match: RegExpExecArray | null
    while ((match = pattern.exec(text)) !== null) {
      // Captured address part (for P.O. Box this is the whole match)
      const captured = match[1] || match[0]

      // Cut the contact field first, then the punctuation left in front of it
      const addressText = captured
        .trim()
        .replace(trailingContactField, '')
        .replace(trailingPunctuation, '')

      // Skip if too short (likely not a real address)
      if (addressText.length < 10) continue

      // Skip if it looks like just a name or single word
      if (!/\d/.test(addressText) && addressText.split(/\s+/).length < 3) continue

      // The capture is the tail of the match, so its offset is known exactly
      // (searching for the text could hit an earlier occurrence inside the
      // label). Clean-up only removes trailing text, so the entity starts at
      // the first non-whitespace character of the capture.
      const captureOffset = match[0].length - captured.length
      const start = match.index + captureOffset + captured.search(/\S/)

      addEntity({
        text: addressText,
        type: 'address',
        start: start,
        end: start + addressText.length,
        confidence: 90
      })
    }
  }
}
1654+
15861655
/**
15871656
* Detects person names from text (returns only custom names).
15881657
*

tests/web-app.spec.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,4 +287,27 @@ test.describe('Web App - Browser Version', () => {
287287
await expect(table).toContainText('iban')
288288
await expect(table).toContainText('financial')
289289
})
290+
291+
test('detects addresses with explicit context', async ({ page }) => {
  // Sample input: a street address introduced by the explicit "Address:" label.
  const sampleText = 'Address: 123 Main Street, Boston, MA 02101'

  // Open the text input, enter the sample and start the analysis.
  await page.click('#toggleTextInput')
  await page.fill('#textInput', sampleText)
  await page.click('#analyzeTextBtn')

  // Wait (up to 10 s) until #step2 no longer carries the "hidden" class.
  await page.waitForSelector('#step2:not(.hidden)', { timeout: 10000 })

  // The detection table must show both the entity type and the street part.
  const detectionTable = page.locator('#detectionTable')
  for (const expectedText of ['address', '123 Main Street']) {
    await expect(detectionTable).toContainText(expectedText)
  }
})
302+
303+
test('detects P.O. Box addresses', async ({ page }) => {
  // Sample input containing a P.O. Box followed by a city name.
  const sampleText = 'Send mail to P.O. Box 12345, Riyadh'

  // Open the text input, enter the sample and start the analysis.
  await page.click('#toggleTextInput')
  await page.fill('#textInput', sampleText)
  await page.click('#analyzeTextBtn')

  // Wait (up to 10 s) until #step2 no longer carries the "hidden" class.
  await page.waitForSelector('#step2:not(.hidden)', { timeout: 10000 })

  // The P.O. Box text must be listed in the detection table.
  const detectionTable = page.locator('#detectionTable')
  await expect(detectionTable).toContainText('P.O. Box 12345')
})
290313
})

0 commit comments

Comments
 (0)