Skip to content

Commit 8736916

Browse files
authored
feat: regex for swiss-specific identifiers
feat: regex for swiss-specific identifiers
2 parents b4d3ada + 47192da commit 8736916

File tree

12 files changed

+133
-89
lines changed

12 files changed

+133
-89
lines changed

justfile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,11 @@ dev:
4545
fetch:
4646
just external::fetch
4747

48-
# Manage secrets.
49-
[group('modules')]
50-
mod secrets 'tools/just/secrets.just'
48+
# Test deploy on minikube instance
49+
test:
50+
bash ./tests/minikube_deploy
51+
minikube stop
52+
5153
# Manage OCI images.
5254
[group('modules')]
5355
mod image 'tools/just/image.just'
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
supported_languages:
22
- en
3+
- fr
4+
- it
5+
- de
36
default_score_threshold: 0

src/chart/conf/default-recognizers.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
supported_languages:
22
- en
3+
- de
4+
- fr
5+
- it
36
global_regex_flags: 26
47

58
recognizers:
@@ -225,3 +228,42 @@ recognizers:
225228
- en
226229
type: predefined
227230
enabled: false
231+
232+
# SWISS-SPECIFIC RECOGNIZERS
233+
234+
- name: ChPhoneRecognizer
235+
supported_language: "fr"
236+
supported_entity: "PHONE_NUMBER"
237+
patterns:
238+
- name: "swiss phone number"
239+
regex: "((\\+|00)41|0)\\s?[1-9][0-9]{1}\\s?[0-9]{3}\\s?[0-9]{2}\\s?[0-9]{2}"
240+
score: 0.01
241+
context:
242+
- téléphone
243+
- Tel
244+
- Tél
245+
- phone
246+
- numéro
247+
248+
- name: ChAVSRecognizer
249+
supported_language: "fr"
250+
supported_entity: "AVS"
251+
patterns:
252+
- name: "swiss AVS / AHV number"
253+
regex: "756\\.[0-9]{4}.[0-9]{4}.[0-9]c"
254+
score: 0.01
255+
context:
256+
- AVS
257+
258+
- name: ChZipCode
259+
supported_language: "fr"
260+
patterns:
261+
- name: "zip code (weak)"
262+
regex: " \\d{4}[^0-9]"
263+
score: 0.01
264+
context:
265+
- zip
266+
- code
267+
- postal
268+
- adresse
269+
supported_entity: "ZIP"

src/chart/conf/default.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
nlp_engine_name: spacy
2+
models:
3+
- lang_code: en
4+
model_name: en_core_web_lg
5+
- lang_code: de
6+
model_name: en_core_web_lg
7+
- lang_code: fr
8+
model_name: en_core_web_lg
9+
- lang_code: it
10+
model_name: en_core_web_lg
11+
12+
ner_model_configuration:
13+
model_to_presidio_entity_mapping:
14+
PER: PERSON
15+
PERSON: PERSON
16+
NORP: NRP
17+
FAC: LOCATION
18+
LOC: LOCATION
19+
GPE: LOCATION
20+
LOCATION: LOCATION
21+
ORG: ORGANIZATION
22+
ORGANIZATION: ORGANIZATION
23+
DATE: DATE_TIME
24+
TIME: DATE_TIME
25+
26+
low_confidence_score_multiplier: 0.4
27+
low_score_entity_names:
28+
-
29+
labels_to_ignore:
30+
- ORGANIZATION # Has many false positives
31+
- CARDINAL
32+
- EVENT
33+
- LANGUAGE
34+
- LAW
35+
- MONEY
36+
- ORDINAL
37+
- PERCENT
38+
- PRODUCT
39+
- QUANTITY
40+
- WORK_OF_ART

src/chart/templates/analyzer-deployment.yaml

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,10 @@ spec:
2424
name: {{ default (printf "%s-recognizers-config" (trim $fullname)) .Values.analyzer.recognizersConfigMapName }}
2525
- name: {{ $fullname }}-analyzer-vol
2626
configMap:
27-
name: {{ default (printf "%s-config" (trim $fullname)) .Values.analyzer.analyzerConfigMapName }}
27+
name: {{ default (printf "%s-analyzer-config" (trim $fullname)) .Values.analyzer.analyzerConfigMapName }}
28+
- name: {{ $fullname }}-vol
29+
configMap:
30+
name: {{ default (printf "%s-config" (trim $fullname)) .Values.analyzer.defaultConfigMapName }}
2831
containers:
2932
- name: {{ $fullname }}
3033
image: {{ .Values.registry }}/{{ .Values.analyzer.name }}:{{ default .Chart.AppVersion .Values.tag }}
@@ -43,10 +46,13 @@ spec:
4346
value: {{ .Values.analyzer.service.internalPort | quote }}
4447
volumeMounts:
4548
- name: "{{ $fullname }}-recognizers-vol"
46-
#TO-DO ensure this is the right path in the container
47-
mountPath: /app/presidio_analyzer/conf/default_recognizers.yaml
49+
mountPath: /usr/bin/presidio_analyzer/conf/default_recognizers.yaml
50+
subPath: default-recognizers.yaml
4851
- name: "{{ $fullname }}-analyzer-vol"
49-
#TO-DO ensure this is the right path in the container
50-
mountPath: /app/presidio_analyzer/conf/default_analyzer.yaml
52+
mountPath: /usr/bin/presidio_analyzer/conf/default_analyzer.yaml
53+
subPath: default-analyzer.yaml
54+
- name: "{{ $fullname }}-vol"
55+
mountPath: /usr/bin/presidio_analyzer/conf/default.yaml
56+
subPath: default.yaml
5157
{{ if .Values.privateRegistry }}imagePullSecrets:
5258
- name: {{.Values.privateRegistry}}{{ end }}

src/chart/templates/configmap-default-analyzer.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
apiVersion: v1
44
kind: ConfigMap
55
metadata:
6-
name: "{{ $fullname }}-config"
6+
name: "{{ $fullname }}-analyzer-config"
77
data:
88
default-analyzer.yaml: |-
9-
{{ .Files.Get "../conf/presidio-analyzer/default-analyzer.yaml" | indent 2 }}
9+
{{ .Files.Get "conf/default-analyzer.yaml" | indent 4 }}

src/chart/templates/configmap-default-recognizers.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ kind: ConfigMap
55
metadata:
66
name: "{{ $fullname }}-recognizers-config"
77
data:
8-
default-analyzer.yaml: |-
9-
{{ .Files.Get "../conf/presidio-analyzer/default-recognizers.yaml" | indent 2 }}
8+
default-recognizers.yaml: |-
9+
{{ .Files.Get "conf/default-recognizers.yaml" | indent 4 }}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{{ $fullname := include "presidio.analyzer.fullname" . }}
2+
3+
apiVersion: v1
4+
kind: ConfigMap
5+
metadata:
6+
name: "{{ $fullname }}-config"
7+
data:
8+
default.yaml: |-
9+
{{ .Files.Get "conf/default.yaml" | indent 4 }}

src/chart/values.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ analyzer:
1717
name: sdsc-ordes/presidio-analyzer
1818
replicas: 1
1919
imagePullPolicy: Always
20-
# analyzerConfigMapName: ""
21-
# recognizersConfigMapName: ""
20+
# analyzerConfigMapName: ""
21+
# defaultConfigMapName: ""
22+
# recognizersConfigMapName: ""
2223
container:
2324
resources:
2425
requests:

tests/minikube_deploy

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
minikube start
6+
7+
helm uninstall presidio || true
8+
helm install --create-namespace presidio ./src/chart
9+
kubectl port-forward svc/presidio-deid-presidio-analyzer 8080:80 &
10+
sleep 10
11+
curl http://localhost:8080/analyze -s \
12+
--header "Content-Type: application/json" \
13+
--request POST \
14+
--data '{"text": "numéro de téléphone de Jean est +41781231212 et son code postal est 1234. Son numéro AVS le 756.2222.2222.2c","language": "fr"}' \
15+
| jq

0 commit comments

Comments
 (0)