Skip to content

Commit 72b6f0a

Browse files
authored
OpusCleaner fixes for Japanese and Korean (#1041)
* Disable num_mismatch for Japanese (same reason as Chinese) * Normalize Japanese punctuation (only for en->ja) * Disable num_mismatch for Korean
1 parent c77f019 commit 72b6f0a

File tree

4 files changed

+63
-32
lines changed

4 files changed

+63
-32
lines changed

pipeline/clean/opuscleaner/configs/en-ja/default.filters.json

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,69 @@
3030
},
3131
"language": null
3232
},
33+
{
34+
"filter": "regexp",
35+
"parameters": {
36+
"PATTERN": "s#([\\x{3040}-\\x{309F}\\x{30A0}-\\x{30FF}\\x{FF00}-\\x{FFEF}\\x{4E00}-\\x{9FAF}\\x{3000}-\\x{303F}\\x{3400}-\\x{4DBF}\\?])\\!#\\1\\x{ff01}#g"
37+
},
38+
"language": "ja"
39+
},
40+
{
41+
"filter": "regexp",
42+
"parameters": {
43+
"PATTERN": "s#([\\x{3040}-\\x{309F}\\x{30A0}-\\x{30FF}\\x{FF00}-\\x{FFEF}\\x{4E00}-\\x{9FAF}\\x{3000}-\\x{303F}\\x{3400}-\\x{4DBF}\\?])\\?#\\1\\x{ff1f}#g"
44+
},
45+
"language": "ja"
46+
},
47+
{
48+
"filter": "regexp",
49+
"parameters": {
50+
"PATTERN": "s#([\\x{3040}-\\x{309F}\\x{30A0}-\\x{30FF}\\x{FF00}-\\x{FFEF}\\x{4E00}-\\x{9FAF}\\x{3000}-\\x{303F}\\x{3400}-\\x{4DBF}\\?])\\:#\\1\\x{ff1a}#g"
51+
},
52+
"language": "ja"
53+
},
54+
{
55+
"filter": "regexp",
56+
"parameters": {
57+
"PATTERN": "s#\\.\\.\\.\\x{3002}#...#g"
58+
},
59+
"language": "ja"
60+
},
61+
{
62+
"filter": "regexp",
63+
"parameters": {
64+
"PATTERN": "s#\\x{30fb}\\x{30fb}\\x{30fb}#\\x{2026}#g"
65+
},
66+
"language": "ja"
67+
},
68+
{
69+
"filter": "regexp",
70+
"parameters": {
71+
"PATTERN": "s#\\. ?\\. ?\\. ?#\\x{2026}#g"
72+
},
73+
"language": "ja"
74+
},
75+
{
76+
"filter": "regexp",
77+
"parameters": {
78+
"PATTERN": "s#\\x{ff0c}#\\x{3001}#g"
79+
},
80+
"language": "ja"
81+
},
82+
{
83+
"filter": "regexp",
84+
"parameters": {
85+
"PATTERN": "s#([\\x{3040}-\\x{309F}\\x{30A0}-\\x{30FF}\\x{FF00}-\\x{FFEF}\\x{4E00}-\\x{9FAF}\\x{3000}-\\x{303F}\\x{3400}-\\x{4DBF}]),#\\1\\x{3001}#g"
86+
},
87+
"language": "ja"
88+
},
89+
{
90+
"filter": "regexp",
91+
"parameters": {
92+
"PATTERN": "s#([\\x{3040}-\\x{309F}\\x{30A0}-\\x{30FF}\\x{FF00}-\\x{FFEF}\\x{4E00}-\\x{9FAF}\\x{3000}-\\x{303F}\\x{3400}-\\x{4DBF}])\\.\\b#\\1\\x{3002}#g"
93+
},
94+
"language": "ja"
95+
},
3396
{
3497
"filter": "fix_wiki",
3598
"parameters": {
@@ -56,14 +119,6 @@
56119
},
57120
"language": null
58121
},
59-
{
60-
"filter": "num_mismatch",
61-
"parameters": {
62-
"RATIO": 1,
63-
"DEBUG": false
64-
},
65-
"language": null
66-
},
67122
{
68123
"filter": "fasttext_filter",
69124
"parameters": {

pipeline/clean/opuscleaner/configs/en-ko/default.filters.json

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,6 @@
5656
},
5757
"language": null
5858
},
59-
{
60-
"filter": "num_mismatch",
61-
"parameters": {
62-
"RATIO": 1,
63-
"DEBUG": false
64-
},
65-
"language": null
66-
},
6759
{
6860
"filter": "fasttext_filter",
6961
"parameters": {

pipeline/clean/opuscleaner/configs/ja-en/default.filters.json

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,6 @@
5656
},
5757
"language": null
5858
},
59-
{
60-
"filter": "num_mismatch",
61-
"parameters": {
62-
"RATIO": 1,
63-
"DEBUG": false
64-
},
65-
"language": null
66-
},
6759
{
6860
"filter": "fasttext_filter",
6961
"parameters": {

pipeline/clean/opuscleaner/configs/ko-en/default.filters.json

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,6 @@
5656
},
5757
"language": null
5858
},
59-
{
60-
"filter": "num_mismatch",
61-
"parameters": {
62-
"RATIO": 1,
63-
"DEBUG": false
64-
},
65-
"language": null
66-
},
6759
{
6860
"filter": "fasttext_filter",
6961
"parameters": {

0 commit comments

Comments
 (0)